In [1]:
from tqdm import trange
from transformers import GPT2DoubleHeadsModel
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, BertTokenizer
import numpy as np
from torch.nn import CrossEntropyLoss
from transformers import AdamW

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Dataset

In [3]:
import pandas as pd

directory = "../data/bot_detection"
train = pd.read_csv(directory + "train.csv", header=None)
test = pd.read_csv(directory + "test.csv", header=None)

train = pd.DataFrame({
    'id':range(len(train)),
    'label':train[0],
    'mark':['a']*train.shape[0],
    'text': train[1].replace(r'\n', ' ', regex=True)
})

test = pd.DataFrame({
    'id':range(len(test)),
    'label':test[0],
    'mark':['a']*test.shape[0],
    'text': test[1].replace(r'\n', ' ', regex=True)
})

train.columns = ["index", "label", "mark", "tweet"]
test.columns =  ["index", "label", "mark", "tweet"]

## Preprocessing

In [4]:
model_name = "gpt2"

if model_name == "BERT":
    train_sentences = train.tweet.values
    train_sentences = ["[CLS] " + sentence + " [SEP]" for sentence in train_sentences]
    train_labels = train.label.values

    test_sentences = test.tweet.values
    test_sentences = ["[CLS] " + sentence + " [SEP]" for sentence in test_sentences]
    test_labels = test.label.values

elif model_name == "gpt2":
    train_sentences = train.tweet.values
    train_labels = train.label.values
    test_sentences = test.tweet.values
    test_labels = test.label.values

In [5]:
max_length = 128
if model_name == "BERT":
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
elif model_name == "gpt2":
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
    
train_input_ids = [tokenizer.encode(sent) for sent in train_sentences]
test_input_ids = [tokenizer.encode(sent) for sent in test_sentences]

In [6]:
from keras.preprocessing.sequence import pad_sequences
MAX_LEN = 128
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

Using TensorFlow backend.


In [7]:
train_inputs = torch.tensor(train_input_ids)
train_labels = torch.tensor(train_labels)

test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(test_labels)

### Create the generators

In [8]:
from torch.utils.data import (DataLoader, RandomSampler, TensorDataset)
batch_size = 8
train_data = TensorDataset(train_inputs, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

### Model and parameters

In [9]:
if model_name == "BERT":
    model = torch.load('../models/BERT_Classifier_Large.pt')
elif model_name == "gpt2":
    model = torch.load("../models/Gpt2_Classifier_Large.pt")

In [10]:
linears = [nn.Linear(768, 2).to(torch.device("cuda:0")) for  x in range(0,12)]

In [11]:
param_optimizer = [list(linears[i].named_parameters()) for i in range(0,12)]
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [[
    {'params': [p for n, p in param_optimizer[i] if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer[i] if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
] for i in range(0,12)]

optimizer = [AdamW(optimizer_grouped_parameters[i],
                     lr=2e-5,) for i in range(0,12)]

## Training

In [12]:
if model_name=="BERT":
  model.bert.config.output_hidden_states = True
  model.bert.config.is_decoder = False
  model.bert.encoder.output_hidden_states = True
  for i in range(0,len(model.bert.encoder.layer)): 
    model.bert.encoder.layer[i].is_decoder = False
    model.bert.encoder.layer[i].output_hidden_states = True
else:
    model.transformer.output_hidden_states = True

Multiple layers training:

In [None]:
# Store our loss and accuracy for plotting
train_loss_sets = [[] for i in range(0,12)]

# Number of training epochs (authors recommend between 2 and 4)
epochs = 2

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  for linear in linears:
    linear.train()
    
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_labels = batch
    for i in range(0,12):
      optimizer[i].zero_grad()
    if(model_name=="gpt2"):
        #model.transformer.output_hidden_states = True
        outputs = model.transformer(b_input_ids)
        logits = []
        for i in range(0,12):
            hl = outputs[2][i+1] ## We taken the all hidden states and take the l layer
            #h0 = outputs[3][0] ## Here we are using embeddings layer 
            #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
            logits.append(linears[i](hl[:,-1])) ## We take the last token embedding to train the linear layer
    else:
        #h0 = model.bert.embeddings(b_input_ids)
        #logits = linear(h0.view(-1,98304))
        outputs = model.bert(b_input_ids)
        logits = []
        for i in range(0,12):
            hl = outputs[2][i+1] ## We taken the all hidden states and take the l layer
            #h0 = outputs[3][0] ## Here we are using embeddings layer 
            #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
            logits.append(linears[i](hl[:,0])) ## We take the first token [CLS] embedding to train the linear layer

    for i in range(0,12): 
      loss_fct = CrossEntropyLoss()
      loss=loss_fct(logits[i].view(-1, logits[i].size(-1)),
                              b_labels.view(-1))
      
      train_loss_sets[i].append(loss.item())
      loss.backward(retain_graph=True)
      optimizer[i].step()
    
#if model == "gpt2":
#  torch.save(linear, "/content/drive/My Drive/linear_gpt2_layer1.pt")
#else:
#  torch.save(linear,"/content/drive/My Drive/linear_BERT.pt")

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

## Evaluation.

In [15]:
preds = [[] for i in range(0,12)]

for i in range(int(len(test_inputs)/10)):
    batch = (test_inputs[i*10: (i+1)*10].to(device))
    with torch.no_grad():
        if(model_name=="gpt2"):
            outputs = model.transformer(batch)
            logits = []
            for j in range(0,12):
                hl = outputs[2][j+1] ## We taken the all hidden states and take the l layer
                #h0 = outputs[3][0] ## Here we are using embeddings layer 
                #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
                logits.append(linears[j](hl[:,-1])) ## We take the last token embedding to train the linear layer
        else:
            outputs = outputs = model.bert(batch)
            logits = []
            for j in range(0,12):
                hl = outputs[2][j+1] ## We taken the all hidden states and take the l layer
                #h0 = outputs[3][0] ## Here we are using embeddings layer 
                #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
                logits.append(linears[j](hl[:,0])) ## We take the first token [CLS] embedding to train the linear layer
    for j in range(0,12):
        logits[j] = logits[j].detach().cpu().numpy()
        preds[j].append(logits[j])
    if i%100 == 0:
        print("Processing: ", i*10/1000, "%")

Processing:  0.0 %
Processing:  1.0 %
Processing:  2.0 %
Processing:  3.0 %
Processing:  4.0 %
Processing:  5.0 %
Processing:  6.0 %
Processing:  7.0 %
Processing:  8.0 %
Processing:  9.0 %
Processing:  10.0 %
Processing:  11.0 %
Processing:  12.0 %
Processing:  13.0 %
Processing:  14.0 %
Processing:  15.0 %
Processing:  16.0 %
Processing:  17.0 %
Processing:  18.0 %
Processing:  19.0 %
Processing:  20.0 %
Processing:  21.0 %
Processing:  22.0 %
Processing:  23.0 %
Processing:  24.0 %
Processing:  25.0 %
Processing:  26.0 %
Processing:  27.0 %
Processing:  28.0 %
Processing:  29.0 %
Processing:  30.0 %
Processing:  31.0 %
Processing:  32.0 %
Processing:  33.0 %
Processing:  34.0 %
Processing:  35.0 %
Processing:  36.0 %
Processing:  37.0 %
Processing:  38.0 %
Processing:  39.0 %
Processing:  40.0 %
Processing:  41.0 %
Processing:  42.0 %
Processing:  43.0 %
Processing:  44.0 %
Processing:  45.0 %
Processing:  46.0 %
Processing:  47.0 %
Processing:  48.0 %
Processing:  49.0 %
Processing

In [16]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
for i in range(0,12):
    print("layer " + str(i+1))
    predictions = []
    for pred in preds[i]:
        p = np.argmax(pred, axis = 1)
        for label in p:
            predictions.append(label)
    print(classification_report(np.asarray(test["label"][:len(predictions)]), predictions, digits = 4))
    print("Accuracy: ", accuracy_score(np.asarray(test["label"][:len(predictions)]), predictions))

layer 1
             precision    recall  f1-score   support

          0     0.6965    0.7777    0.7349     55712
          1     0.6723    0.5737    0.6191     44288

avg / total     0.6858    0.6874    0.6836    100000

Accuracy:  0.68737
layer 2
             precision    recall  f1-score   support

          0     0.7277    0.8014    0.7627     55712
          1     0.7136    0.6227    0.6651     44288

avg / total     0.7215    0.7222    0.7195    100000

Accuracy:  0.72225
layer 3
             precision    recall  f1-score   support

          0     0.7252    0.8233    0.7711     55712
          1     0.7321    0.6075    0.6640     44288

avg / total     0.7282    0.7277    0.7237    100000

Accuracy:  0.72772
layer 4
             precision    recall  f1-score   support

          0     0.7273    0.8301    0.7753     55712
          1     0.7401    0.6084    0.6678     44288

avg / total     0.7329    0.7319    0.7277    100000

Accuracy:  0.73193
layer 5
             precision  