In this notebook we check whether the intuition from the paper holds: after fine-tuning, the cosine similarity between the token representations of a sentence is close to one for BERT, while for GPT-2 it is noticeably lower, which would help explain why GPT-2 performs better than BERT on the bot classification task.

In [35]:
from transformers import GPT2Tokenizer, GPT2Model, BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load dataset

In [2]:
import pandas as pd

# Folder that holds the bot-detection CSV files.
directory = '../data/bot_detection/'

# test.csv is read without a header row, so its columns are addressed as 0 and 1.
test = pd.read_csv(directory + "test.csv", header=None)

# Rebuild the frame with its final column names directly:
#   index - running row number
#   label - column 0 of the CSV
#   mark  - constant placeholder 'a'
#   tweet - column 1 of the CSV with every newline replaced by a space
test = pd.DataFrame({
    "index": range(test.shape[0]),
    "label": test[0],
    "mark": ["a"] * len(test),
    "tweet": test[1].replace(r'\n', ' ', regex=True),
})

# Preprocessing

In [29]:
# Architecture analysed in this run: "BERT" or "gpt2".
model_name = "BERT"

if model_name in ("BERT", "gpt2"):
    test_sentences = test.tweet.values
    if model_name == "BERT":
        # BERT text is wrapped in explicit [CLS] ... [SEP] markers; GPT-2 text is used unchanged.
        # NOTE(review): depending on the transformers version, tokenizer.encode may add these
        # markers again - confirm this matches the preprocessing used for fine-tuning.
        test_sentences = ["[CLS] " + tweet + " [SEP]" for tweet in test_sentences]
    test_labels = test.label.values

In [30]:
max_length = 128

# Load the pretrained vocabulary that matches the selected architecture.
if model_name == "gpt2":
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
elif model_name == "BERT":
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Encode every sentence into its list of vocabulary ids (lengths still vary here).
test_input_ids = list(map(tokenizer.encode, test_sentences))

In [31]:
from keras.preprocessing.sequence import pad_sequences

# Fixed sequence length of every row after padding/truncation.
MAX_LEN = 128

# Pad short sequences at the end and cut long ones at the end, so each row holds MAX_LEN ids.
# The padding value is left at the pad_sequences default.
test_input_ids = pad_sequences(
    test_input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    padding="post",
    truncating="post",
)

In [32]:
# Turn the padded id matrix into a tensor, then move the whole tensor to the GPU.
test_inputs = torch.tensor(test_input_ids)
test_inputs = test_inputs.cuda()

### Create the generators

In [33]:
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

# Number of tweets sent through the model per forward pass.
batch_size = 200

# Wrap the id tensor in a dataset; the loop that consumes it reads the ids as batch[0].
test_data = TensorDataset(test_inputs)

# Batches are drawn in random order.
# NOTE(review): no seed is set in the visible cells, so the sample order differs between runs.
test_sampler = RandomSampler(test_data)

test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=batch_size,
    sampler=test_sampler,
)

### Model and parameters

In [47]:
# Load the fine-tuned classifier, keep only its transformer backbone, and configure it
# to return the hidden states of every layer.
#
# NOTE(security): torch.load unpickles arbitrary Python objects - only load checkpoints
# that come from a trusted source.
if model_name == "BERT":
    # To analyse the pretrained (not fine-tuned) weights instead, use:
    #   model = BertModel.from_pretrained('bert-base-uncased')
    model = torch.load("../models/BERT_Classifier_Large.pt")
    model = model.cuda()
    model = model.bert  # drop the classification head, keep the encoder
    model.config.output_hidden_states = True
    model.config.is_decoder = False
    model.encoder.output_hidden_states = True
    # The flags are also set on each encoder layer.
    for layer in model.encoder.layer:
        layer.is_decoder = False
        layer.output_hidden_states = True
elif model_name == "gpt2":
    # To analyse the pretrained (not fine-tuned) weights instead, use:
    #   model = GPT2Model.from_pretrained("gpt2")
    model = torch.load("../models/Gpt2_Classifier_Large.pt")
    model = model.cuda()
    model = model.transformer  # drop the classification head, keep the transformer
    # Set the flag on both the module and its config.
    # NOTE(review): which of the two is read presumably depends on the installed
    # transformers version - confirm.
    model.output_hidden_states = True
    model.config.output_hidden_states = True
else:
    # Fail here with a clear message instead of a NameError on `model` further down.
    raise ValueError("Unsupported model_name: %r (expected 'BERT' or 'gpt2')" % (model_name,))

# Switch to inference mode so dropout does not add noise to the hidden states being
# analysed; a pickled module keeps whatever train/eval mode it was saved in.
model.eval()

### Contextual similarity

In [48]:
# For every hidden-state layer, collect one value per sentence: the mean of the pairwise
# cosine-similarity matrix between all token positions of that sentence.
#
# mean_similarities[layer] is the list of those per-sentence values. It is sized from the
# model output on the first batch, so it works for any number of layers.
#
# NOTE(review): sequences are padded to a fixed length and the model is called without an
# attention mask, so padded positions are part of the similarity matrix. The matrix
# diagonal (self-similarity, always 1) is also included in the mean.
mean_similarities = []
with torch.no_grad():
    for step, batch in enumerate(test_dataloader):
        # Use a separate name so the full `test_inputs` tensor is not overwritten.
        batch_inputs = batch[0]
        outputs = model(batch_inputs)
        # Index 2 of the output tuple holds the per-layer hidden states
        # (index 0 is the last hidden state only).
        hidden_states = outputs[2]
        if not mean_similarities:
            mean_similarities = [[] for _ in hidden_states]
        for layer_idx, hidden_state in enumerate(hidden_states):
            # One device-to-host copy per layer instead of one per sentence.
            layer_np = hidden_state.cpu().numpy()
            # Iterate over the sentences actually present: the last batch can hold
            # fewer than batch_size rows.
            for sentence_np in layer_np:
                contextual_similarity = cosine_similarity(sentence_np, sentence_np)
                mean_similarities[layer_idx].append(np.mean(contextual_similarity))
        print(step,"/",len(test_dataloader))

0 / 500
1 / 500
2 / 500
3 / 500
4 / 500
5 / 500
6 / 500
7 / 500
8 / 500
9 / 500
10 / 500
11 / 500
12 / 500
13 / 500
14 / 500
15 / 500
16 / 500
17 / 500
18 / 500
19 / 500
20 / 500
21 / 500
22 / 500
23 / 500
24 / 500
25 / 500
26 / 500
27 / 500
28 / 500
29 / 500
30 / 500
31 / 500
32 / 500
33 / 500
34 / 500
35 / 500
36 / 500
37 / 500
38 / 500
39 / 500
40 / 500
41 / 500
42 / 500
43 / 500
44 / 500
45 / 500
46 / 500
47 / 500
48 / 500
49 / 500
50 / 500
51 / 500
52 / 500
53 / 500
54 / 500
55 / 500
56 / 500
57 / 500
58 / 500
59 / 500
60 / 500
61 / 500
62 / 500
63 / 500
64 / 500
65 / 500
66 / 500
67 / 500
68 / 500
69 / 500
70 / 500
71 / 500
72 / 500
73 / 500
74 / 500
75 / 500
76 / 500
77 / 500
78 / 500
79 / 500
80 / 500
81 / 500
82 / 500
83 / 500
84 / 500
85 / 500
86 / 500
87 / 500
88 / 500
89 / 500
90 / 500
91 / 500
92 / 500
93 / 500
94 / 500
95 / 500
96 / 500
97 / 500
98 / 500
99 / 500
100 / 500
101 / 500
102 / 500
103 / 500
104 / 500
105 / 500
106 / 500
107 / 500
108 / 500
109 / 500
110 / 500


In [49]:
# Stack the per-layer lists into one array: one row per hidden-state layer,
# one column per processed sentence.
mean_similarities_np = np.array(mean_similarities)

In [50]:
# Display the per-layer, per-sentence mean similarities.
mean_similarities_np

array([[0.29659212, 0.4286276 , 0.55409384, ..., 0.2627282 , 0.23423457,
        0.31067103],
       [0.54460746, 0.6707187 , 0.7756945 , ..., 0.49183565, 0.45353505,
        0.5507088 ],
       [0.54024374, 0.6595992 , 0.75318027, ..., 0.49116823, 0.45554706,
        0.53942394],
       ...,
       [0.56883013, 0.6455235 , 0.74219525, ..., 0.5620757 , 0.46991804,
        0.5875374 ],
       [0.66438204, 0.7242289 , 0.8074277 , ..., 0.6553541 , 0.5167835 ,
        0.6864211 ],
       [0.9933552 , 0.9927799 , 0.9973526 , ..., 0.98054695, 0.76677585,
        0.8873472 ]], dtype=float32)

In [51]:
# Average over sentences (axis 1) to get a single mean similarity per layer.
np.mean(mean_similarities_np,axis=1)

array([0.38357255, 0.6140497 , 0.6038382 , 0.5596043 , 0.5631125 ,
       0.5586654 , 0.5246606 , 0.5254687 , 0.5379017 , 0.56307393,
       0.6253561 , 0.71307486, 0.96989065], dtype=float32)