In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
import transformers
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, GPT2Tokenizer, GPT2DoubleHeadsModel
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import (DataLoader, RandomSampler, TensorDataset, SequentialSampler)
from tqdm import trange

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
directory = "../data/SST2/"

In [3]:
train = pd.read_csv(directory+"train.tsv", sep = "\t")
#test = pd.read_csv(directory+"test.tsv", sep = "\t")
dev = pd.read_csv(directory+"dev.tsv", sep = "\t")

In [4]:
model_name = "gpt2"

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
if model_name == "BERT":
    train_sentences = train['sentence'].values
    train_sentences = ["[CLS] " + sentence + " [SEP]" for sentence in train_sentences]
    train_labels = train['label'].values

    test_sentences = dev['sentence'].values
    test_sentences = ["[CLS] " + sentence + " [SEP]" for sentence in test_sentences]
    test_labels = dev['label'].values
    
elif model_name == "gpt2":
    train_sentences = train['sentence'].values
    #train_sentences = ["_start_ " + sentence + " _classify_" for sentence in train_sentences]
    train_labels = train['label'].values
    
    test_sentences = dev['sentence'].values
    #test_sentences = ["_start_ " + sentence + " _classify_" for sentence in test_sentences]
    test_labels = dev['label'].values

In [7]:
max_length = 128
if model_name == "BERT":
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
elif model_name == "gpt2":
    special_tokens = ["_start_", "_classify_"]
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
    tokenizer.add_tokens(special_tokens)

In [None]:
train_input_ids = [tokenizer.encode(sent, add_special_tokens = False) for sent in train_sentences]
test_input_ids = [tokenizer.encode(sent, add_special_tokens = False) for sent in test_sentences]

In [9]:
start = tokenizer.convert_tokens_to_ids("_start_")
classify = tokenizer.convert_tokens_to_ids("_classify_")
train_input_ids = [[start] + tokenizer.encode(sent)[:(max_length-2)] + [classify] for sent in train_sentences]
test_input_ids = [[start] + tokenizer.encode(sent)[:(max_length-2)] + [classify] for sent in test_sentences]

In [10]:
MAX_LEN = 128
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [11]:
train_attention_masks = []
test_attention_masks = []

for seq in train_input_ids:
    seq_mask = [float(i>0) for i in seq]
    train_attention_masks.append(seq_mask)
    
for seq in test_input_ids:
    seq_mask = [float(i>0) for i in seq]
    test_attention_masks.append(seq_mask)

In [12]:
cls_position_train = np.argmax(train_input_ids==classify,axis=1)
cls_position_test = np.argmax(test_input_ids==classify,axis=1)

In [13]:
train_inputs = torch.tensor(train_input_ids)
train_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(train_labels)
train_cls_positions= torch.tensor(cls_position_train)

test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(test_labels)
test_cls_positions=torch.tensor(cls_position_test)

In [20]:
batch_size = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels, train_cls_positions)
#train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels, test_cls_positions)
#test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [15]:
if model_name == "BERT":
    #model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model = torch.load('../models/SST2-finetuned-BERT-v2.pt')
elif model_name == "gpt2":
    #model = GPT2DoubleHeadsModel.from_pretrained("gpt2", num_labels = 2)
    model = torch.load('../models/SST2-finetuned-GPT2-v2.pt')
    model.resize_token_embeddings(len(tokenizer))
model = model.cuda()

In [16]:
linears = [nn.Linear(768, 2).to(torch.device("cuda:0")) for  x in range(0,13)]

In [17]:
param_optimizer = [list(linears[i].named_parameters()) for i in range(0,13)]
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [[
    {'params': [p for n, p in param_optimizer[i] if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer[i] if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
] for i in range(0,13)]

optimizer = [AdamW(optimizer_grouped_parameters[i],
                     lr=2e-5,) for i in range(0,13)]
scheduler = [get_linear_schedule_with_warmup(optimizer[i], num_warmup_steps=0, num_training_steps=len(train_dataloader)*3) for i in range(0,13)]

In [18]:
if model_name=="BERT":
  model.bert.config.output_hidden_states = True
  model.bert.config.is_decoder = False
  model.bert.encoder.output_hidden_states = True
  for i in range(0,len(model.bert.encoder.layer)): 
    model.bert.encoder.layer[i].is_decoder = False
    model.bert.encoder.layer[i].output_hidden_states = True
else:
    model.transformer.output_hidden_states = True

In [19]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [21]:
# Store our loss and accuracy for plotting
train_loss_sets = [[] for i in range(0,13)]

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  for linear in linears:
    linear.train()
    
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_masks, b_labels, b_cls_positions = batch
    #b_input_ids, b_masks, b_labels = batch
    for i in range(0,13):
      optimizer[i].zero_grad()
    if(model_name=="gpt2"):
        #model.transformer.output_hidden_states = True
        outputs = model.transformer(b_input_ids, attention_mask=b_masks)
        logits = []
        for i in range(0,13):
            hl = outputs[2][i] ## We taken the all hidden states and take the l layer
            #h0 = outputs[3][0] ## Here we are using embeddings layer 
            #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
            logits.append(linears[i](hl[range(0,len(b_cls_positions)),b_cls_positions])) ## We take the classification token embedding to train the linear layer
    else:
        #h0 = model.bert.embeddings(b_input_ids)
        #logits = linear(h0.view(-1,98304))
        outputs = model.bert(b_input_ids, attention_mask=b_masks)
        logits = []
        for i in range(0,13):
            hl = outputs[2][i] ## We taken the all hidden states and take the l layer
            #h0 = outputs[3][0] ## Here we are using embeddings layer 
            #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
            logits.append(linears[i](hl[:,0])) ## We take the first token [CLS] embedding to train the linear layer

    for i in range(0,13): 
      loss_fct = CrossEntropyLoss()
      loss=loss_fct(logits[i].view(-1, logits[i].size(-1)),
                              b_labels.view(-1))
      
      train_loss_sets[i].append(loss.item())
      loss.backward(retain_graph=True)
      optimizer[i].step()
      # Update learning rate schedule
      scheduler[i].step()
    
#if model == "gpt2":
#  torch.save(linear, "/content/drive/My Drive/linear_gpt2_layer1.pt")
#else:
#  torch.save(linear,"/content/drive/My Drive/linear_BERT.pt")

Epoch: 100%|██████████| 3/3 [2:55:25<00:00, 3508.32s/it]


In [23]:
preds = [[] for i in range(0,13)]

for linear in linears:
    linear.eval()

for step, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_masks, b_labels, b_cls_positions = batch
    #b_input_ids, b_masks, b_labels = batch
    with torch.no_grad():
        if(model_name=="gpt2"):
            outputs = model.transformer(b_input_ids, attention_mask=b_masks)
            logits = []
            for i in range(0,13):
                hl = outputs[2][i] ## We taken the all hidden states and take the l layer
                #h0 = outputs[3][0] ## Here we are using embeddings layer 
                #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
                logits.append(linears[i](hl[range(0,len(b_cls_positions)),b_cls_positions])) ## We take the classification token embedding to t
        else:
            outputs = outputs = model.bert(b_input_ids, attention_mask=b_masks)
            logits = []
            for j in range(0,13):
                hl = outputs[2][j] ## We taken the all hidden states and take the l layer
                #h0 = outputs[3][0] ## Here we are using embeddings layer 
                #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
                logits.append(linears[j](hl[:,0])) ## We take the first token [CLS] embedding to train the linear layer
    for j in range(0,13):
        logits[j] = logits[j].detach().cpu().numpy()
        preds[j].append(logits[j])

In [24]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
for i in range(0,13):
    print("layer " + str(i))
    predictions = []
    for pred in preds[i]:
        p = np.argmax(pred, axis = 1)
        for label in p:
            predictions.append(label)
    print(classification_report(test_labels, predictions, digits = 4))
    print("Accuracy: ", accuracy_score(test_labels, predictions))

layer 0
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       428
           1     0.5092    1.0000    0.6748       444

    accuracy                         0.5092       872
   macro avg     0.2546    0.5000    0.3374       872
weighted avg     0.2593    0.5092    0.3436       872

Accuracy:  0.5091743119266054
layer 1
              precision    recall  f1-score   support

           0     0.6286    0.7710    0.6925       428
           1     0.7176    0.5608    0.6296       444

    accuracy                         0.6640       872
   macro avg     0.6731    0.6659    0.6611       872
weighted avg     0.6739    0.6640    0.6605       872

Accuracy:  0.6639908256880734
layer 2
              precision    recall  f1-score   support

           0     0.7075    0.7290    0.7181       428
           1     0.7309    0.7095    0.7200       444

    accuracy                         0.7190       872
   macro avg     0.7192    0.7192    0.7190 

  'precision', 'predicted', average, warn_for)
