In [2]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel, AdamW, BertTokenizer, BertForSequenceClassification, BertModel, get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import (DataLoader, RandomSampler, TensorDataset, SequentialSampler)
import torch.nn as nn
import numpy as np
from tqdm import trange
from torch.nn import CrossEntropyLoss
import copy
import torch.optim as optim
import transformers

Using TensorFlow backend.


In [3]:
directory = "../data/glue_data/MRPC/"

In [4]:
train = pd.read_csv(directory+"train.tsv", sep = "\t",  quoting = 3)
test = pd.read_csv(directory+"test.tsv", sep = "\t",   quoting = 3)
dev = pd.read_csv(directory+"dev.tsv", sep = "\t",   quoting = 3)

In [5]:
model_name = "gpt2"

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
if model_name == "BERT":
    train_sentences = train[3].values
    #train_sentences = ["[CLS] " + sentence + " [SEP]" for sentence in train_sentences]
    train_labels = train[1].values

    test_sentences = dev[3].values
    #test_sentences = ["[CLS] " + sentence + " [SEP]" for sentence in test_sentences]
    test_labels = dev[1].values
    
elif model_name == "gpt2":
    train_sentences_a = train['#1 String'].values
    train_sentences_b = train['#2 String'].values
    train_labels = train['Quality'].values
    
    test_sentences_a = dev['#1 String'].values
    test_sentences_b = dev['#2 String'].values
    test_labels = dev['Quality'].values

In [8]:
max_length = 128
if model_name == "BERT":
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
elif model_name == "gpt2":
    special_tokens_dict = {'bos_token':'_start_','sep_token':'[SEP]', 'cls_token': '[CLS]'}
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
    tokenizer.add_special_tokens(special_tokens_dict)

In [9]:
start = tokenizer.convert_tokens_to_ids("_start_")
classify = tokenizer.convert_tokens_to_ids("[CLS]")
sep = tokenizer.convert_tokens_to_ids("[SEP]")

train_sentences_a_tokens = [tokenizer.encode(sent_a) for sent_a in train_sentences_a]
train_sentences_b_tokens = [tokenizer.encode(sent_b) for sent_b in train_sentences_b]

test_sentences_a_tokens = [tokenizer.encode(sent_a) for sent_a in test_sentences_a]
test_sentences_b_tokens = [tokenizer.encode(sent_b) for sent_b in test_sentences_b]

train_input_ids = [[start] + sent_a[:max_length-3] + [sep] + sent_b[:(max_length-(3+len(sent_a[:max_length-3])))] + [classify] for sent_a,sent_b in zip(train_sentences_a_tokens,train_sentences_b_tokens)]
test_input_ids = [[start] + sent_a[:max_length-3] + [sep] + sent_b[:(max_length-(3+len(sent_a[:max_length-3])))] + [classify] for sent_a,sent_b in zip(test_sentences_a_tokens,test_sentences_b_tokens)]

In [10]:
MAX_LEN = 128
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [11]:
cls_position_train = np.argmax(train_input_ids==classify,axis=1)
cls_position_test = np.argmax(test_input_ids==classify,axis=1)

In [12]:
train_attention_masks = []
test_attention_masks = []

for seq in train_input_ids:
    seq_mask = [float(i>0) for i in seq]
    train_attention_masks.append(seq_mask)
    
for seq in test_input_ids:
    seq_mask = [float(i>0) for i in seq]
    test_attention_masks.append(seq_mask)

In [13]:
train_inputs = torch.tensor(train_input_ids)
train_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(train_labels)
train_cls_positions= torch.tensor(cls_position_train)

test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(test_labels)
test_cls_positions=torch.tensor(cls_position_test)

In [14]:
batch_size = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels, train_cls_positions)
#train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels, test_cls_positions)
#test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [15]:
if model_name == "BERT":
    #model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model = torch.load('../models/COLA-finetuned-BERT-v2.pt')
elif model_name == "gpt2":
    #model = GPT2DoubleHeadsModel.from_pretrained("gpt2", num_labels = 2)
    model = torch.load('../models/MRPC-finetuned-GPT2.pt')
    model.resize_token_embeddings(len(tokenizer))
model = model.cuda()



In [16]:
linears = [nn.Linear(768, 2).to(torch.device("cuda:0")) for  x in range(0,13)]

In [17]:
param_optimizer = [list(linears[i].named_parameters()) for i in range(0,13)]
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [[
    {'params': [p for n, p in param_optimizer[i] if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer[i] if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
] for i in range(0,13)]

optimizer = [AdamW(optimizer_grouped_parameters[i],
                     lr=2e-5) for i in range(0,13)]

scheduler = [get_linear_schedule_with_warmup(optimizer[i], num_warmup_steps=0, num_training_steps=len(train_dataloader)*3) for i in range (0,13)]

In [18]:
if model_name=="BERT":
  model.bert.config.output_hidden_states = True
  model.bert.config.is_decoder = False
  model.bert.encoder.output_hidden_states = True
  for i in range(0,len(model.bert.encoder.layer)): 
    model.bert.encoder.layer[i].is_decoder = False
    model.bert.encoder.layer[i].output_hidden_states = True
else:
    model.transformer.output_hidden_states = True

In [20]:
# Store our loss and accuracy for plotting
train_loss_sets = [[] for i in range(0,13)]

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  for linear in linears:
    linear.train()
    
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_masks, b_labels, b_cls_positions = batch
    #b_input_ids, b_masks, b_labels = batch
    for i in range(0,13):
      optimizer[i].zero_grad()
    if(model_name=="gpt2"):
        #model.transformer.output_hidden_states = True
        outputs = model.transformer(b_input_ids, attention_mask=b_masks)
        logits = []
        for i in range(0,13):
            hl = outputs[2][i] ## We taken the all hidden states and take the l layer
            #h0 = outputs[3][0] ## Here we are using embeddings layer 
            #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
            logits.append(linears[i](hl[range(0,len(b_cls_positions)),b_cls_positions])) ## We take the classification token embedding to train the linear layer
    else:
        #h0 = model.bert.embeddings(b_input_ids)
        #logits = linear(h0.view(-1,98304))
        outputs = model.bert(b_input_ids, attention_mask=b_masks)
        logits = []
        for i in range(0,13):
            hl = outputs[2][i] ## We taken the all hidden states and take the l layer
            #h0 = outputs[3][0] ## Here we are using embeddings layer 
            #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
            logits.append(linears[i](hl[:,0])) ## We take the first token [CLS] embedding to train the linear layer

    for i in range(0,13): 
      loss_fct = CrossEntropyLoss()
      loss=loss_fct(logits[i].view(-1, logits[i].size(-1)),
                              b_labels.view(-1))
      
      train_loss_sets[i].append(loss.item())
      loss.backward(retain_graph=True)
      optimizer[i].step()
      # Update learning rate schedule
      scheduler[i].step()
    
#if model == "gpt2":
#  torch.save(linear, "/content/drive/My Drive/linear_gpt2_layer1.pt")
#else:
#  torch.save(linear,"/content/drive/My Drive/linear_BERT.pt")

Epoch: 100%|██████████| 3/3 [07:56<00:00, 158.73s/it]


In [21]:
preds = [[] for i in range(0,13)]

for linear in linears:
    linear.eval()

for step, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_masks, b_labels, b_cls_positions = batch
    #b_input_ids, b_masks, b_labels = batch
    with torch.no_grad():
        if(model_name=="gpt2"):
            outputs = model.transformer(b_input_ids, attention_mask=b_masks)
            logits = []
            for i in range(0,13):
                hl = outputs[2][i] ## We taken the all hidden states and take the l layer
                #h0 = outputs[3][0] ## Here we are using embeddings layer 
                #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
                logits.append(linears[i](hl[range(0,len(b_cls_positions)),b_cls_positions])) ## We take the classification token embedding to t
        else:
            outputs = outputs = model.bert(b_input_ids, attention_mask=b_masks)
            logits = []
            for j in range(0,13):
                hl = outputs[2][j] ## We taken the all hidden states and take the l layer
                #h0 = outputs[3][0] ## Here we are using embeddings layer 
                #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
                logits.append(linears[j](hl[:,0])) ## We take the first token [CLS] embedding to train the linear layer
    for j in range(0,13):
        logits[j] = logits[j].detach().cpu().numpy()
        preds[j].append(logits[j])

In [22]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
for i in range(0,13):
    print("layer " + str(i))
    predictions = []
    for pred in preds[i]:
        p = np.argmax(pred, axis = 1)
        for label in p:
            predictions.append(label)
    print(classification_report(test_labels, predictions, digits = 4))
    print("Accuracy: ", accuracy_score(test_labels, predictions))

layer 0
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       129
           1     0.6838    1.0000    0.8122       279

    accuracy                         0.6838       408
   macro avg     0.3419    0.5000    0.4061       408
weighted avg     0.4676    0.6838    0.5554       408

Accuracy:  0.6838235294117647
layer 1
              precision    recall  f1-score   support

           0     0.4571    0.1240    0.1951       129
           1     0.6971    0.9319    0.7975       279

    accuracy                         0.6765       408
   macro avg     0.5771    0.5280    0.4963       408
weighted avg     0.6212    0.6765    0.6071       408

Accuracy:  0.6764705882352942
layer 2
              precision    recall  f1-score   support

           0     0.4375    0.0543    0.0966       129
           1     0.6888    0.9677    0.8048       279

    accuracy                         0.6789       408
   macro avg     0.5631    0.5110    0.4507 

  'precision', 'predicted', average, warn_for)
