In [1]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel, AdamW, BertTokenizer, BertForSequenceClassification, BertModel, get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import (DataLoader, RandomSampler, TensorDataset, SequentialSampler)
import torch.nn as nn
import numpy as np
from tqdm import trange
from torch.nn import CrossEntropyLoss
import copy
import torch.optim as optim
import transformers

Using TensorFlow backend.


In [2]:
directory = "../data/glue_data/MRPC/"

In [3]:
train = pd.read_csv(directory+"train.tsv", sep = "\t",  quoting = 3)
test = pd.read_csv(directory+"test.tsv", sep = "\t",   quoting = 3)
dev = pd.read_csv(directory+"dev.tsv", sep = "\t",   quoting = 3)

In [4]:
model_name = "BERT"

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
if model_name == "BERT":
    train_sentences_a = train['#1 String'].values
    train_sentences_b = train['#2 String'].values
    train_labels = train['Quality'].values
    
    test_sentences_a = dev['#1 String'].values
    test_sentences_b = dev['#2 String'].values
    test_labels = dev['Quality'].values
    
elif model_name == "gpt2":
    train_sentences_a = train['#1 String'].values
    train_sentences_b = train['#2 String'].values
    train_labels = train['Quality'].values
    
    test_sentences_a = dev['#1 String'].values
    test_sentences_b = dev['#2 String'].values
    test_labels = dev['Quality'].values

In [7]:
max_length = 128
if model_name == "BERT":
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
elif model_name == "gpt2":
    special_tokens_dict = {'bos_token':'_start_','sep_token':'[SEP]', 'cls_token': '[CLS]'}
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
    tokenizer.add_special_tokens(special_tokens_dict)

In [18]:
train_input_ids = [tokenizer.encode(sent_a, text_pair=sent_b, add_special_tokens = True) for sent_a, sent_b in zip(train_sentences_a,train_sentences_b)]
test_input_ids = [tokenizer.encode(sent_a, text_pair=sent_b, add_special_tokens = True) for sent_a, sent_b in zip(test_sentences_a,test_sentences_b)]

In [19]:
MAX_LEN = 128
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [20]:
train_attention_masks = []
test_attention_masks = []

for seq in train_input_ids:
    seq_mask = [float(i>0) for i in seq]
    train_attention_masks.append(seq_mask)
    
for seq in test_input_ids:
    seq_mask = [float(i>0) for i in seq]
    test_attention_masks.append(seq_mask)

In [21]:
train_inputs = torch.tensor(train_input_ids)
train_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(train_labels)

test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(test_labels)

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


In [22]:
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [23]:
if model_name == "BERT":
    model = BertForSequenceClassification.from_pretrained("/tmp/BERT/MRPC", num_labels=2)
    #model = torch.load('./COLA-finetuned-BERT-v2.pt')
elif model_name == "gpt2":
    #model = GPT2DoubleHeadsModel.from_pretrained("gpt2", num_labels = 2)
    model = torch.load('./MRPC-finetuned-GPT2.pt')
    model.resize_token_embeddings(len(tokenizer))
model = model.cuda()

In [25]:
linear = nn.Linear(128*768, 2).to(torch.device("cuda:0"))

In [26]:
param_optimizer = list(linear.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters,
                     lr=2e-5,)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*3)  # PyTorch scheduler

In [27]:
if model_name=="BERT":
  model.bert.config.output_hidden_states = True
  model.bert.config.is_decoder = False
  model.bert.encoder.output_hidden_states = True
  for i in range(0,len(model.bert.encoder.layer)): 
    model.bert.encoder.layer[i].is_decoder = False
    model.bert.encoder.layer[i].output_hidden_states = True
else:
    model.transformer.output_hidden_states = True

In [29]:
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
    
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    #b_input_ids, b_masks, b_labels, b_cls_positions = batch
    b_input_ids, b_masks, b_labels = batch
    optimizer.zero_grad()
    if(model_name=="gpt2"):
        outputs = model.transformer(b_input_ids, attention_mask=b_masks)
        h0 = outputs[2][0] ## Here we are using embeddings layer 
        logits = linear(h0.view(-1,98304)) ## We flat the embeddings
    else:
        outputs = model.bert(b_input_ids, attention_mask=b_masks)
        h0 = outputs[2][0] ## Here we are using embeddings layer
        logits = linear(h0.view(-1,98304)) ## We flat the embeddings
    loss_fct = CrossEntropyLoss()
    loss=loss_fct(logits.view(-1, logits.size(-1)),
                          b_labels.view(-1))
    train_loss_set.append(loss.item())
    loss.backward()
    optimizer.step()

Epoch: 100%|██████████| 3/3 [00:36<00:00, 12.12s/it]


In [31]:
preds = []

linear.eval()

for step, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    #b_input_ids, b_masks, b_labels, b_cls_positions = batch
    b_input_ids, b_masks, b_labels = batch
    with torch.no_grad():
        if(model_name=="gpt2"):
            outputs = model.transformer(b_input_ids, attention_mask=b_masks)
            h0 = outputs[2][0] ## Here we are using embeddings layer 
            logits = linear(h0.view(-1,98304)) ## We flat the embeddings
        else:
            outputs = outputs = model.bert(b_input_ids, attention_mask=b_masks)
            h0 = outputs[2][0] ## Here we are using embeddings layer 
            logits = linear(h0.view(-1,98304)) ## We flat the embeddings
   
        logits = logits.detach().cpu().numpy()
        preds.append(logits)

In [32]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

predictions = []
for pred in preds:
    p = np.argmax(pred, axis = 1)
    for label in p:
        predictions.append(label)
print(classification_report(test_labels, predictions, digits = 4))
print("Accuracy: ", accuracy_score(test_labels, predictions))

              precision    recall  f1-score   support

           0     0.4444    0.0930    0.1538       129
           1     0.6929    0.9462    0.8000       279

    accuracy                         0.6765       408
   macro avg     0.5687    0.5196    0.4769       408
weighted avg     0.6144    0.6765    0.5957       408

Accuracy:  0.6764705882352942
