In [1]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel, AdamW, BertTokenizer, BertForSequenceClassification, BertModel, get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import (DataLoader, RandomSampler, TensorDataset, SequentialSampler)
import torch.nn as nn
import numpy as np
from tqdm import trange
from torch.nn import CrossEntropyLoss
import copy
import torch.optim as optim
import transformers

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
directory = "../data/CoLA/"

In [3]:
train = pd.read_csv(directory+"train.tsv", sep = "\t", header=None)
test = pd.read_csv(directory+"test.tsv", sep = "\t")
dev = pd.read_csv(directory+"dev.tsv", sep = "\t", header=None)

In [4]:
model_name = "BERT"

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
if model_name == "BERT":
    train_sentences = train[3].values
    #train_sentences = ["[CLS] " + sentence + " [SEP]" for sentence in train_sentences]
    train_labels = train[1].values

    test_sentences = dev[3].values
    #test_sentences = ["[CLS] " + sentence + " [SEP]" for sentence in test_sentences]
    test_labels = dev[1].values
    
elif model_name == "gpt2":
    train_sentences = train[3].values
    #train_sentences = ["_start_ " + sentence + " _classify_" for sentence in train_sentences]
    train_labels = train[1].values
    
    test_sentences = dev[3].values
    #test_sentences = ["_start_ " + sentence + " _classify_" for sentence in test_sentences]
    test_labels = dev[1].values

In [7]:
max_length = 128
if model_name == "BERT":
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
elif model_name == "gpt2":
    special_tokens_dict = {'cls_token': '_classify_','bos_token': '_start_'}
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
    tokenizer.add_special_tokens(special_tokens_dict)

In [8]:
train_input_ids = [tokenizer.encode(sent) for sent in train_sentences]
test_input_ids = [tokenizer.encode(sent) for sent in test_sentences]

In [12]:
start = tokenizer.convert_tokens_to_ids("_start_")
classify = tokenizer.convert_tokens_to_ids("_classify_")
train_input_ids = [[start] + tokenizer.encode(sent)[:(max_length-2)] + [classify] for sent in train_sentences]
test_input_ids = [[start] + tokenizer.encode(sent)[:(max_length-2)] + [classify] for sent in test_sentences]

In [9]:
MAX_LEN = 128
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
cls_position_train = np.argmax(train_input_ids==classify,axis=1)
cls_position_test = np.argmax(test_input_ids==classify,axis=1)

In [10]:
train_attention_masks = []
test_attention_masks = []

for seq in train_input_ids:
    seq_mask = [float(i>0) for i in seq]
    train_attention_masks.append(seq_mask)
    
for seq in test_input_ids:
    seq_mask = [float(i>0) for i in seq]
    test_attention_masks.append(seq_mask)

In [11]:
train_inputs = torch.tensor(train_input_ids)
train_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(train_labels)
#train_cls_positions= torch.tensor(cls_position_train)

test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(test_labels)
#test_cls_positions=torch.tensor(cls_position_test)

In [12]:
batch_size = 16
#train_data = TensorDataset(train_inputs, train_masks, train_labels, train_cls_positions)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

#test_data = TensorDataset(test_inputs, test_masks, test_labels, test_cls_positions)
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [13]:
if model_name == "BERT":
    #model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model = torch.load('../models/COLA-finetuned-BERT-v2.pt')
elif model_name == "gpt2":
    #model = GPT2DoubleHeadsModel.from_pretrained("gpt2", num_labels = 2)
    model = torch.load('../models/COLA-finetuned-GPT2.pt')
    model.resize_token_embeddings(len(tokenizer))
model = model.cuda()



In [30]:
linear = nn.Linear(128*768, 2).to(torch.device("cuda:0"))

In [31]:
param_optimizer = list(linear.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters,
                     lr=2e-5,)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*3)  # PyTorch scheduler

In [16]:
if model_name=="BERT":
  model.bert.config.output_hidden_states = True
  model.bert.config.is_decoder = False
  model.bert.encoder.output_hidden_states = True
  for i in range(0,len(model.bert.encoder.layer)): 
    model.bert.encoder.layer[i].is_decoder = False
    model.bert.encoder.layer[i].output_hidden_states = True
else:
    model.transformer.output_hidden_states = True


In [32]:
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
    
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    #b_input_ids, b_masks, b_labels, b_cls_positions = batch
    b_input_ids, b_masks, b_labels = batch
    optimizer.zero_grad()
    if(model_name=="gpt2"):
        outputs = model.transformer(b_input_ids, attention_mask=b_masks)
        logits = []
        h0 = outputs[2][0] ## Here we are using embeddings layer 
        logits.append(linears[i](hl[range(0,len(b_cls_positions)),b_cls_positions])) ## We take the classification token embedding to train the linear layer
    else:
        outputs = model.bert(b_input_ids, attention_mask=b_masks)
        h0 = outputs[2][0] ## Here we are using embeddings layer
        logits = linear(h0.view(-1,98304)) ## We flat the embeddings
    loss_fct = CrossEntropyLoss()
    loss=loss_fct(logits.view(-1, logits.size(-1)),
                          b_labels.view(-1))
    train_loss_set.append(loss.item())
    loss.backward()
    optimizer.step()
    
#if model == "gpt2":
#  torch.save(linear, "/content/drive/My Drive/linear_gpt2_layer1.pt")
#else:
#  torch.save(linear,"/content/drive/My Drive/linear_BERT.pt")

Epoch: 100%|██████████| 3/3 [01:42<00:00, 34.28s/it]


In [36]:
preds = []

linear.eval()

for step, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    #b_input_ids, b_masks, b_labels, b_cls_positions = batch
    b_input_ids, b_masks, b_labels = batch
    with torch.no_grad():
        if(model_name=="gpt2"):
            outputs = model.transformer(b_input_ids, attention_mask=b_masks)
            logits = []
            for i in range(0,13):
                hl = outputs[2][i] ## We taken the all hidden states and take the l layer
                #h0 = outputs[3][0] ## Here we are using embeddings layer 
                #logits = linear(h0.view(-1,98304)) ## We flat the embeddings
                logits.append(linears[i](hl[range(0,len(b_cls_positions)),b_cls_positions])) ## We take the classification token embedding to t
        else:
            outputs = outputs = model.bert(b_input_ids, attention_mask=b_masks)
            h0 = outputs[2][0] ## Here we are using embeddings layer 
            logits = linear(h0.view(-1,98304)) ## We flat the embeddings
   
        logits = logits.detach().cpu().numpy()
        preds.append(logits)

In [37]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

predictions = []
for pred in preds:
    p = np.argmax(pred, axis = 1)
    for label in p:
        predictions.append(label)
print(classification_report(test_labels, predictions, digits = 4))
print("Accuracy: ", accuracy_score(test_labels, predictions))

              precision    recall  f1-score   support

           0     0.3557    0.7081    0.4735       322
           1     0.7662    0.4272    0.5485       721

    accuracy                         0.5139      1043
   macro avg     0.5609    0.5676    0.5110      1043
weighted avg     0.6394    0.5139    0.5254      1043

Accuracy:  0.513902205177373
