<a href="https://colab.research.google.com/github/bvsh55/Hy-NLI/blob/main/BERT_finetuning_NLI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning BERT for NLI
(based on the tutorial by *Chris McCormick and Nick Ryan. (2019, July 22). BERT Fine-Tuning Tutorial with PyTorch. Retrieved from http://www.mccormickml.com*)

## Install and Import

In [None]:
# TensorFlow is imported only to confirm that a GPU is attached (this notebook was run on Google Colab)
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  # Stop early: the rest of the notebook expects the first GPU to be present
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
# install pytorch libraries
!pip install pytorch-pretrained-bert pytorch-nlp



In [None]:
# import further libraries
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
# Select the device used by torch: the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Display the accelerator name. CUDA is only queried when it exists, so the CPU fallback
# chosen above does not make this cell raise.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

'Tesla T4'

## Load and preprocess training data

In [None]:
# Upload the train file from the local drive using Colab's upload helper.
# The value returned by files.upload() is kept in `uploaded`; the next cell reads the file by name.
from google.colab import files
uploaded = files.upload()

Saving SICK_trial_and_train.txt to SICK_trial_and_train (1).txt


In [None]:
# Load the tab-separated training file; row 0 of the file provides the column names
df = pd.read_csv("SICK_trial_and_train.txt", sep='\t', header=0)

In [None]:
# Labels come from the E_judgment column
labels = df["E_judgment"].values

# Sentence pairs come from the sentence_AB column; each one is wrapped as "[CLS] <pair> [SEP]"
sentences = [" ".join(("[CLS]", pair, "[SEP]")) for pair in df["sentence_AB"].values]

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer (lower-casing enabled) and tokenize every sentence pair
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = list(map(tokenizer.tokenize, sentences))

# Show the first tokenized example as a sanity check
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

Tokenize the first sentence:
['[CLS]', 'the', 'young', 'boys', 'are', 'playing', 'outdoors', 'and', 'the', 'man', 'is', 'smiling', 'nearby', '.', '[SEP]', 'there', 'is', 'no', 'boy', 'playing', 'outdoors', 'and', 'there', 'is', 'no', 'man', 'smiling', '.', '[SEP]']


In [None]:
# Every sequence is padded or truncated to this many token ids
MAX_LEN = 128

# Map each token list to its index numbers in the BERT vocabulary
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))

# Bring all sequences to length MAX_LEN; padding and truncation both happen at the end ("post")
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    padding="post",
    truncating="post",
)

In [None]:
# Build, for every padded sequence, an attention mask and a segment mask
attention_masks = []
segment_masks = []

for seq in input_ids:
    # Attention mask: 1.0 for every non-zero token id, 0.0 for id 0 (padding)
    attention_masks.append([float(token_id > 0) for token_id in seq])

    # Segment mask: 0 up to and including the first token with id 102 (the [SEP] id this
    # notebook assumes), 1 for everything after it. Id 0 (padding) always gets 1.
    seq_mask = []
    seen_first_sep = False
    for token_id in seq:
        if token_id == 0 or seen_first_sep:
            seq_mask.append(1)
        else:
            seq_mask.append(0)
            if token_id == 102:
                seen_first_sep = True
    segment_masks.append(seq_mask)

In [None]:
# Split inputs, labels, attention masks and segment masks into train (90%) and validation (10%) sets.
# A single call splits all four arrays with the same shuffled indices, so rows stay aligned.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks,
 segment_train_masks, segment_validation_masks) = train_test_split(
    input_ids, labels, attention_masks, segment_masks,
    random_state=2018, test_size=0.1, shuffle=True)

In [None]:
# Convert every train / validation array into a torch tensor

# Token ids
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)

# Attention masks and segment masks
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)
segment_train_masks, segment_validation_masks = (
    torch.tensor(segment_train_masks),
    torch.tensor(segment_validation_masks),
)

# Labels are stored explicitly as torch.long
train_labels = torch.tensor(train_labels, dtype=torch.long)
validation_labels = torch.tensor(validation_labels, dtype=torch.long)

In [None]:
# Number of examples per batch, for both training and validation
batch_size = 32

# Bundle the aligned tensors; each dataset row is (input ids, attention mask, segment mask, label)
train_data = TensorDataset(train_inputs, train_masks, segment_train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, segment_validation_masks, validation_labels)

# Training batches are sampled randomly, validation batches are read in order
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

# DataLoaders yield one batch at a time instead of holding the whole set in a Python loop
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)


## Train (fine-tune) the model

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification
# layer on top; num_labels=3 sizes that layer for three output classes.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
# Move the model to the same `device` the batches are sent to (instead of hard-coding CUDA),
# so the CPU fallback selected earlier keeps working.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [None]:
# Define the parameter groups for the optimizer
param_optimizer = list(model.named_parameters())

# Parameters whose name contains one of these substrings get no weight decay: biases and the
# LayerNorm weights. pytorch-pretrained-bert 0.6.x names the LayerNorm parameters
# 'LayerNorm.weight' / 'LayerNorm.bias'; 'gamma' / 'beta' are kept for older naming.
no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias', 'gamma', 'beta']

# BertAdam reads the per-group key 'weight_decay'. A key called 'weight_decay_rate' is ignored,
# which would apply the default decay to every parameter, including the no-decay group.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# The warmup schedule needs the total number of optimizer steps. Without t_total, BertAdam warns
# "t_total value of -1 results in schedule not being applied" and ignores `warmup`.
# `epochs` must equal the value used by the training loop below (which sets it again).
epochs = 3
num_train_steps = len(train_dataloader) * epochs

# This variable contains all of the hyperparameter information our training loop needs
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1,
                     t_total=num_train_steps)

t_total value of -1 results in schedule not being applied


In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose arg-max column equals the matching label.

    preds:  2-D array-like of per-class scores, one row per example.
    labels: numpy array of integer class ids; it is flattened before comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    gold_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == gold_classes)
    return n_correct / len(gold_classes)

In [None]:
# Do the actual training

# Per-batch training losses, kept for plotting
train_loss_set = []

# Number of training epochs
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

    # Training

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables, reset at the start of every epoch
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for batch in train_dataloader:
        # Add batch to the selected device
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_seg_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass; the model's return value is used as the loss when labels are passed
        loss = model(b_input_ids, token_type_ids=b_seg_mask, attention_mask=b_input_mask, labels=b_labels)
        # Read the scalar once and reuse it for both the history and the running total
        batch_loss = loss.item()
        train_loss_set.append(batch_loss)
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()

        # Update tracking variables
        tr_loss += batch_loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

# Average per-batch loss of the last epoch (the trackers are reset at the start of each epoch)
print("Train loss: {}".format(tr_loss/nb_tr_steps))


# Validation (runs once, after all training epochs)

# Put model in evaluation mode
model.eval()

# Tracking variables
eval_correct = 0
nb_eval_steps, nb_eval_examples = 0, 0

# Evaluate on the whole validation set
for batch in validation_dataloader:
    # Add batch to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_seg_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        logits = model(b_input_ids, token_type_ids=b_seg_mask, attention_mask=b_input_mask)

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Count correct predictions and examples. Averaging per-batch accuracies would give the
    # (usually smaller) last batch the same weight as a full batch and bias the result.
    eval_correct += int(np.sum(np.argmax(logits, axis=1).flatten() == label_ids.flatten()))
    nb_eval_examples += len(label_ids)
    nb_eval_steps += 1

# Accuracy over all validation examples
eval_accuracy = eval_correct / nb_eval_examples
print("Validation Accuracy: {}".format(eval_accuracy))

Epoch: 100%|██████████| 3/3 [06:03<00:00, 121.22s/it]


Train loss: 0.21413491132622914
Validation Accuracy: 0.834375


## Predict and evaluate on test set

In [None]:
# Upload the test file from the local drive using Colab's upload helper.
# The value returned by files.upload() is kept in `uploaded`; the next cell reads the file by name.
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the tab-separated test set; row 0 of the file provides the column names
df_test = pd.read_csv("SICK_test.txt", sep='\t', header=0)

# Gold labels come from the E_judgment column
labels = df_test["E_judgment"].values

# Sentence pairs come from the sentence_AB column; each one is wrapped as "[CLS] <pair> [SEP]"
sentences = [" ".join(("[CLS]", pair, "[SEP]")) for pair in df_test["sentence_AB"].values]

# Tokenize the text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = list(map(tokenizer.tokenize, sentences))

# Every sequence is padded or truncated to this many token ids
MAX_LEN = 128

# Map each token list to its index numbers in the BERT vocabulary
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))

# Bring all sequences to length MAX_LEN; padding and truncation both happen at the end ("post")
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    padding="post",
    truncating="post",
)

# Build, for every padded sequence, an attention mask and a segment mask
attention_masks = []
segment_masks = []

for seq in input_ids:
    # Attention mask: 1.0 for every non-zero token id, 0.0 for id 0 (padding)
    attention_masks.append([float(token_id > 0) for token_id in seq])

    # Segment mask: 0 up to and including the first token with id 102 (the [SEP] id this
    # notebook assumes), 1 for everything after it. Id 0 (padding) always gets 1.
    seq_mask = []
    seen_first_sep = False
    for token_id in seq:
        if token_id == 0 or seen_first_sep:
            seq_mask.append(1)
        else:
            seq_mask.append(0)
            if token_id == 102:
                seen_first_sep = True
    segment_masks.append(seq_mask)

# Convert all data to tensors; labels are stored explicitly as torch.long
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_segment_masks = torch.tensor(segment_masks)
prediction_labels = torch.tensor(labels, dtype=torch.long)

# Number of examples per prediction batch
batch_size = 32

# Each dataset row is (input ids, attention mask, segment mask, label); batches are read in order
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_segment_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, batch_size=batch_size, sampler=prediction_sampler)

In [None]:
# Prediction on test set

# Put model in evaluation mode
model.eval()

# Per-batch logits and gold labels, collected for later inspection
predictions , true_labels = [], []

# Tracking variables
nb_test_steps = 0
nb_test_examples = 0
test_correct = 0

# Predict
for batch in prediction_dataloader:
    # Add batch to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_seg_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        logits = model(b_input_ids, token_type_ids=b_seg_mask, attention_mask=b_input_mask)

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

    # Count correct predictions and examples. Averaging per-batch accuracies would give the
    # (usually smaller) last batch the same weight as a full batch and bias the result.
    test_correct += int(np.sum(np.argmax(logits, axis=1).flatten() == label_ids.flatten()))
    nb_test_examples += len(label_ids)
    nb_test_steps += 1

# Accuracy over all test examples
test_accuracy = test_correct / nb_test_examples
print("Test Accuracy: {}".format(test_accuracy))

In [None]:
import csv

# `predictions` holds one logits array per batch, so the batches are joined into a single
# (examples x classes) array before the predicted class of each example is taken.
example_logits = np.concatenate(predictions, axis=0)
predicted_classes = np.argmax(example_logits, axis=1)

# Write one row per test example: its 0-based position in the test set and its predicted class index.
# newline="" is what the csv module expects for files it writes; the `with` block closes the file.
with open("SNLI_trial_and_train_BERT_results.csv", mode="w", newline="") as f:
    writer = csv.writer(f)
    for example_id, predicted_class in enumerate(predicted_classes):
        writer.writerow([example_id, int(predicted_class)])

In [None]:
# Join the per-batch logits into one array and take the arg-max class of every data point
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()

# Join the per-batch gold labels into one flat list
flat_true_labels = [label for batch_labels in true_labels for label in batch_labels]

# Print the concrete prediction of each data point, one per line
for pred in flat_predictions:
    print (pred)

## Load fine-tuned model and evaluate on a given test set

In [None]:
# Mount Google Drive at /content/gdrive; the saved model file is read from that mount point in the next cell
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Build the path of the saved fine-tuned weights on the mounted drive
saved_model = 'sick_trial_train_corrected_bert_fine-tuned_model_NEW.pt'
path = F"/content/gdrive/My Drive/{saved_model}"

# Create a model with the same architecture (three output classes), then load the saved state dict into it
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
# map_location remaps the stored tensors onto the current `device`, so a checkpoint saved from a
# GPU session also loads when this session has no CUDA device
model.load_state_dict(torch.load(path, map_location=device))

In [None]:
# The model is loaded; move it to the same `device` the batches are sent to (instead of
# hard-coding CUDA) so it can be used for evaluation.
model.to(device)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose arg-max column equals the matching label.

    preds:  2-D array-like of per-class scores, one row per example.
    labels: numpy array of integer class ids; it is flattened before comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    gold_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == gold_classes)
    return n_correct / len(gold_classes)

In [None]:
# To evaluate on a specific dataset, redo all steps taken within the "Predict and evaluate on test set" section above.