### Finetune BERT


##### I. Import the modules

In [38]:
import io
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm import tqdm, trange  #for progress bars

#from IPython.display import Image #for image rendering

##### II. Specify CUDA as the device for torch

In [4]:
#Pick the compute device once: a CUDA GPU when PyTorch can see one, otherwise the CPU.
#Later cells move the model and every batch to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Display information about the NVIDIA GPU drivers and hardware
#(IPython shell escape; it reports "command not found" on machines without the NVIDIA tools)
!nvidia-smi

/bin/bash: nvidia-smi: command not found


##### III. Loading the dataset

In [6]:
#Download the in-domain training set into the working directory as "in_domain_train.tsv" (-L follows redirects)
!curl -L https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-and-Computer-Vision-3rd-Edition/master/Chapter05/in_domain_train.tsv --output "in_domain_train.tsv"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  428k  100  428k    0     0   699k      0 --:--:-- --:--:-- --:--:--  701k


In [41]:
#Read the tab-separated training file; it has no header row, so the column names are supplied here
df = pd.read_csv(
    "in_domain_train.tsv",
    sep='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)

In [42]:
#Show the DataFrame dimensions as (number of rows, number of columns)
df.shape

(8551, 4)

In [9]:
#Display 10 randomly selected rows (no random_state is passed, so the selection differs between runs)
df.sample(10)

Unnamed: 0,sentence_source,label,label_notes,sentence
108,cj99,1,,i 'm not shocked by the idea that the more you...
2393,l-93,1,,i dried the clothes in the sun .
4973,ks08,0,*,which rebel leader did you hear cheney 's rumo...
6157,c_13,1,,bill wants john to leave .
3971,ks08,1,,we saw him beaten by the champion .
6763,m_02,0,*,"because into the room came aunt norris , fanny..."
7247,sks13,1,,john knows that she left and john knows whethe...
2716,l-93,1,,nora pushed her way through the crowd .
3957,ks08,1,,tom locked fido in the garage .
4759,ks08,1,,"edward 's help , you can rely on ."


##### IV. Creating sentences, label lists, and adding BERT tokens

In [None]:
#A DataFrame column can be read with dot notation (df.column_name) or bracket notation (df['column_name']).
#Wrap every sentence in the special tokens BERT expects: [CLS] at the start and [SEP] at the end
sentences = ["[CLS] " + text + " [SEP]" for text in df['sentence'].values]

#Labels as a NumPy array, in the same row order as the sentences
labels = df['label'].values

##### V. Creating an instance of the BertTokenizer class

In [18]:
#Load the tokenizer that belongs to the uncased BERT checkpoint.
#Every later cell needs `tokenizer`, so a failure is reported and then re-raised: the notebook
#stops here with the real cause instead of failing further down with a NameError.
try:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
except Exception:
    print("An error occurred while downloading the tokenizer.")
    raise #The re-raised exception is displayed with its full traceback
else:
    print("Tokenizer downloaded successfully.")

Tokenizer downloaded successfully.


In [19]:
#Split every sentence into BERT tokens with the tokenizer's tokenize() method
tokenized_texts = list(map(tokenizer.tokenize, sentences))

print ("Tokenize the first sentence:")
print (tokenized_texts[0])

Tokenize the first sentence:
['[CLS]', 'our', 'friends', 'wo', 'n', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', '[SEP]']


##### VI. Set max_len, convert tokens to IDs (index numbers in BERT vocabulary), and pad sentences

In [None]:
#Fixed length that every sequence is padded or truncated to.
#The longest sequence in our training set is 47, so 128 leaves room at the end.
#(In the original paper, the authors used a length of 512.)
MAX_LEN = 128

#Map each token to its index number in the BERT vocabulary, then pad at the end with 0
#(the ID of BERT's padding token) or cut at the end so that every row has exactly MAX_LEN entries
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)

##### VII. Create attention masks

In [27]:
#One mask per sequence: 1.0 for every real token, 0.0 for padding.
#Padding has ID 0, so float(token_id > 0) is 1.0 for real tokens (float(True) == 1.0) and 0.0 otherwise
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

#### VIII. Split data into training and validation datasets

In [None]:
#Hold out 10% of the examples for validation.
#Inputs, labels and attention masks go through ONE train_test_split call, so all three are
#shuffled and split with the same indices and row i of every result describes the same sentence.
#(Splitting the masks in a second call only stays aligned while both calls use the same
#random_state and the same number of rows.)
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(input_ids, labels, attention_masks,
                                                   random_state=123, test_size=0.1)

#### IX. Converting data into torch tensors

In [29]:
#Focus: Training dataset
#Convert the split arrays/lists into torch tensors (the names are rebound to the tensors)
train_inputs, train_labels, train_masks = (
    torch.tensor(data) for data in (train_inputs, train_labels, train_masks)
)

#Focus: Validation dataset
validation_inputs, validation_labels, validation_masks = (
    torch.tensor(data) for data in (validation_inputs, validation_labels, validation_masks)
)

#### X. Select batch size and create iterator

In [36]:
#Batch size for training (recommended for finetuning: 16 or 32)
batch_size = 32

#Focus: Training dataset
#Bundle the tensors so that one dataset item is an (input_ids, attention_mask, label) triple.
#RandomSampler makes the DataLoader draw the training examples in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

#Print the first batch to show the structure of what train_dataloader yields
print("DataLoader (train_dataloader):")
for batch in train_dataloader:
    print(batch)
    break

#Focus: Validation dataset
#Same (input_ids, attention_mask, label) layout as the training data.
#SequentialSampler makes the DataLoader go through the validation examples in their stored order.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)


DataLoader (train_dataloader):
[tensor([[  101, 13097, 26393,  ...,     0,     0,     0],
        [  101,  2009,  2001,  ...,     0,     0,     0],
        [  101,  2198,  2387,  ...,     0,     0,     0],
        ...,
        [  101,  2198,  5720,  ...,     0,     0,     0],
        [  101,  2585,  8823,  ...,     0,     0,     0],
        [  101,  1037,  3899,  ...,     0,     0,     0]]), tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]]), tensor([0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
        1, 1, 0, 1, 1, 1, 1, 1])]


#### XI. Approach 1: Initialize a BERT model from scratch

In [43]:
from transformers import BertModel, BertConfig

#BertConfig() with no arguments holds the default architecture and hyperparameters of a BERT model.
#Building BertModel from a configuration object creates the model without loading pre-trained weights.
model = BertModel(BertConfig())

#The configuration stays reachable through the model
configuration = model.config
print(configuration)

BertConfig {
  "_attn_implementation_autoset": true,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



#### XII. Approach 2: Downloading a pre-trained BERT model

In [44]:
#Load the pre-trained bert-base-uncased weights with a 2-class classification head,
#wrap the model in nn.DataParallel, and move it to the selected device
model = nn.DataParallel(
    BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DataParallel(
  (module): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=7

#### XIII. Group model parameters for weight decay

In [50]:
#Save the model's (name, parameter) pairs
model_parameters = list(model.named_parameters())

#Parameters whose name contains one of these strings are excluded from weight decay
no_decay = ['bias', 'LayerNorm.weight']

#Two optimizer parameter groups. The per-group option must be spelled 'weight_decay':
#that is the name torch.optim.AdamW reads. A group key it does not know (such as
#'weight_decay_rate') is ignored, and the optimizer's default decay is then applied to
#every group, including biases and LayerNorm weights.
grouped_model_parameters = [
    #p refers to the parameter tensor, n refers to name of the parameter
    #Group 1: all remaining weights, decayed with rate 0.1
    {'params': [p for n, p in model_parameters if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.1},

    #Group 2: biases and LayerNorm weights, not decayed
    {'params': [p for n, p in model_parameters if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

#### XIV. Set hyperparameters

In [52]:
#I. Number of training epochs
epochs = 4

#II. Optimizer (AdamW is imported in the import cell at the top of the notebook)
#AdamW is the Adam variant with decoupled weight decay: the decay is applied directly to the
#weights during the optimization step rather than being added to the loss as an L2 penalty
optimizer = AdamW(grouped_model_parameters, lr=2e-5, eps=1e-8)

#III. Total number of training steps (number of batches * number of epochs)
total_steps = epochs * len(train_dataloader)

#IV. Learning rate scheduler: no warm-up steps, scheduled over all total_steps training steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
#Accuracy of predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of examples whose highest-scoring class equals the true label.

    preds:  2-D NumPy array of scores, one row per example and one column per class.
    labels: NumPy array with the true class index of every example; it is flattened
            before the comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

#### XV. Training loop

In [None]:
#Initialize two empty lists
t = []
train_loss_set = [] #loss of every training batch, in the order the batches were processed

for _ in trange(epochs, desc="Epoch"):

  #I. Training

  #Set model to training mode
  model.train()

  #Tracking variables
  tr_loss = 0 #training loss
  nb_tr_examples, nb_tr_steps = 0, 0 #number of training examples/steps

  #Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    #Move the batch to the selected device
    batch = tuple(tensor.to(device) for tensor in batch)

    #Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    #Clear out the gradients
    optimizer.zero_grad()

    #Forward pass
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

    #The model is wrapped in nn.DataParallel: with several GPUs the gathered loss holds one value
    #per replica, so it is averaged into a scalar. With a single device mean() leaves it unchanged.
    loss = outputs['loss'].mean()
    batch_loss = loss.item()
    train_loss_set.append(batch_loss)

    #Compute the gradients of the loss with respect to the model's parameters
    loss.backward()

    #Update model's parameters
    optimizer.step()

    #Update the learning rate
    scheduler.step()

    #Update tracking variables
    tr_loss += batch_loss
    nb_tr_examples += b_input_ids.size(0) #number of examples processed (accumulates the batches)
    nb_tr_steps += 1

  #Average training loss per batch for this epoch
  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  #II. Validation

  #Set model to evaluation mode
  model.eval()

  #Tracking variables
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  #Evaluate data for one epoch
  for batch in validation_dataloader:
    #Move the batch to the selected device
    batch = tuple(tensor.to(device) for tensor in batch)

    #Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    #Telling the model not to compute/store gradients, saving memory and speeding up validation
    with torch.no_grad():
      #Forward pass (no labels = logit predictions only)
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    #Move logits and labels to CPU (detach() is harmless here: no gradients were recorded under no_grad)
    logits = outputs['logits'].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    #Calculate the accuracy of predictions vs labels
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    #Weight every batch by its size: the last batch can be smaller than batch_size, and a plain
    #average of per-batch accuracies would give its examples too much influence
    eval_accuracy += tmp_eval_accuracy * len(label_ids)
    nb_eval_examples += len(label_ids)
    nb_eval_steps += 1

  #Accuracy over all validation examples of this epoch
  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_examples))

#### XVI. Evaluate on Out-Of-Domain (OOD) test data

##### I. Load the dataset

In [None]:
#Load the out-of-domain evaluation set (tab-separated, no header row).
#No earlier cell downloads this file, so when it is missing from the working directory it is
#read directly from the repository that provides the training file.
#NOTE(review): assumes out_of_domain_dev.tsv is published next to in_domain_train.tsv - confirm the URL
OOD_FILE = "out_of_domain_dev.tsv"
OOD_URL = "https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-and-Computer-Vision-3rd-Edition/master/Chapter05/out_of_domain_dev.tsv"
OOD_COLUMNS = ['sentence_source', 'label', 'label_notes', 'sentence']

try:
    df = pd.read_csv(OOD_FILE, delimiter='\t', header=None, names=OOD_COLUMNS)
except FileNotFoundError:
    df = pd.read_csv(OOD_URL, delimiter='\t', header=None, names=OOD_COLUMNS)

In [None]:
#Show the dimensions of the out-of-domain DataFrame as (number of rows, number of columns)
df.shape

##### II. Create sentences, label lists and add BERT tokens

In [None]:
#Add [CLS] and [SEP] tokens at the beginning and end of each out-of-domain sentence for BERT
sentences = ["[CLS] " + text + " [SEP]" for text in df['sentence'].values]

#Labels as a NumPy array, in the same row order as the sentences
labels = df['label'].values

##### III. Tokenize the sentences

In [None]:
#Split every out-of-domain sentence into BERT tokens with tokenize()
tokenized_texts = list(map(tokenizer.tokenize, sentences))

##### IV. Set max_len, convert tokens to IDs (index numbers in BERT vocabulary), and pad sentences

In [56]:
#Same fixed sequence length as for the training data
MAX_LEN = 128

#Map each token to its index number in the BERT vocabulary, then pad at the end with 0
#or cut at the end so that every row has exactly MAX_LEN entries
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)

##### V. Create attention masks

In [54]:
#Attention masks: 1.0 for every token whose ID is greater than 0, 0.0 for the zero padding
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

##### VI. Convert data into torch tensors

In [55]:
#Convert the out-of-domain IDs, masks and labels into torch tensors
prediction_inputs, prediction_masks, prediction_labels = (
    torch.tensor(data) for data in (input_ids, attention_masks, labels)
)

##### VII. Select batch size and create iterator

In [None]:
#Select batch size
batch_size = 32

#Bundle the tensors so that one dataset item is an (input_ids, attention_mask, label) triple
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)

#SequentialSampler makes the DataLoader go through the examples in their stored order
prediction_sampler = SequentialSampler(prediction_data)

#Create batches using DataLoader
prediction_dataloader = DataLoader(prediction_data, batch_size=batch_size, sampler=prediction_sampler)

##### VIII. Define softmax function

In [None]:
#Softmax logits (np is imported in the import cell at the top of the notebook)
def softmax(logits):
    """Convert logits into probabilities that sum to 1 along the last axis.

    logits: array-like of scores. A 1-D input is treated as the scores of one example;
            for a 2-D input every row is normalised on its own.
    Returns a NumPy array with the same shape as `logits`.

    The largest logit of each row is subtracted before exponentiating. This leaves the
    result unchanged mathematically but keeps np.exp from overflowing to inf (and the
    division from producing nan) when the logits are large.
    """
    logits = np.asarray(logits)
    if logits.size == 0:
        #Nothing to normalise; np.max would raise on an empty array
        return np.exp(logits)
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=-1, keepdims=True)

##### IX. Run model

In [None]:
#torch and np are imported in the import cell at the top of the notebook

#Set model to evaluation mode
model.eval()

#Tracking variables: one entry per batch in each list
raw_predictions, predicted_classes, true_labels = [], [], []

#Run the model once over the prediction data
for batch in prediction_dataloader:
  #Move the batch to the selected device
  batch = tuple(tensor.to(device) for tensor in batch)

  #Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch

  #Telling the model not to compute/store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    #Forward pass (no labels = logit predictions only)
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  #Move logits, labels, and input_ids to CPU as NumPy arrays
  logits = outputs['logits'].detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  b_input_ids = b_input_ids.to('cpu').numpy()

  #Convert input_ids back to words
  batch_sentences = [tokenizer.decode(ids, skip_special_tokens=True) for ids in b_input_ids]

  #Convert logits into per-row probabilities and keep them as a NumPy array, so that
  #everything stored below is NumPy (np.argmax is no longer applied to a torch tensor)
  probabilities = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()

  #The predicted class is the one with the highest probability
  batch_predictions = np.argmax(probabilities, axis=1)

  #Print the sentences and the corresponding predictions for this batch
  for i, sentence in enumerate(batch_sentences):
    print(f"Sentence: {sentence}")
    print(f"Logits: {logits[i]}")
    print("Softmax probabilities", probabilities[i])
    print(f"Prediction: {batch_predictions[i]}")
    print(f"True label: {label_ids[i]}")

  #Store raw predictions, predicted classes and true labels
  raw_predictions.append(logits)
  predicted_classes.append(batch_predictions)
  true_labels.append(label_ids)

##### X. Calculate Matthews Correlation Coefficient

In [None]:
#I. Matthews Correlation Coefficient per batch
from sklearn.metrics import matthews_corrcoef

#One coefficient per batch: pair the true labels of a batch with the predictions for the same batch
matthews_set = [
    matthews_corrcoef(batch_labels, batch_preds)
    for batch_labels, batch_preds in zip(true_labels, predicted_classes)
]

In [None]:
#II. Matthews Correlation Coefficient for the entire evaluation set
from sklearn.metrics import matthews_corrcoef

#Flatten the per-batch entries of true_labels and predicted_classes into one-dimensional lists
true_labels_total = []
for batch_labels in true_labels:
    true_labels_total.extend(batch_labels)

predicted_classes_total = []
for batch_preds in predicted_classes:
    predicted_classes_total.extend(batch_preds)

#Calculate the MCC for the entire set of predictions
mcc = matthews_corrcoef(true_labels_total, predicted_classes_total)

print(f"MCC: {mcc}")