Import Required Libraries & Dataset

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast, BertConfig, BertTokenizer, BertModel

# Select the compute device: prefer the GPU, but fall back to the CPU when CUDA is unavailable
# so the notebook still runs (more slowly) on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the labelled messages; the cells below read the 'text' and 'label' columns
df = pd.read_csv("spamdata_v2.csv")
# Preview the first rows
df.head()

In [None]:
# check class distribution (normalize=True reports each label's share instead of raw counts)
df['label'].value_counts(normalize = True)

Split the Dataset into train / validation / test

In [None]:
# Split the dataset into train (70%), validation (15%) and test (15%) sets.
# Both splits are stratified on the label so each subset keeps the class distribution.
train_text, temp_text, train_labels, temp_labels = train_test_split(df['text'], df['label'],
                                                                    random_state=2018,
                                                                    test_size=0.3,
                                                                    stratify=df['label'])

# Halve the 30% hold-out pool into the validation and test sets
val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
                                                                random_state=2018,
                                                                test_size=0.5,
                                                                stratify=temp_labels)

# Report the size of the three final subsets (not of the intermediate hold-out pool)
print("train:", train_text.shape)
print("validation:", val_text.shape)
print("test:", test_text.shape)

Import BERT (bert-base-uncased)

In [None]:
# Load the pretrained 'bert-base-uncased' encoder; it is wrapped by the classifier defined further down
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load the matching tokenizer from the same checkpoint name, so token ids agree with the encoder
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# Display the tokenizer's representation as the cell output
# (uncomment the next line to display the model instead)
# bert
tokenizer

In [None]:
# Whitespace-delimited word count of every message in the training set
seq_len = list(map(lambda message: len(message.split()), train_text))

# Histogram of the message lengths
word_counts = pd.Series(seq_len)
word_counts.hist(bins = 30)

Tokenize & Encode the Sequences

In [None]:
def encode_texts(texts, max_length=25):
    """Tokenize and encode a pandas Series of messages into fixed-length sequences.

    Every message is truncated or padded to exactly `max_length` tokens.
    `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
    and calling the tokenizer directly replaces the deprecated `batch_encode_plus`.

    Args:
        texts: pandas Series of strings to encode.
        max_length: number of tokens per encoded sequence.

    Returns:
        The tokenizer's batch encoding; the cells below read its
        'input_ids', 'token_type_ids' and 'attention_mask' entries.
    """
    return tokenizer(
        texts.tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )


# tokenize and encode sequences in the training, validation and test sets
tokens_train = encode_texts(train_text)
tokens_val = encode_texts(val_text)
tokens_test = encode_texts(test_text)

print(len(tokens_train['input_ids']))
# e.g. 3900 sentences in the training set are later divided into 121 batches of size 32
# plus 1 last batch of size 28 (see the data loader code below)

# Inspect one encoded example: raw text, token ids, segment ids and attention mask
print(tokens_train.keys())
print(train_text.tolist()[44])
print(tokens_train['input_ids'][44])
print(tokens_train['token_type_ids'][44])
print(tokens_train['attention_mask'][44])

Test Examples

In [None]:
def show_encoding(encoding, row=None, label=None):
    """Print every field of a tokenizer encoding in raw, decoded and token form.

    Args:
        encoding: mapping of field name -> ids, as returned by the tokenizer.
        row: index of the example to show when `encoding` holds a batch;
            None when `encoding` holds a single example.
        label: optional label printed after each field.
    """
    for key, value in encoding.items():
        ids = value if row is None else value[row]
        report = 'KEY: {}\nVALUE: {}\nDECODE: {}\nCONVERTtoTOKENS: {}'.format(
            key, ids, tokenizer.decode(ids), tokenizer.convert_ids_to_tokens(ids))
        if label is not None:
            report += '\nLABEL: {}'.format(label)
        print(report)


# One training example together with its label
print("train text:", train_text.tolist()[8])
show_encoding(tokens_train, row=8, label=train_labels.values[8])

print("----")

# A similar hand-written sentence, padded/truncated exactly like the training data.
# padding='max_length' replaces the deprecated pad_to_max_length=True.
similar = ["Sorry, I'lll call later"]
print("similar text:", similar)
token_similar = tokenizer(list(similar), max_length=25, padding='max_length', truncation=True)
show_encoding(token_similar, row=0)

print("----------------------------------------------------------------------------------")

# Compare the tokenization of a word with that of a misspelled variant
Himalayas = "where is Himalayas in the world map?"
print("Himalayas text:", Himalayas)
token_Himalayas = tokenizer(Himalayas)
show_encoding(token_Himalayas)

print("----")

Himalayass = "where is Himalayass in the world map?"
print("Himalayass text:", Himalayass)
token_Himalayass = tokenizer(Himalayass)
show_encoding(token_Himalayass)

print("----------------------------------------------------------------------------------")

# Sentence-pair encoding: question and context are encoded together as a single sequence
q1 = 'Who was Tony Stark?'
c1 = 'Anthony Edward Stark known as Tony Stark is a fictional character in Avengers'
print("double text:", q1, c1)
encoding = tokenizer(q1, c1)
show_encoding(encoding)
    

List to Tensors

In [None]:
## convert lists to tensors and place them on the selected device
## (.to(device) instead of .cuda(), so the `device` chosen at the top is honoured)

train_seq = torch.tensor(tokens_train['input_ids']).to(device)
train_ids = torch.tensor(tokens_train['token_type_ids']).to(device)
train_mask = torch.tensor(tokens_train['attention_mask']).to(device)
train_y = torch.tensor(train_labels.tolist()).to(device)

val_seq = torch.tensor(tokens_val['input_ids']).to(device)
val_ids = torch.tensor(tokens_val['token_type_ids']).to(device)
val_mask = torch.tensor(tokens_val['attention_mask']).to(device)
val_y = torch.tensor(val_labels.tolist()).to(device)

test_seq = torch.tensor(tokens_test['input_ids']).to(device)
test_ids = torch.tensor(tokens_test['token_type_ids']).to(device)
test_mask = torch.tensor(tokens_test['attention_mask']).to(device)
test_y = torch.tensor(test_labels.tolist()).to(device)

# Example: show one training sentence as token lists and then as tensors.
# Text, encodings, label and tensors all come from the training split, so they
# describe the same sentence.

sent_idx = 7
print("train text:", train_text.tolist()[sent_idx])
for key, value in tokens_train.items():
    print( 'KEY: {}\nVALUE: {}\nDECODE: {}\nCONVERTtoTOKENS: {}\nLABEL: {}'.
          format(key, value[sent_idx], tokenizer.decode(value[sent_idx]),tokenizer.convert_ids_to_tokens(value[sent_idx]),train_labels.values[sent_idx]))
print('-------------')
print(train_seq[sent_idx])
print(train_ids[sent_idx])
print(train_mask[sent_idx])
print(train_y[sent_idx])

Data Loader

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#define a batch size
batch_size = 32


def preview_sampler(sampler, limit=5):
    """Print the first `limit` dataset indices produced by `sampler`.

    Every DataLoader uses a sampler internally to pick the indices of each batch;
    this shows the order in which a given sampler visits the dataset.
    """
    for count, index in enumerate(sampler):
        if count >= limit:
            break
        print('count:', count)
        print(index)


# wrap the training tensors: (input ids, attention mask, label) per example
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training (shuffled random indexes)
train_sampler = RandomSampler(train_data)
preview_sampler(train_sampler)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap the validation tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for sampling the data during validation (non shuffled sequential indexes)
val_sampler = SequentialSampler(val_data)
preview_sampler(val_sampler)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

print(train_data)
print(train_sampler)
print(train_dataloader)
print(val_data)
print(val_sampler)
print(val_dataloader)



In [None]:
# Fetch only the first validation batch (sequential indexes) and show its first example
first_val_batch = next(iter(val_dataloader), None)
if first_val_batch is not None:
    ids, mask, label = first_val_batch
    print('index:',ids[0],'\nmask:',mask[0],'\nlabel:',label[0])

In [None]:
# Fetch only the first training batch (shuffled indexes) and show its first example
first_train_batch = next(iter(train_dataloader), None)
if first_train_batch is not None:
    ids, mask, label = first_train_batch
    print('index:',ids[0],'\nmask:',mask[0],'\nlabel:',label[0])

Model Architecture

In [None]:
# Freeze every parameter of the pre-trained encoder.
# Only the new layers added on top of it are then optimized: the frozen weights keep
# the information learned during pre-training, while the trainable head learns to
# turn those features into predictions on the new dataset.
# Freezing also shortens training, because gradients only have to be propagated to
# (and weights updated for) the layers that are not frozen.
bert.requires_grad_(False);

In [None]:
class BERT_Arch(nn.Module):
    """Two-class classifier head stacked on top of a BERT-style encoder.

    The encoder's pooled output goes through
    Linear -> ReLU -> Dropout -> Linear(2) -> LogSoftmax, so the module returns
    log-probabilities (suitable for `nn.NLLLoss`).
    """

    def __init__(self, bert):
        """
        Args:
            bert: encoder module that, called as
                `bert(sent_id, attention_mask=mask, return_dict=False)`, returns a
                pair whose second element is the pooled output fed to the head.
        """
        super().__init__()

        self.bert = bert

        # Width of the encoder's pooled output: read from the encoder's config when it
        # exposes one, otherwise fall back to 768 (the value previously hard-coded here).
        hidden_size = getattr(getattr(bert, "config", None), "hidden_size", 768)

        # dropout layer
        self.dropout = nn.Dropout(0.1)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(hidden_size, 512)

        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512, 2)

        # log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: token ids passed to the encoder as its first argument.
            mask: attention mask passed to the encoder as `attention_mask`.

        Returns:
            Tensor with one row per example and 2 columns of log-probabilities.
        """
        # With return_dict=False the encoder returns a tuple; only its second element
        # (the pooled output) is used, the first is discarded.
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # log-softmax, so the result pairs with NLLLoss
        return self.softmax(x)

In [None]:
# Wrap the pre-trained BERT in the classifier architecture and move it to the selected device
model = BERT_Arch(bert).to(device)

In [None]:
# Show which device the training input-id tensor lives on
train_seq.device

In [None]:
# for i in model.parameters():
#     print(i.is_cuda)

In [None]:
# Define the optimizer with PyTorch's AdamW; the AdamW class formerly exported by
# `transformers` is deprecated in favour of torch.optim.AdamW.
# eps and weight_decay are set explicitly to the defaults of the old transformers
# implementation (1e-6 and 0.0) so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' class weights computed from the training labels, one weight per distinct label
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)

print(f"Class Weights: {class_weights}")

In [None]:
# Class weights as a float tensor on the selected device
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Weighted negative log-likelihood loss (the model outputs log-probabilities)
cross_entropy = nn.NLLLoss(weight=weights)

# number of training epochs
epochs = 10

Fine-Tune

In [None]:
# function to train the model
def train():
    """Train the global `model` for one epoch over `train_dataloader`.

    Uses the module-level `model`, `train_dataloader`, `cross_entropy`, `optimizer`
    and `device`.

    Returns:
        tuple: `(avg_loss, total_preds)` where `avg_loss` is the mean loss per batch
        and `total_preds` is a numpy array holding the model outputs of every batch
        of the epoch, concatenated along the sample axis.
    """
    # switch layers such as dropout to training behaviour
    model.train()
    total_loss = 0

    # model predictions of every batch
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(train_dataloader):

        # progress update after every 30 batches.
        if step % 30 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # push the batch to the selected device
        sent_id, mask, labels = [r.to(device) for r in batch]

        # clear previously calculated gradients
        model.zero_grad()

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # Store this batch's predictions on the CPU. This must happen inside the loop:
        # appending after it would keep only the last batch.
        total_preds.append(preds.detach().cpu().numpy())

    # compute the training loss of the epoch - average per batch
    avg_loss = total_loss / len(train_dataloader)

    # the list holds one (batch size, no. of classes) array per batch;
    # concatenate them into a single (number of samples, no. of classes) array
    total_preds = np.concatenate(total_preds, axis=0)

    # returns the loss and predictions
    return avg_loss, total_preds

In [None]:
# function for evaluating the model
def evaluate():
    """Evaluate the global `model` on `val_dataloader` without updating it.

    Uses the module-level `model`, `val_dataloader`, `cross_entropy` and `device`.

    Returns:
        tuple: `(avg_loss, total_preds)` where `avg_loss` is the mean loss per batch
        and `total_preds` is a numpy array holding the model outputs of every batch,
        concatenated along the sample axis.
    """
    print("\nEvaluating...")

    # Switch layers that behave differently during training and inference
    # (e.g. dropout, batch norm) to inference behaviour.
    model.eval()

    total_loss = 0

    # model predictions of every batch
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(val_dataloader):

        # Progress update every 30 batches.
        # (The former elapsed-time computation referenced names that are not defined
        # in this notebook and raised NameError; it was never displayed, so it is dropped.)
        if step % 30 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to the selected device
        sent_id, mask, labels = [t.to(device) for t in batch]

        # No gradients are needed during evaluation; disabling autograd speeds up
        # execution and reduces memory usage.
        with torch.no_grad():

            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)

            total_loss = total_loss + loss.item()

            total_preds.append(preds.detach().cpu().numpy())

    # compute the validation loss of the epoch - average per batch
    avg_loss = total_loss / len(val_dataloader)

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# Sanity check before training: run the classifier over every training batch and
# collect its outputs. The outputs are log-probabilities (the model ends with
# LogSoftmax), one row per sample and one column per class.
pred_perc_list = []

# Inference only: disable dropout and gradient tracking so the pass is deterministic
# for a given input and does not build an autograd graph.
model.eval()
with torch.no_grad():
    for step, batch in enumerate(train_dataloader):
        # each batch is (input ids, attention mask, labels)
        sent_id, mask, label = [t.to(device) for t in batch]

        pred_perc = model(sent_id, mask)  # one row per example of the batch
        pred_perc = pred_perc.detach().cpu().numpy()
        print('batch #', len(pred_perc_list), pred_perc.shape)
        pred_perc_list.append(pred_perc)

# The list holds one (size of batch, no. of classes) array per batch;
# concatenate them into a single (number of samples, no. of classes) array.
pred_perc_list_reshaped  = np.concatenate(pred_perc_list, axis=0)
print('pred percent list length:',len(pred_perc_list))
print('pred percent list reshaped:', pred_perc_list_reshaped.shape)
print('last pred perc:', pred_perc.shape, '\n', pred_perc) #last one


In [None]:
# Best validation loss seen so far; starts at infinity so the first epoch always improves on it
best_valid_loss = float('inf')

# Per-epoch history of the training and validation losses
train_losses = []
valid_losses = []

for epoch in range(epochs):

    print(f'\n Epoch {epoch + 1} / {epochs}')

    # One pass over the training set, then one over the validation set;
    # the returned predictions are not needed here.
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # Checkpoint the weights whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # Record this epoch's losses
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print('\nTraining Loss: {:.3f}'.format(train_loss))
    print('Validation Loss: {:.3f}'.format(valid_loss))

In [None]:
# Load the weights of the best model (lowest validation loss) saved during training.
# map_location remaps the stored tensors to the selected device, so a checkpoint
# written on a GPU can also be loaded on a machine without one.
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path, map_location=device))

Make predictions

In [None]:
# get predictions for test data
# Set inference mode explicitly (disables dropout) rather than relying on the mode
# left behind by whichever cell ran last.
model.eval()
with torch.no_grad(): #no gradients computation needed
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

In [None]:
# model's performance
# Convert the test labels to a numpy array only while they are still a tensor, so this
# cell can be re-run without failing (a numpy array has no .detach()).
if torch.is_tensor(test_y):
    test_y = test_y.detach().cpu().numpy()

# predicted class = index of the largest log-probability in each row
preds_01 = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds_01))

print(type(test_y))
print(type(preds_01))