# EDA and Preprocessing

In [1]:
#Importing Libraries

import torch
import pandas as pd
from tqdm.notebook import tqdm  #for status bar
import os

#Check if GPU is available
torch.cuda.is_available()

True

In [2]:
#Using SMILE Twitter dataset
df = pd.read_csv(os.path.join(os.getcwd(),'data\smile-annotations-final.csv'), names = ['id','text' ,'category'])
df.set_index('id', inplace= True)

In [3]:
df.category.value_counts()

nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|angry               2
sad|disgust             2
sad|disgust|angry       1
Name: category, dtype: int64

In [4]:
#Removing the no code and multiple categorical columns
df = df[~df.category.str.contains('\|')]
df = df[df.category != 'nocode']
df.category.value_counts()

happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: category, dtype: int64

In [5]:
#Creating an encoding dict for the categories
labels = df.category.unique()
label_dict = {}
for i,label in enumerate(labels):
    label_dict[label] = i

label_dict

{'happy': 0,
 'not-relevant': 1,
 'angry': 2,
 'disgust': 3,
 'sad': 4,
 'surprise': 5}

In [6]:
#Adding the encoding label to the dataset
df['label'] = df.category.replace(label_dict)
df.head()

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0


# Training and Validation Split 

In [7]:
#Look at the data split at different categories
df.category.value_counts()

happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: category, dtype: int64

In [8]:
#Since there are a few categories with very few samples, we do a stratified split for a proper representation in train and valid
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df.index.values,
                                                  df.label.values,
                                                  test_size = 0.15,
                                                  random_state = 7,
                                                 stratify = df.label.values)

In [9]:
#Creating a data_type column in dataframe to identify test and train
df['data_type'] = 'not_set' * df.shape[0]

In [10]:
# Setting the datatype to train or valid based on split
df.loc[X_train , 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'valid'

In [11]:
#Checking if the stratified split is working
df.groupby(['category' , 'label' , 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
angry,2,train,48
angry,2,valid,9
disgust,3,train,5
disgust,3,valid,1
happy,0,train,966
happy,0,valid,171
not-relevant,1,train,182
not-relevant,1,valid,32
sad,4,train,27
sad,4,valid,5


# Loading Tokenizer and Encoding the Data

In [12]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [13]:
# Loading the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',  # Uses the lower cased version of BERT
                                          do_lower_case = True, # Converts our sentences to lower case                             
)

In [14]:
# Encoding the data
#Training encoder
encoded_train = tokenizer.batch_encode_plus(      #Batch encodes uses batch encoding- parallel processing
    df[df.data_type == 'train'].text.values,      # Encoding the text column of train
    add_special_tokens = True,                     # Tells BERT when a sentence end and when another one begins
    return_attention_mask = True,                # Identifies the sentence(actual data) in the padded data  
    pad_to_max_length = True,                     # Pads the smaller sentences
    max_length = 256,                             # Here 256 since the tweets are limited to 256
    return_tensors = 'pt'                         # returns in the form of tensors ('pt' = pytorch , 'tf' = tensorflow)
)

#Validation encoder
encoded_valid = tokenizer.batch_encode_plus(      #Batch encodes uses batch encoding- parallel processing
    df[df.data_type == 'valid'].text.values,      # Encoding the text column of valid
    add_special_tokens = True,                     # Tells BERT when a sentence end and when another one begins
    return_attention_mask = True,                # Identifies the sentence(actual data) in the padded data  
    pad_to_max_length = True,                     # Pads the smaller sentences
    max_length = 256,                             # Here 256 since the tweets are limited to 256
    return_tensors = 'pt'                         # returns in the form of tensors ('pt' = pytorch , 'tf' = tensorflow)
)

#Training data

input_ids_train = encoded_train['input_ids']                    #contains the tensor of input ids
attention_masks_train = encoded_train['attention_mask']         #contains the tensor of attention masks

labels_train = torch.tensor(df[df.data_type == 'train'].label.values)   # Converting the label from original data into tensor(train)


#Validation data

input_ids_valid = encoded_valid['input_ids']                    #contains the tensor of input ids
attention_masks_valid = encoded_valid['attention_mask']         #contains the tensor of attention masks

labels_valid = torch.tensor(df[df.data_type == 'valid'].label.values)   # Converting the label from original data into tensor(valid)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [15]:
# Creating a Tensor Dataset : standard way of using the pytorch library
#Training dataset
dataset_train = TensorDataset(input_ids_train, attention_masks_train , labels_train)

#Validation dataset
dataset_valid = TensorDataset(input_ids_valid, attention_masks_valid, labels_valid)

print("Length of training dataset:" , len(dataset_train))
print("Length of validation dataset:", len(dataset_valid))

Length of training dataset: 1258
Length of validation dataset: 223


# Setting up BERT Pretrained Model 

In [16]:
from transformers import BertForSequenceClassification

In [17]:
#Loading the bert-base-uncased mode
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',           #using the bert-base-cased . Less computational
    num_labels = len(label_dict),  #Numner of labels for classification
    output_attentions = False,      # Hides the attention output
    output_hidden_states = False   #Hides the hidden state output
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Creating Data Loaders

In [18]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [19]:
#Creating data loaders for train and valid
#Setting batch size to 32. Can adjust
batch_size = 8

dataloader_train = DataLoader(
    dataset_train,
    sampler = RandomSampler(dataset_train),    #Random Sampling the training data
    batch_size = batch_size
)


dataloader_valid = DataLoader(
    dataset_valid,
    sampler = RandomSampler(dataset_valid),    #Random Sampling the validation data
    batch_size = batch_size
)

In [20]:
for batch in dataloader_valid:
    print(batch[2])
    break

tensor([0, 0, 1, 0, 0, 0, 1, 2])


# Optimizer and Scheduler

In [21]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [22]:
#Using adams optimizer
optimizer = AdamW(  
    model.parameters(),  #Passing in the model parameters
    lr= 1e-5,            #The BERT paper suggest anything between 2e-5 to 5e-5
    eps = 1e-8
)

In [23]:
#Scheduler : To alter the learning rate
epochs = 10

scheduler = get_linear_schedule_with_warmup(
    optimizer,                                            #Passing in Adam optimizer
    num_warmup_steps = 0,
    num_training_steps = len(dataloader_train)*epochs     #Number of steps to cover the whole batch for 10 epochs
)

# Parameter Metrics

In [24]:
import numpy as np
from sklearn.metrics import f1_score

In [25]:
#the output is the predicted probabilites. of format[0.9, 0.05, 0.05,0 , 0.0]
#We want the ouput in the for of binary one hots [1 0 0 0 0 0]

In [36]:
#F1 score
def f1_score_fn(preds, labels):
    pred_flat = np.argmax(preds, axis = 1).flatten()   #flatten to convert list of lists to list
    label_flat= labels.flatten()
    
    return f1_score(label_flat,pred_flat, average = 'weighted')   #Using weighted average due to skewed classes


In [27]:
#Accuracy metrics
def accuracy_fn(preds, labels):
    
    y_pred = np.argmax(preds, axis = 1).flatten()   #flatten to convert list of lists to list
    label_flat= label.flatten()
    
    #Reverse dict to get class labels
    rev_label_dict = dict((v,k) for (k,v) in label.dict.items())
    
    #Printing the accuracy of each class
    for label in np.unique(label_flat):
        y_preds = y_pred[label_flat == label]      #Numpy Indexing #### only the indexes of that particual label are selecetd
                                                     # Works since they are the same length
        y_true = label_flat[label_flat == label]       #getting only the particular class true values
        
        print(f'Class : {rev_label_dict[label]}')
        print(f'Accuracy : {len(y_preds[y_preds == label])/len(y_true)} \n')

# Creating the Training Loop

From HugginFace run_glue.py

In [28]:
#Setting seeds for everything. Important to stay consistent

In [29]:
import random
torch.cuda.empty_cache()
seed = 21
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)        #Use when using CPU
torch.cuda.manual_seed(seed)   #since we are using GPU

In [30]:

#Checking and setting the device to GPU or CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
model.to(device)    #Setting torch to use the GPU

print(device)

cuda


In [31]:
def evaluate(dataloader_val):
    
    #Same as training but no back prog. 
    #Set to model eval to stop backprob and freezes all the weights
    model.eval()
    
    loss_val_total = 0     #The total looss 
    predictions, true_vals = [] , []
    
    #Looping through each batch and outputting loss
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)   #Making sure that the values are passed to the GPU
        
        inputs = {
            'input_ids' :        batch[0],
            'attention_mask' :   batch[1],
            'labels' :           batch[2]
        }
        
        #Disbale gradients
        with torch.no_grad():
            outputs = model(**inputs)    #Unpacking the dictionary and passing as params 
        
        loss = outputs[0]   #loss is the first value of output
        logits = outputs[1] #logit is the second value of output
        loss_val_total += loss.item()
        
        #Use logits as prediction
        #Using detach.cpu to pool the values from GPU and use with nump
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
        
    loss_val_avg = loss_val_total/len(dataloader_val)    #Average loss per epoch
    
    predictions = np.concatenate(predictions, axis = 0)
    true_vals = np.concatenate(true_vals, axis = 0)
    
    return loss_val_avg, predictions, true_vals


In [37]:
for epoch in tqdm(range(1, epochs+1)):         #Starts from 1
    
    model.train()
    
    loss_train_total = 0
    
    #Progress bar to look at the progress each epoch
    progress_bar = tqdm(dataloader_train,
                       desc = 'Epoch {:1d}'.format(epoch),
                       leave= False,
                       disable= False)
    
    for batch in progress_bar:
        
        model.zero_grad()   #Setting the gradient to 0( Transformers 0, RNN non zero)
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {
            'input_ids'      : batch[0],
            'attention_mask' : batch[1],
            'labels'         : batch[2]
        }
        
        outputs = model(**inputs)
        
        loss = outputs[0]   #Loss from output
        loss_train_total += loss.item()   #Appending loss to total loss
        loss.backward()     #Backward Prop
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #Controlling the gradient from going above 1 or very low
        
        optimizer.step()   #Stepping the optimizer
        scheduler.step()   #Stepping the scheduler
        
        #Setting the avg loss values per batch to 3 decimals for the progress bar 
        progress_bar.set_postfix({'training_loss' : '{:.3f}'.format(loss.item()/len(batch))})
        
    #Saving the model each epoch
    torch.save(model.state_dict() , f'Models/Bert_ft_epoch{epoch}.model')    #Saving model for each epoch in models folder
    
    tqdm.write(f'\nEpoch {epoch}')
    
    #Getting average loss per epoch and printing out
    loss_train_avg = loss_train_total/ len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    #Evaluating the model on validation data for the current epoch
    val_loss, predictions, true_vals = evaluate(dataloader_valid)    #passing the validation data to evaluate function
    val_f1 = f1_score_fn(predictions, true_vals)    #Getting the validation F1 score
    tqdm.write(f'Validation loss: {val_loss}')   #Writing out the validation loss
    tqdm.write(f'F1 Score (weighted): {val_f1}') #Writing out the F1 score

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=158.0), HTML(value='')))


Epoch 1
Training loss: 0.16844460349294205
Validation loss: 0.405373378207774
F1 Score (weighted): 0.8780381571449822


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=158.0), HTML(value='')))


Epoch 2
Training loss: 0.11790616776723462
Validation loss: 0.39227869782397257
F1 Score (weighted): 0.8844692662900941


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=158.0), HTML(value='')))


Epoch 3
Training loss: 0.08019513762968628
Validation loss: 0.41154414118500426
F1 Score (weighted): 0.8916633613998916


HBox(children=(HTML(value='Epoch 4'), FloatProgress(value=0.0, max=158.0), HTML(value='')))


Epoch 4
Training loss: 0.06356846788585704
Validation loss: 0.3917652557803584
F1 Score (weighted): 0.8967332357079949


HBox(children=(HTML(value='Epoch 5'), FloatProgress(value=0.0, max=158.0), HTML(value='')))


Epoch 5
Training loss: 0.04848076137006754
Validation loss: 0.3959224443200843
F1 Score (weighted): 0.900353447316814


HBox(children=(HTML(value='Epoch 6'), FloatProgress(value=0.0, max=158.0), HTML(value='')))


Epoch 6
Training loss: 0.04665842020730901
Validation loss: 0.4033178183183606
F1 Score (weighted): 0.8951849852337999


HBox(children=(HTML(value='Epoch 7'), FloatProgress(value=0.0, max=158.0), HTML(value='')))


Epoch 7
Training loss: 0.04570013484947053
Validation loss: 0.4053219575850692
F1 Score (weighted): 0.8951849852337999


HBox(children=(HTML(value='Epoch 8'), FloatProgress(value=0.0, max=158.0), HTML(value='')))


Epoch 8
Training loss: 0.0436897082563749
Validation loss: 0.40380466406350024
F1 Score (weighted): 0.8951849852337999


HBox(children=(HTML(value='Epoch 9'), FloatProgress(value=0.0, max=158.0), HTML(value='')))


Epoch 9
Training loss: 0.04403958103584149
Validation loss: 0.4044759799518423
F1 Score (weighted): 0.8951849852337999


HBox(children=(HTML(value='Epoch 10'), FloatProgress(value=0.0, max=158.0), HTML(value='')))


Epoch 10
Training loss: 0.04349548465578192
Validation loss: 0.4053167658774847
F1 Score (weighted): 0.8951849852337999



# Loading saved model and Evaluating

In [38]:
#Loading the same model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',           #using the bert-base-cased . Less computational
    num_labels = len(label_dict),  #Numner of labels for classification
    output_attentions = False,      # Hides the attention output
    output_hidden_states = False   #Hides the hidden state output
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [42]:
model.load_state_dict(torch.load('Models/Bert_ft_epoch10.model', map_location = torch.device('cpu')))

<All keys matched successfully>

In [43]:
_, predictions, true_vals = evaluate(dataloader_valid)

RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select

In [44]:
accuracy_fn(predictions,true_vals)

UnboundLocalError: local variable 'label' referenced before assignment