In [1]:
# Importing the os module to interact with the operating system
import os

# Listing the contents of the current directory
# NOTE: the returned list is discarded -- this call is not the last expression
# of the cell, so the notebook does not display it.
os.listdir('.')

# Importing pandas for data manipulation and analysis
import pandas as pd

# Importing numpy for numerical computations
import numpy as np

# Importing random for generating random numbers and making choices
import random

# Importing tqdm for displaying progress bars during iterations
from tqdm.notebook import tqdm

# Importing necessary functions and classes from scikit-learn for
# machine learning tasks
from sklearn.model_selection import train_test_split  # For splitting
# data into train and test sets
from sklearn.metrics import f1_score  # For calculating F1 score

# Importing torch for building and training neural networks
import torch

# Importing transformers from Hugging Face for pre-trained models
# and tokenization

import transformers
from transformers import (BertTokenizer,  # For BERT tokenizer
    AutoTokenizer,  # For automatic selection of tokenizer
    BertForSequenceClassification,  # For BERT-based sequence classification model
    AdamW,  # For AdamW optimizer
    get_linear_schedule_with_warmup)  # For learning rate scheduling

# Importing necessary classes from torch.utils.data for handling datasets
from torch.utils.data import (TensorDataset, DataLoader,
                              RandomSampler, SequentialSampler)

In [2]:
# Upload the dataset through Colab's file-upload helper (requires a Colab runtime).
# NOTE(review): the recorded output shows the upload being saved as
# 'smile-annotations-final (2).csv', while the loading cell reads
# 'smile-annotations-final.csv' -- confirm the intended copy is the one being read.
from google.colab import files
uploaded = files.upload()

Saving smile-annotations-final.csv to smile-annotations-final (2).csv


In [3]:
# Load the annotated tweets. Column names are supplied explicitly and the
# 'id' column becomes the index (reassignment instead of inplace=True so the
# cell can be re-run safely).
df = pd.read_csv('smile-annotations-final.csv', names=['id', 'text', 'category'])
df = df.set_index('id')

# Show a preview and the raw class distribution before any filtering.
display('head', df.head())
display('category counts', df.category.value_counts())

# Keep only rows with exactly one real emotion label:
#   * drop multi-label rows, whose category contains a literal '|'
#     (regex=False matches the character itself, so no escaping is needed);
#   * drop rows annotated as 'nocode'.
# .copy() makes the result an independent frame, so adding columns below does
# not write into a slice of the original DataFrame.
is_multi_label = df.category.str.contains('|', regex=False)
df = df[~is_multi_label & (df.category != 'nocode')].copy()

display('category counts after cleanup', df.category.value_counts())

# Map every remaining category to an integer id, in order of first appearance.
possible_labels = df.category.unique()
label_dict = {category: index for index, category in enumerate(possible_labels)}

# Series.map gives an integer column directly; replace() would have to
# downcast an object column after substituting the strings.
df['label'] = df.category.map(label_dict)

# Preview the frame with the new 'label' column.
df.head()

'head'

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
614877582664835073,@Sofabsports thank you for following me back. ...,happy
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


'category counts'

category
nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: count, dtype: int64

'category counts after cleanup'

category
happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: count, dtype: int64

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0


In [4]:
# Stratified 85/15 split over the row ids, so each class keeps the same
# proportion in the training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=17,
    stratify=df.label.values,
)

# Tag every row with the split it belongs to; rows start out as 'not_set'.
df['data_type'] = 'not_set'
for _split_name, _split_ids in (('train', X_train), ('val', X_val)):
    df.loc[_split_ids, 'data_type'] = _split_name

# Row counts per (category, label, split) as a sanity check of the split.
df.groupby(['category', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
angry,2,train,48
angry,2,val,9
disgust,3,train,5
disgust,3,val,1
happy,0,train,966
happy,0,val,171
not-relevant,1,train,182
not-relevant,1,val,32
sad,4,train,27
sad,4,val,5


In [5]:
# Using the BERT tokenizer from the 'bert-base-uncased' model
# and setting do_lower_case to True to ensure all text is lowercased
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)


def _encode_split(split_name, max_length=45):
    """Tokenize the rows of `df` whose `data_type` equals `split_name`.

    Every text is padded to exactly `max_length` tokens and explicitly
    truncated to that length. The explicit `padding` / `truncation`
    arguments replace the deprecated `pad_to_max_length=True`, which only
    truncated implicitly and emitted a warning.

    Args:
        split_name: value of the 'data_type' column to select ('train' or 'val').
        max_length: number of tokens per encoded sequence.

    Returns:
        (encoded, labels): the tokenizer's batch encoding (PyTorch tensors)
        and a tensor built from the 'label' column of the same rows.
    """
    split_df = df[df.data_type == split_name]
    encoded = tokenizer.batch_encode_plus(
        split_df.text.values.tolist(),   # plain list of strings
        add_special_tokens=True,         # add [CLS] and [SEP]
        return_attention_mask=True,      # mask separating real tokens from padding
        padding='max_length',            # pad every sequence to max_length
        truncation=True,                 # cut longer sequences to max_length
        max_length=max_length,
        return_tensors='pt',             # return PyTorch tensors
    )
    labels = torch.tensor(split_df.label.values)
    return encoded, labels


# Encoding the training and validation texts with identical settings
encoded_data_train, labels_train = _encode_split('train')
encoded_data_val, labels_val = _encode_split('val')

# Extracting input IDs and attention masks for the training set
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

# Extracting input IDs and attention masks for the validation set
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

# Creating PyTorch datasets of (input_ids, attention_mask, label) triples
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [6]:
# Seed every RNG *before* the model is built: the classification head is not
# part of the pre-trained checkpoint and is randomly initialised, so seeding
# only after this point would leave the starting weights different on every run.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Initializing the BERT model for sequence classification from the pre-trained 'bert-base-uncased' model
# Specifying the number of labels in the output layer based on the length of the label dictionary
# Setting output_attentions and output_hidden_states to False to exclude additional outputs
# (the deprecated `resume_download` flag is no longer passed)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

# Defining the batch size for training and validation
batch_size = 32

# Creating data loaders for training and validation sets
# Using RandomSampler for training data and SequentialSampler for validation data
dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)

# Initializing the optimizer with the BERT model parameters
# (learning rate 2e-5, epsilon 1e-8). torch.optim.AdamW replaces the deprecated
# transformers.AdamW; weight_decay=0.0 is passed explicitly because the torch
# default (0.01) differs from the transformers default (0.0), and this keeps
# the update rule unchanged.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=2e-5,
                              eps=1e-8,
                              weight_decay=0.0)

# Defining the number of epochs for training
epochs = 7

# Creating a linear scheduler (no warmup) that decays the learning rate over
# the total number of optimizer steps: batches per epoch * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

# Defining a function to calculate the F1 score
def f1_score_func(preds, labels):
    """Return the weighted F1 score of `preds` against `labels`.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the
            predicted class is the argmax along axis 1.
        labels: array of true class ids (flattened before scoring).

    Returns:
        The value of `f1_score(..., average='weighted')`.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.flatten()
    weighted_f1 = f1_score(true_classes, predicted_classes, average='weighted')
    return weighted_f1

# Defining a function to calculate accuracy per class
def accuracy_per_class(preds, labels):
    """Print, for each class present in `labels`, the number of correct predictions.

    Class names are looked up through the module-level `label_dict`
    (name -> id), which is inverted here.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the
            predicted class is the argmax along axis 1.
        labels: array of true class ids.

    Returns:
        None. For every class id found in `labels`, two lines are printed:
        the class name and "correct/total" for the samples of that class.
    """
    id_to_name = dict((class_id, name) for name, class_id in label_dict.items())

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        n_total = int(np.count_nonzero(in_class))
        n_correct = int(np.count_nonzero(predicted[in_class] == class_id))
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
### seed every random number generator in use so runs can be reproduced ###
seed_val = 17
for _seed_rng in (random.seed, np.random.seed,
                  torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed_val)

# Use the GPU when one is available, otherwise fall back to the CPU,
# then move the model there and report which device was picked.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
print(device)

# Defining the evaluation function for the validation set
def evaluate(dataloader_val):
    """Run the module-level `model` over `dataloader_val` without gradient tracking.

    Each batch is expected to be a sequence whose first three items are
    (input_ids, attention_mask, labels); every tensor is moved to the
    module-level `device` before the forward pass.

    Args:
        dataloader_val: iterable of batches that also supports `len()`.

    Returns:
        (loss_val_avg, predictions, true_vals):
            loss_val_avg -- sum of per-batch losses divided by the number of batches,
            predictions  -- logits of all batches concatenated along axis 0 (numpy),
            true_vals    -- labels of all batches concatenated along axis 0 (numpy).
    """
    model.eval()  # evaluation mode

    batch_losses = []
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        on_device = tuple(tensor.to(device) for tensor in batch)
        inputs = dict(input_ids=on_device[0],
                      attention_mask=on_device[1],
                      labels=on_device[2])

        # Forward pass only; no gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(**inputs)

        # The model output is indexed as (loss, logits, ...).
        batch_losses.append(outputs[0].item())
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(inputs['labels'].cpu().numpy())

    loss_val_avg = sum(batch_losses) / len(dataloader_val)

    # Stitch the per-batch arrays back together.
    predictions = np.concatenate(logit_chunks, axis=0)
    true_vals = np.concatenate(label_chunks, axis=0)

    return loss_val_avg, predictions, true_vals

cpu


In [8]:
# Training loop: one pass over dataloader_train per epoch, followed by a
# checkpoint save and an evaluation on the validation set.
for epoch in tqdm(range(1, epochs+1)):

    model.train()  # Setting the model to training mode

    loss_train_total = 0  # Sum of per-batch training losses for this epoch

    # Progress bar for training epoch
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()  # Resetting gradients

        batch = tuple(b.to(device) for b in batch)  # Moving batch tensors to the device

        inputs = {'input_ids':      batch[0],      # Input token IDs
                  'attention_mask': batch[1],      # Attention masks
                  'labels':         batch[2],      # Labels
                 }

        outputs = model(**inputs)  # Forward pass

        loss = outputs[0]  # Extracting loss value from the output
        batch_loss = loss.item()  # Scalar loss of this batch
        loss_train_total += batch_loss  # Accumulating training loss
        loss.backward()  # Backpropagation

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Clipping gradients to prevent explosion

        optimizer.step()  # Optimizer step
        scheduler.step()  # Scheduler step

        # Show the batch loss as-is. `batch` is the (input_ids, attention_mask,
        # labels) tuple, so len(batch) is always 3 -- not the batch size -- and
        # dividing by it only scaled the displayed value by a constant.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')  # Saving model after each epoch

    tqdm.write(f'\nEpoch {epoch}')  # Printing current epoch

    loss_train_avg = loss_train_total/len(dataloader_train)  # Average training loss per batch
    tqdm.write(f'Training loss: {loss_train_avg}')  # Printing training loss

    val_loss, predictions, true_vals = evaluate(dataloader_validation)  # Evaluating on validation set
    val_f1 = f1_score_func(predictions, true_vals)  # Calculating F1 score
    tqdm.write(f'Validation loss: {val_loss}')  # Printing validation loss
    tqdm.write(f'F1 Score (Weighted): {val_f1}')  # Printing F1 score

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.8727099262177944
Validation loss: 0.6960924778665815
F1 Score (Weighted): 0.6953185953656175


Epoch 2:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.5890399940311909
Validation loss: 0.5608999729156494
F1 Score (Weighted): 0.7776803696809484


Epoch 3:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.41431066431105135
Validation loss: 0.5553505931581769
F1 Score (Weighted): 0.7840917444681321


Epoch 4:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.30998623576015233
Validation loss: 0.576773824436324
F1 Score (Weighted): 0.8528883914115875


Epoch 5:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.23277693546842784
Validation loss: 0.5301558460508075
F1 Score (Weighted): 0.8485387351167465


Epoch 6:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.17872437173500658
Validation loss: 0.5902683181422097
F1 Score (Weighted): 0.8376312688230505


Epoch 7:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.15292004803195597
Validation loss: 0.5787305299724851
F1 Score (Weighted): 0.8431057420214566


In [12]:
# Rebuild the classifier architecture, then overwrite its weights with a
# fine-tuned checkpoint written by the training loop.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.to(device)

# Epoch 4 has the highest weighted F1 in the recorded training log.
BEST_EPOCH = 4

# map_location=device loads the tensors onto the device the model lives on
# (instead of always the CPU); weights_only=True restricts unpickling to
# tensor data, which is all a state_dict contains.
state_dict = torch.load(f'finetuned_BERT_epoch_{BEST_EPOCH}.model',
                        map_location=device,
                        weights_only=True)
model.load_state_dict(state_dict)

# Per-class hit counts on the validation set.
_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class: happy
Accuracy: 168/171

Class: not-relevant
Accuracy: 17/32

Class: angry
Accuracy: 8/9

Class: disgust
Accuracy: 0/1

Class: sad
Accuracy: 0/5

Class: surprise
Accuracy: 2/5

