# Exercise 3 - Emotion Recognition

## 1. Initial Set-Up

### 1.1. Import libraries



In [None]:
!pip install emoji
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd
import numpy as np
import string
import re
import emoji
from textblob import TextBlob
from collections import Counter, OrderedDict

import os
import tqdm
import nltk
nltk.download("all")
import matplotlib.pyplot as plt
import random
import time
import copy

from nltk.tokenize import word_tokenize
from collections import defaultdict
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler, SequentialSampler)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

%matplotlib inline

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Pac

### 1.2. Download fastText Word Vectors

The pretrained word vectors used in the original paper are *word2vec* (Mikolov et al., 2013). They are trained on 100 billion tokens from Google News. In this tutorial, we will use [*fastText* pretrained word vectors](https://fasttext.cc/docs/en/english-vectors.html) (Mikolov et al., 2017), trained on 600 billion tokens from Common Crawl. *fastText* is an upgraded version of *word2vec*. It outperforms other state-of-the-art methods by a large margin.

The code below downloads the fastText pretrained vectors. 

In [None]:
# Remote archive with the pretrained fastText vectors, and the local directory
# the archive is downloaded and unpacked into.
URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
FILE = "fastText"

# Skip the download when the target directory already exists.
if os.path.isdir(FILE):
    print("fastText exists.")
else:
    # IPython shell escapes: fetch the archive into FILE, then unzip it there.
    !wget -P $FILE $URL
    !unzip $FILE/crawl-300d-2M.vec.zip -d $FILE

fastText exists.


### 1.4. Set up GPU for Training

In [None]:
# Choose the compute device once; the training code moves the model and every
# batch onto `device`.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


### 1.5. Set seed for reproducibility

In [None]:
def set_seed(seed_value=42):
    """Seed the random number generators used in this notebook.

    Seeds Python's `random`, NumPy and PyTorch (via `torch.manual_seed` and
    `torch.cuda.manual_seed_all`) with the same `seed_value`.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)


# Seed once with the default value so the cells below are repeatable.
set_seed()

## 2. Data Preparation and Preprocessing

### 2.1. Fetch the data from the github repository

This is the class to import and preprocess both training and test(ing) data. 

In order to clean up the data, for each text, we decided to do the following steps:


1.   Remove the HTML tags
2.   Convert all letters to lower case
3.   Convert all emojis to words (we decided to do this because emojis may carry decisive information about the different emotions)
4.   Remove all digits
5.   Remove all punctuation



In [None]:
class Dataset():
    """Download the text/label files under `task_url` and build two-class datasets.

    `load_data` reads the train/val/test text and label files plus the label
    mapping; `generate_dataset` keeps only two chosen classes, cleans the
    texts and remaps the two class ids to the labels 0 and 1.
    """

    # Non-greedy pattern for anything that looks like an HTML tag; compiled once
    # instead of on every call.
    _HTML_TAG_RE = re.compile('<.*?>')

    def __init__(self,task_url):
        """Store the base location; every data attribute starts as None until loaded."""
        self.task_url=task_url
        self.train_df_full=None
        self.val_df_full=None
        self.test_df_full=None
        self.mapping_dict=None
        self.train_texts=None
        self.val_texts=None
        self.test_texts=None
        self.train_labels=None
        self.val_labels=None
        self.test_labels=None
        self.label_meaning=None

    def _read_column(self, filename, column):
        """Read `task_url/filename` into a one-column DataFrame named `column`."""
        # NOTE(review): sep="/n" is the literal two-character string slash-n, not
        # a newline. Presumably it was chosen so that each line is read as one
        # field; a line that itself contains "/n" would be split. Confirm before
        # changing, since this decides how the raw files are parsed.
        return pd.read_csv(self.task_url+"/"+filename, sep ="/n", engine ='python', header = None, names=[column])

    # Load the data from the github project
    def load_data(self):
        """Read all splits and the label mapping, then print the available classes."""
        frames = {}
        for split in ('train', 'val', 'test'):
            text_df = self._read_column(split+"_text.txt", 'text')
            label_df = self._read_column(split+"_labels.txt", 'label')
            # Texts and labels are paired by row position.
            frames[split] = pd.concat([text_df, label_df], axis=1)
        self.train_df_full = frames['train']
        self.val_df_full = frames['val']
        self.test_df_full = frames['test']

        mapping_df=pd.read_csv(self.task_url+"/mapping.txt", sep ="\t", header = None, names=['label','meaning'])
        self.mapping_dict={}
        for label, meaning in zip(mapping_df['label'], mapping_df['meaning']):
            # setdefault keeps the first meaning if a label is listed twice.
            self.mapping_dict.setdefault(label, meaning)

        avaliable_str="".join(str(k)+":"+meaning+" " for k, meaning in self.mapping_dict.items())
        print("Classes avaliable: "+avaliable_str)

    # Text preprocessing functions

    # Remove any html tags
    def remove_html_tags(self, text):
        """Return `text` with every `<...>` span removed."""
        return self._HTML_TAG_RE.sub(r'', text)

    # Convert all characters to lower-case
    def convert_lowercase(self, text):
        """Return `text` in lower case."""
        return text.lower()

    # Remove all digits
    def remove_numbers(self, text):
        """Return `text` with every run of digits removed."""
        return re.sub(r"\d+", "", text)

    # Remove all punctuation
    def remove_punc(self, text):
        """Drop punctuation characters from `text`.

        A punctuation character is replaced by a single space unless it is the
        first character or the character before it (in the input) is a space,
        in which case it is simply dropped.
        """
        kept = []
        for i, ch in enumerate(text):
            if ch not in string.punctuation:
                kept.append(ch)
            elif i != 0 and text[i-1] != " ":
                kept.append(" ")
        return "".join(kept)

    # Convert all emojis to text
    def convert_emoji(self, text):
        """Return `text` with emojis replaced by their textual names."""
        return emoji.demojize(text)

    # Check for any spelling mistakes
    def spelling_correction(self, text):
        """Return `text` after TextBlob spelling correction."""
        textblob_ = TextBlob(text)
        return textblob_.correct().string

    # Apply the functions above
    def preprocessing(self, df):
        """Clean a Series of texts and return the cleaned Series.

        Order matters: emojis are turned into words before digits and
        punctuation are stripped. `spelling_correction` is not part of the
        pipeline (it was disabled in the original code).
        """
        steps = (
            self.remove_html_tags,
            self.convert_lowercase,
            self.convert_emoji,
            self.remove_numbers,
            self.remove_punc,
        )
        for step in steps:
            df = df.apply(step)
        return df

    def _select_classes(self, df, class1, class2):
        """Keep the rows of two classes, shuffle them and clean their texts."""
        selected = pd.concat([df[df['label']==class1], df[df['label']==class2]]).sample(frac=1).reset_index(drop=True)
        selected['text'] = self.preprocessing(selected['text'])
        return selected

    def generate_dataset(self, class1, class2):
        """Build train/val/test arrays restricted to two classes.

        Args:
            class1: class id (key of the label mapping) that becomes label 0.
            class2: class id that becomes label 1.

        Returns:
            A tuple `(train_texts, train_labels, val_texts, val_labels,
            test_texts, test_labels, label_meaning)`, where `label_meaning`
            maps 0/1 to the class names.
        """
        # The files are re-read on every call, as in the original code.
        self.load_data()
        print("Selected classes: "+str(self.mapping_dict[class1])+", "+str(self.mapping_dict[class2]))
        print("Labels: 0, 1")

        # Shuffle in train, val, test order so the random stream is consumed
        # in the same order as before.
        train_df_selected = self._select_classes(self.train_df_full, class1, class2)
        val_df_selected = self._select_classes(self.val_df_full, class1, class2)
        test_df_selected = self._select_classes(self.test_df_full, class1, class2)

        transfer_dict={class1:0,class2:1}
        self.train_texts = train_df_selected['text'].values
        self.val_texts = val_df_selected['text'].values
        self.test_texts = test_df_selected['text'].values
        self.train_labels = self.replace_with_dict(train_df_selected['label'].values, transfer_dict)
        self.val_labels = self.replace_with_dict(val_df_selected['label'].values, transfer_dict)
        self.test_labels = self.replace_with_dict(test_df_selected['label'].values, transfer_dict)
        self.label_meaning={0:self.mapping_dict[class1],1:self.mapping_dict[class2]}
        return self.train_texts,self.train_labels,self.val_texts,self.val_labels,self.test_texts,self.test_labels,self.label_meaning

    def replace_with_dict(self, ar, dic):
        """Map every element of array `ar` through `dic` and return a new array.

        Every value in `ar` is expected to be a key of `dic`; the lookup is a
        vectorised binary search over the sorted keys.
        """
        k = np.array(list(dic.keys()))
        v = np.array(list(dic.values()))
        sidx = k.argsort()
        return v[sidx[np.searchsorted(k,ar,sorter=sidx)]]

The first dataset contains the *anger (label 0)* and *joy (label 1)* emotions, and the second contains *anger (label 0)* and *optimism (label 1)* emotions:

In [None]:
# Base location of the text, label and mapping files read by Dataset.load_data.
task_url = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion"
dataset = Dataset(task_url)
# Dataset 1 uses class ids 0 and 1, dataset 2 uses class ids 0 and 2; in both
# cases the first id is remapped to label 0 and the second to label 1.
train_texts_1,train_labels_1,val_texts_1,val_labels_1,test_texts_1,test_labels_1,label_meaning_1 = dataset.generate_dataset(0,1)
train_texts_2,train_labels_2,val_texts_2,val_labels_2,test_texts_2,test_labels_2,label_meaning_2 = dataset.generate_dataset(0,2)

Classes avaliable: 0:anger 1:joy 2:optimism 3:sadness 
Selected classes: anger, joy
Labels: 0, 1
Classes avaliable: 0:anger 1:joy 2:optimism 3:sadness 
Selected classes: anger, optimism
Labels: 0, 1


### 2.2. Create the training and test(ing) data loader for the model.


To prepare our text data for training, we need to tokenize our sentences and build a vocabulary dictionary word2idx. The dictionary will later be used to convert our tokens into indexes and build an embedding layer.

Below are the steps we perform:


1.   Tokenize

> The function "tokenize" tokenizes our sentences in order to build a vocabulary and find the maximum sentence length. The function "encode_train" takes the output of "tokenize", pads every sentence to that maximum length and stores the resulting input_ids as a numpy array in the class.

2.   Load Pretrained Vectors

> We load the pretrained vectors for each token in our vocabulary. For tokens without pretrained vectors, we initialize random word vectors with the same length and variance.


3.   Create PyTorch DataLoader for Train, Validation and Test Datasets

> We create an iterator for our dataset using the torch DataLoader class. This will help us minimize memory use during training and boost the training speed. The sentences are tokenized and converted into index vectors according to the saved *word2idx* dictionary. If a test sentence happens to be n words longer than the maximum length, it is shortened by dropping its last n words.





In [None]:
class TextVector():
    """Turn raw sentences into padded index tensors, an embedding matrix and DataLoaders.

    The vocabulary (`word2idx`) and the padding length (`max_len`) are built
    from the training and validation texts together. Index 0 is reserved for
    `<pad>` and index 1 for `<unk>`.
    """

    def __init__(self,train_texts,val_texts,train_labels,val_labels):
        """Store the splits and the concatenated train+val arrays."""
        self.train_texts=train_texts
        self.train_labels=train_labels
        self.val_texts=val_texts
        self.val_labels=val_labels
        self.train_texts_full=np.hstack([train_texts,val_texts])
        self.train_labels_full=np.hstack([train_labels,val_labels])
        self.tokenized_texts=[]
        self.word2idx={}
        self.max_len=0
        self.input_ids=None
        self.embeddings=None
        self.train_dataloader=None
        self.val_dataloader=None
        self.test_dataloader=None

    def tokenize(self):
        """Tokenize the train+val texts, building `word2idx` and `max_len`."""
        self.word2idx['<pad>'] = 0
        self.word2idx['<unk>'] = 1

        # Start from a clean list so that running this twice does not store
        # every sentence twice.
        self.tokenized_texts = []
        # Continue numbering after the entries already present (2 on first run).
        next_idx = len(self.word2idx)
        for sent in self.train_texts_full:
            tokenized_sent = word_tokenize(sent)
            self.tokenized_texts.append(tokenized_sent)

            for token in tokenized_sent:
                if token not in self.word2idx:
                    self.word2idx[token] = next_idx
                    next_idx += 1

            self.max_len = max(self.max_len, len(tokenized_sent))

    def encode_train(self):
        """Pad the stored token lists to `max_len` and store their ids in `input_ids`.

        The token lists in `tokenized_texts` are padded in place.
        """
        unk_id = self.word2idx['<unk>']
        input_ids = []
        for tokenized_sent in self.tokenized_texts:
            tokenized_sent += ['<pad>'] * (self.max_len - len(tokenized_sent))
            input_ids.append([self.word2idx.get(token, unk_id) for token in tokenized_sent])

        self.input_ids=np.array(input_ids)

    def load_pretrained_vectors(self, fname):
        """Fill `self.embeddings` with vectors from the text file `fname`.

        The first line of the file must hold two integers (number of vectors
        and dimension `d`); every further line is a word followed by its
        space-separated components. Words missing from the file keep a random
        vector drawn uniformly from [-0.25, 0.25]; `<pad>` is all zeros.
        """
        # The context manager closes the file even if parsing fails.
        with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
            n, d = map(int, fin.readline().split())

            # Initialize random embeddings
            self.embeddings = np.random.uniform(-0.25, 0.25, (len(self.word2idx), d))
            self.embeddings[self.word2idx['<pad>']] = np.zeros((d,))

            # Load pretrained vectors
            count = 0
            for line in tqdm.notebook.tqdm(fin):
                tokens = line.rstrip().split(' ')
                word = tokens[0]
                if word in self.word2idx:
                    count += 1
                    self.embeddings[self.word2idx[word]] = np.array(tokens[1:], dtype=np.float32)

        print(f"There are {count} / {len(self.word2idx)} pretrained vectors found.")

    def _encode_texts(self, texts, report_truncation=False):
        """Tokenize `texts` and return one list of `max_len` token ids per sentence.

        Short sentences are padded with `<pad>`, long ones are cut to
        `max_len`, and unknown tokens map to `<unk>`.
        """
        unk_id = self.word2idx['<unk>']
        encoded = []
        for sent in texts:
            tokens = word_tokenize(sent)
            missing = self.max_len - len(tokens)
            if missing >= 0:
                tokens = tokens + ['<pad>'] * missing
            else:
                tokens = tokens[:self.max_len]
                if report_truncation:
                    print("Some words are deleted")
            encoded.append([self.word2idx.get(token, unk_id) for token in tokens])
        return encoded

    def _build_loader(self, inputs, labels, batch_size):
        """Wrap encoded inputs and labels in a randomly sampled DataLoader."""
        # NOTE(review): validation and test loaders are also sampled randomly,
        # as in the original code; only the batch order is affected.
        data = TensorDataset(torch.tensor(inputs), torch.tensor(labels))
        return DataLoader(data, sampler=RandomSampler(data), batch_size=batch_size)

    def create_train_data_loader(self, batch_size=50):
        """Encode the training texts and return (and store) their DataLoader."""
        train_inputs = self._encode_texts(self.train_texts)
        self.train_dataloader = self._build_loader(train_inputs, self.train_labels, batch_size)
        return self.train_dataloader

    def create_val_data_loader(self, batch_size=50):
        """Encode the validation texts and return (and store) their DataLoader."""
        val_inputs = self._encode_texts(self.val_texts)
        self.val_dataloader = self._build_loader(val_inputs, self.val_labels, batch_size)
        return self.val_dataloader

    def create_data_loader(self, pretrained_model_name, batch_size=50):
        """Run the whole pipeline for the train and validation splits.

        Args:
            pretrained_model_name: path of the pretrained vector file.
            batch_size: batch size of both loaders.

        Returns:
            `(embeddings, train_dataloader, val_dataloader)` with the
            embeddings converted to a torch tensor.
        """
        self.tokenize()
        self.encode_train()
        self.load_pretrained_vectors(pretrained_model_name)
        self.train_dataloader=self.create_train_data_loader(batch_size=batch_size)
        self.val_dataloader=self.create_val_data_loader(batch_size=batch_size)
        self.embeddings = torch.tensor(self.embeddings)
        return self.embeddings, self.train_dataloader, self.val_dataloader

    def create_test_data_loader(self, test_texts, test_labels, batch_size=50):
        """Encode unseen texts with the stored vocabulary and return their DataLoader.

        Prints "Some words are deleted" once for every sentence that is longer
        than `max_len` and therefore truncated.
        """
        test_inputs = self._encode_texts(test_texts, report_truncation=True)
        self.test_dataloader = self._build_loader(test_inputs, test_labels, batch_size)
        return self.test_dataloader

    def get_word2idx(self):
        """Return the token-to-index dictionary."""
        return self.word2idx



In [None]:
# Dataset 1: build the vocabulary, embedding matrix and train/val loaders from
# the train+val texts, then encode the test split with that same vocabulary.
textVector_1=TextVector(train_texts_1,val_texts_1,train_labels_1,val_labels_1)
embeddings_1,train_data_loader_1,val_data_loader_1=textVector_1.create_data_loader("fastText/crawl-300d-2M.vec")
test_data_loader_1 = textVector_1.create_test_data_loader(test_texts_1,test_labels_1)

0it [00:00, ?it/s]

There are 6351 / 7080 pretrained vectors found.
Some words are deleted
Some words are deleted
Some words are deleted


In [None]:
# Dataset 2: same pipeline as for dataset 1, with its own vocabulary and embeddings.
textVector_2=TextVector(train_texts_2,val_texts_2,train_labels_2,val_labels_2)
embeddings_2,train_data_loader_2,val_data_loader_2=textVector_2.create_data_loader("fastText/crawl-300d-2M.vec")
test_data_loader_2 = textVector_2.create_test_data_loader(test_texts_2,test_labels_2)

0it [00:00, ?it/s]

There are 5442 / 5953 pretrained vectors found.


## 3. Training and testing the model

### 3.1. Model structure

We built our model upon this existing model. Please check the link to see how it works. Source: [notebook](https://chriskhanhtran.github.io/posts/cnn-sentence-classification/)

In [None]:
class CNN_NLP(nn.Module):
    """Sentence classifier: embedding -> parallel Conv1d -> max-over-time pooling -> linear."""

    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 stride_sizes=[1, 1, 1],
                 num_classes=2,
                 dropout=0.5):
        """Build the layers.

        When `pretrained_embedding` is given, its shape fixes the vocabulary
        size and embedding dimension and `freeze_embedding` controls whether it
        is trained; otherwise a fresh embedding of `vocab_size` x `embed_dim`
        is created. One Conv1d is created per entry of `filter_sizes`, using
        the matching entries of `num_filters` and `stride_sizes`.
        """
        super().__init__()

        # Embedding layer
        if pretrained_embedding is None:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embed_dim, padding_idx=0, max_norm=5.0)
        else:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding, freeze=freeze_embedding)

        # Convolutional layers, one per filter size
        conv_layers = []
        for i in range(len(filter_sizes)):
            conv_layers.append(
                nn.Conv1d(in_channels=self.embed_dim, out_channels=num_filters[i], kernel_size=filter_sizes[i], stride = stride_sizes[i])
            )
        self.conv1d_list = nn.ModuleList(conv_layers)

        # Fully-connected layer and dropout
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Return the class logits, shape (b, n_classes), for a batch of token ids."""
        # (b, max_len, embed_dim)
        embedded = self.embedding(input_ids).float()

        # nn.Conv1d expects channels first: (b, embed_dim, max_len)
        channels_first = embedded.permute(0, 2, 1)

        # Convolution + ReLU for every filter size: (b, num_filters[i], L_out)
        activations = []
        for conv1d in self.conv1d_list:
            activations.append(F.relu(conv1d(channels_first)))

        # Max over the whole length, then drop the length axis: (b, num_filters[i])
        pooled = []
        for activation in activations:
            top = F.max_pool1d(activation, kernel_size=activation.shape[2])
            pooled.append(top.squeeze(dim=2))

        # (b, sum(num_filters))
        features = torch.cat(pooled, dim=1)

        # (b, n_classes)
        return self.fc(self.dropout(features))

### 3.2. Training and Testing Class

1.  Initialize the model

> We decided to use a model which contains only 1 convolutional layer. We also tried different kinds of optimizers. Besides Adadelta, we also tried SGD and Adam.


2. Training epochs

> For each epoch, the code below performs a forward step to compute the cross-entropy loss, a backward step to compute gradients, and an optimizer step to update the weights/parameters. The best model is replaced whenever the accuracy on the validation dataset improves.


3. Evaluation

> Use the best models to predict the test dataset and compare the performance.





In [None]:
class Training():
    """Create, train, validate and test a `CNN_NLP` classifier."""

    def __init__(self):
        """Start with no model, loss, optimizer or best-model snapshot."""
        self.cnn_model=None
        self.loss_fn=None
        self.optimizer=None
        self.best_cnn_model=None

    def initilize_model(self, pretrained_embedding=None,
                    freeze_embedding=False,
                    vocab_size=None,
                    embed_dim=300,
                    filter_sizes=[3, 4, 5],
                    num_filters=[100, 100, 100],
                    stride_sizes=[1, 1, 1],
                    num_classes=2,
                    dropout=0.5,
                    optimizer="Adadelta",
                    learning_rate=0.01):
        """Instantiate the CNN model, the optimizer and the loss function.

        Args:
            pretrained_embedding, freeze_embedding, vocab_size, embed_dim,
            filter_sizes, num_filters, stride_sizes, num_classes, dropout:
                forwarded unchanged to `CNN_NLP`.
            optimizer: one of "Adam", "Adadelta", "SGD", "SGD_Momentum".
            learning_rate: learning rate of the chosen optimizer.

        Raises:
            AssertionError: if `filter_sizes` and `num_filters` differ in length.
            KeyError: if `optimizer` is not one of the supported names.
        """
        assert (len(filter_sizes) == len(num_filters)), (
            "filter_sizes and num_filters need to be of the same length."
        )

        # Instantiate CNN model. `num_classes` and `dropout` are passed through;
        # they used to be hard-coded to 2 and 0.5, silently ignoring the caller.
        self.cnn_model = CNN_NLP(pretrained_embedding=pretrained_embedding,
                        freeze_embedding=freeze_embedding,
                        vocab_size=vocab_size,
                        embed_dim=embed_dim,
                        filter_sizes=filter_sizes,
                        num_filters=num_filters,
                        stride_sizes=stride_sizes,
                        num_classes=num_classes,
                        dropout=dropout)

        # Send model to `device` (GPU/CPU)
        self.cnn_model.to(device)

        # Only the requested optimizer is constructed.
        optimizer_factories = {
            "Adam": lambda: optim.Adam(self.cnn_model.parameters(), lr=learning_rate),
            "Adadelta": lambda: optim.Adadelta(self.cnn_model.parameters(), lr=learning_rate, rho=0.95),
            "SGD": lambda: optim.SGD(self.cnn_model.parameters(), lr=learning_rate),
            "SGD_Momentum": lambda: optim.SGD(self.cnn_model.parameters(), lr=learning_rate, momentum=0.8),
        }

        self.optimizer = optimizer_factories[optimizer]()
        self.loss_fn=nn.CrossEntropyLoss()

    def train(self, train_dataloader, val_dataloader=None, epochs=10):
        """Train the CNN model and keep a snapshot of the best epoch.

        After every epoch the model is evaluated on `val_dataloader` (when
        given); whenever the validation accuracy improves, an independent copy
        of the model is stored in `best_cnn_model`. A progress row is printed
        every 10th epoch.
        """
        # Tracking best validation accuracy
        best_accuracy = 0

        print("Start training...\n")
        print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*60)

        for epoch_i in range(epochs):
            # ---------------- Training ----------------
            t0_epoch = time.time()
            total_loss = 0

            # Training mode enables dropout.
            self.cnn_model.train()

            for batch in train_dataloader:
                b_input_ids, b_labels = tuple(t.to(device) for t in batch)

                # Zero out any previously calculated gradients
                self.cnn_model.zero_grad()

                logits = self.cnn_model(b_input_ids)
                loss = self.loss_fn(logits, b_labels)
                total_loss += loss.item()

                loss.backward()
                self.optimizer.step()

            # Average loss over the batches of this epoch
            avg_train_loss = total_loss / len(train_dataloader)

            # ---------------- Evaluation ----------------
            if val_dataloader is not None:
                val_loss, val_accuracy, _ = self.evaluate(val_dataloader)

                if val_accuracy > best_accuracy:
                    best_accuracy = val_accuracy
                    # Deep copy: a shallow copy would share its parameters with
                    # the live model and change as training continues.
                    self.best_cnn_model = copy.deepcopy(self.cnn_model)

                time_elapsed = time.time() - t0_epoch
                if (epoch_i+1)%10==0:
                    print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.4f} | {time_elapsed:^9.2f}")

        # Without a validation set (or if accuracy never rose above 0) there is
        # no snapshot yet; fall back to the final model so prediction works.
        if self.best_cnn_model is None:
            self.best_cnn_model = copy.deepcopy(self.cnn_model)

        print("\n")
        print(f"Training complete! Best accuracy: {best_accuracy:.4f}.")

    def _collect_predictions(self, model, dataloader):
        """Run `model` over `dataloader` in eval mode.

        Returns the mean batch loss and two plain Python lists: the true labels
        and the predicted labels.
        """
        # Evaluation mode disables dropout.
        model.eval()

        losses = []
        targets = []
        predictions = []
        for batch in dataloader:
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                logits = model(b_input_ids)
                loss = self.loss_fn(logits, b_labels)
            losses.append(loss.item())

            # Move to the CPU and convert to Python ints before handing the
            # values to scikit-learn.
            predictions.extend(torch.argmax(logits, dim=1).flatten().cpu().tolist())
            targets.extend(b_labels.cpu().tolist())

        return np.mean(losses), targets, predictions

    def evaluate(self, val_dataloader):
        """Evaluate the current model on `val_dataloader`.

        Returns:
            `(val_loss, val_accuracy, val_f1)`.
        """
        val_loss, targets, predictions = self._collect_predictions(self.cnn_model, val_dataloader)
        val_accuracy = accuracy_score(targets, predictions)
        val_f1 = f1_score(targets, predictions)

        return val_loss, val_accuracy, val_f1

    def predict_accuracy(self, test_dataloader):
        """Evaluate the best model on `test_dataloader`, print the scores and return the accuracy."""
        _, targets, predictions = self._collect_predictions(self.best_cnn_model, test_dataloader)
        test_accuracy = accuracy_score(targets, predictions)
        test_f1 = f1_score(targets, predictions, average='macro')

        print(f"test accuracy: {test_accuracy:.4f} and test f1 score: {test_f1:.4f}" )
        print("\n")
        return test_accuracy

    def get_best_model(self):
        """Return the stored best-model snapshot (None before training)."""
        return self.best_cnn_model

Here's the performance of the different combinations of hyperparameters. For each combination there's a table describing the settings it uses.

### 3.3. Performance of different combinations of hyperparameters

The first and second variations of hyper-parameters.

|Hyperparameters         |Values           |
|:------------------:|:---------------:|
|optimizer  |Adam         |
|learning rate      |0.01              |
|dropout rate        |0              |
|number of filters         |1  |
|strides         |1  |
|filter sizes        |3        |
|pooling             |1-max pooling    |
|batch size        |50              |
|number of epochs        |100              |
<br>

|Hyperparameters         |Values           |
|:------------------:|:---------------:|
|optimizer  |Adam         |
|learning rate      |0.01              |
|dropout rate        |0              |
|number of filters         |100  |
|strides         |1  |
|filter sizes        |3        |
|pooling             |1-max pooling    |
|batch size        |50              |
|number of epochs        |100              |

In [None]:
# Variation 1: Adam, one filter of size 3, frozen pretrained embeddings (dataset 1).
training_1=Training()
training_1.initilize_model(pretrained_embedding=embeddings_1, freeze_embedding=True, filter_sizes=[3], num_filters=[1], stride_sizes=[1], optimizer="Adam", learning_rate=0.01, dropout=0)
training_1.train(train_data_loader_1, val_data_loader_1, epochs=100)
training_1.predict_accuracy(test_data_loader_1)
# Variation 2: same settings with 100 filters.
training_2=Training()
training_2.initilize_model(pretrained_embedding=embeddings_1, freeze_embedding=True, filter_sizes=[3], num_filters=[100], stride_sizes=[1], optimizer="Adam", learning_rate=0.01, dropout=0)
training_2.train(train_data_loader_1, val_data_loader_1, epochs=100)
training_2.predict_accuracy(test_data_loader_1)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
  10    |   0.510801   |  0.453511  |  0.7860   |   0.53   
  20    |   0.489540   |  0.478497  |  0.7704   |   0.57   
  30    |   0.474236   |  0.485172  |  0.7782   |   0.54   
  40    |   0.456524   |  0.521283  |  0.7860   |   0.53   
  50    |   0.483335   |  0.546143  |  0.7821   |   0.54   
  60    |   0.467481   |  0.563923  |  0.7782   |   0.56   
  70    |   0.473557   |  0.537676  |  0.7782   |   0.54   
  80    |   0.458284   |  0.564255  |  0.7743   |   0.52   
  90    |   0.460836   |  0.623912  |  0.7782   |   0.59   
  100   |   0.486665   |  0.567716  |  0.7626   |   0.55   


Training complete! Best accuracy: 0.7977.
test accuracy: 0.6157 and test f1 score: 0.6157


Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
  10    |   0.071144   |  

0.8820960698689956

The third, fourth and fifth variations of hyper-parameters.
<br>

|Hyperparameters         |Values           |
|:------------------:|:---------------:|
|optimizer  |SGD with Momentum         |
|learning rate      |0.01              |
|dropout rate        |0              |
|number of filters         |100  |
|strides         |1  |
|filter sizes        |3        |
|pooling             |1-max pooling    |
|batch size        |50              |
|number of epochs        |100              |
<br>

|Hyperparameters         |Values           |
|:------------------:|:---------------:|
|optimizer  |SGD         |
|learning rate      |0.01              |
|dropout rate        |0              |
|number of filters         |100  |
|strides         |1  |
|filter sizes        |3        |
|pooling             |1-max pooling    |
|batch size        |50              |
|number of epochs        |100              |
<br>

|Hyperparameters         |Values           |
|:------------------:|:---------------:|
|optimizer  |Adadelta        |
|learning rate      |0.01              |
|dropout rate        |0              |
|number of filters         |100  |
|strides         |1  |
|filter sizes        |3        |
|pooling             |1-max pooling    |
|batch size        |50              |
|number of epochs        |100              |

In [None]:
# Variations 3-5 differ only in the optimizer; all use 100 filters of size 3,
# stride 1 and frozen pretrained embeddings (dataset 1).
# Variation 3: SGD with momentum.
training_3=Training()
training_3.initilize_model(pretrained_embedding=embeddings_1, freeze_embedding=True, filter_sizes=[3], num_filters=[100], stride_sizes=[1], optimizer="SGD_Momentum", learning_rate=0.01, dropout=0)
training_3.train(train_data_loader_1, val_data_loader_1, epochs=100)
training_3.predict_accuracy(test_data_loader_1)
# Variation 4: plain SGD.
training_4=Training()
training_4.initilize_model(pretrained_embedding=embeddings_1, freeze_embedding=True, filter_sizes=[3], num_filters=[100], stride_sizes=[1], optimizer="SGD", learning_rate=0.01, dropout=0)
training_4.train(train_data_loader_1, val_data_loader_1, epochs=100)
training_4.predict_accuracy(test_data_loader_1)
# Variation 5: Adadelta.
training_5=Training()
training_5.initilize_model(pretrained_embedding=embeddings_1, freeze_embedding=True, filter_sizes=[3], num_filters=[100], stride_sizes=[1], optimizer="Adadelta", learning_rate=0.01, dropout=0)
training_5.train(train_data_loader_1, val_data_loader_1, epochs=100)
training_5.predict_accuracy(test_data_loader_1)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
  10    |   0.217044   |  0.228026  |  0.8911   |   1.52   
  20    |   0.112095   |  0.201087  |  0.9027   |   1.47   
  30    |   0.064714   |  0.178123  |  0.9066   |   2.48   
  40    |   0.042130   |  0.190649  |  0.9066   |   1.50   
  50    |   0.027740   |  0.183583  |  0.9105   |   1.50   
  60    |   0.023786   |  0.321535  |  0.9066   |   1.48   
  70    |   0.016630   |  0.189908  |  0.9066   |   1.52   
  80    |   0.016877   |  0.198176  |  0.9066   |   1.47   
  90    |   0.013401   |  0.299020  |  0.9105   |   1.49   
  100   |   0.010439   |  0.206971  |  0.9066   |   1.48   


Training complete! Best accuracy: 0.9183.
test accuracy: 0.9148 and test f1 score: 0.8904


Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
  10    |   0.560705   |  

0.8951965065502183

The sixth variation of hyper-parameters. Compared with the third variation, only the stride changes (3 instead of 1).

|Hyperparameters         |Values           |
|:------------------:|:---------------:|
|optimizer  |SGD with Momentum        |
|learning rate      |0.01              |
|dropout rate        |0              |
|number of filters         |100  |
|strides         |3  |
|filter sizes        |3        |
|pooling             |1-max pooling    |
|batch size        |50              |
|number of epochs        |100              |

In [None]:
# Variation 6: same settings as variation 3 except stride_sizes=[3].
training_6 = Training()
training_6.initilize_model(
    pretrained_embedding=embeddings_1,
    freeze_embedding=True,
    filter_sizes=[3],
    num_filters=[100],
    stride_sizes=[3],
    optimizer="SGD_Momentum",
    learning_rate=0.01,
    dropout=0,
)
training_6.train(train_data_loader_1, val_data_loader_1, epochs=100)
# Kept as the cell's final expression so the notebook displays its return value.
training_6.predict_accuracy(test_data_loader_1)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
  10    |   0.263981   |  0.284764  |  0.8638   |   0.93   
  20    |   0.122316   |  0.269566  |  0.8755   |   0.93   
  30    |   0.069812   |  0.259845  |  0.8716   |   0.91   
  40    |   0.046632   |  0.230985  |  0.8833   |   0.92   
  50    |   0.034384   |  0.325237  |  0.8872   |   0.90   
  60    |   0.023573   |  0.276206  |  0.8794   |   0.92   
  70    |   0.019636   |  0.265333  |  0.8794   |   0.91   
  80    |   0.016959   |  0.276330  |  0.8755   |   0.93   
  90    |   0.013966   |  0.315720  |  0.8677   |   0.91   
  100   |   0.012474   |  0.276041  |  0.8716   |   1.22   


Training complete! Best accuracy: 0.8949.
test accuracy: 0.9007 and test f1 score: 0.8745




0.9006550218340611

The seventh variation of hyper-parameters. Compared with the third variation, only the filter size changes (7 instead of 3).

|Hyperparameters         |Values           |
|:------------------:|:---------------:|
|optimizer  |SGD with Momentum        |
|learning rate      |0.01              |
|dropout rate        |0              |
|number of filters         |100  |
|strides         |1  |
|filter sizes        |7        |
|pooling             |1-max pooling    |
|batch size        |50              |
|number of epochs        |100              |

In [None]:
# Variation 7: same settings as variation 3 except filter_sizes=[7].
training_7 = Training()
training_7.initilize_model(
    pretrained_embedding=embeddings_1,
    freeze_embedding=True,
    filter_sizes=[7],
    num_filters=[100],
    stride_sizes=[1],
    optimizer="SGD_Momentum",
    learning_rate=0.01,
    dropout=0,
)
training_7.train(train_data_loader_1, val_data_loader_1, epochs=100)
# Kept as the cell's final expression so the notebook displays its return value.
training_7.predict_accuracy(test_data_loader_1)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
  10    |   0.161837   |  0.290545  |  0.8677   |   2.47   
  20    |   0.057989   |  0.290964  |  0.8949   |   3.07   
  30    |   0.028067   |  0.205796  |  0.9027   |   3.33   
  40    |   0.018525   |  0.330657  |  0.8911   |   2.99   
  50    |   0.013823   |  0.224414  |  0.9027   |   2.49   
  60    |   0.010446   |  0.215805  |  0.9027   |   2.47   
  70    |   0.008988   |  0.244130  |  0.8988   |   2.43   
  80    |   0.006654   |  0.220599  |  0.9066   |   3.36   
  90    |   0.005145   |  0.236821  |  0.9066   |   2.43   
  100   |   0.005987   |  0.241725  |  0.9066   |   2.45   


Training complete! Best accuracy: 0.9144.
test accuracy: 0.8985 and test f1 score: 0.8696




0.898471615720524

The eighth and final variation of hyper-parameters. Compared with the third variation, only the dropout rate changes (0.2 instead of 0).

|Hyperparameters         |Values           |
|:------------------:|:---------------:|
|optimizer  |SGD with Momentum         |
|learning rate      |0.01              |
|dropout rate        |0.2              |
|number of filters         |100  |
|strides         |1  |
|filter sizes        |3        |
|pooling             |1-max pooling    |
|batch size        |50              |
|number of epochs        |100              |

In [None]:
# Variation 8: same settings as variation 3 except dropout=0.2.
training_8 = Training()
training_8.initilize_model(
    pretrained_embedding=embeddings_1,
    freeze_embedding=True,
    filter_sizes=[3],
    num_filters=[100],
    stride_sizes=[1],
    optimizer="SGD_Momentum",
    learning_rate=0.01,
    dropout=0.2,
)
training_8.train(train_data_loader_1, val_data_loader_1, epochs=100)
# Kept as the cell's final expression so the notebook displays its return value.
training_8.predict_accuracy(test_data_loader_1)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
  10    |   0.225622   |  0.236081  |  0.9027   |   2.14   
  20    |   0.110173   |  0.237622  |  0.9066   |   1.49   
  30    |   0.065563   |  0.222239  |  0.9027   |   1.50   
  40    |   0.045893   |  0.242006  |  0.9027   |   1.52   
  50    |   0.035183   |  0.189779  |  0.8949   |   1.50   
  60    |   0.020420   |  0.320396  |  0.9027   |   1.47   
  70    |   0.020440   |  0.350254  |  0.8949   |   1.48   
  80    |   0.016241   |  0.205847  |  0.9027   |   1.53   
  90    |   0.015839   |  0.283696  |  0.8988   |   1.53   
  100   |   0.014711   |  0.265655  |  0.9027   |   1.51   


Training complete! Best accuracy: 0.9105.
test accuracy: 0.9083 and test f1 score: 0.8827




0.9082969432314411

Train a model with the best-performing combination (variation 3: SGD with Momentum, learning rate 0.01, no dropout, 100 filters of size 3, stride 1) on the second dataset:

In [None]:
# Re-run the variation-3 hyper-parameters on the second dataset's loaders.
# NOTE(review): this run passes embeddings_1 together with the *_2 data
# loaders; confirm that the second dataset's token indices match the
# embedding matrix built for the first dataset.
training_9 = Training()
training_9.initilize_model(
    pretrained_embedding=embeddings_1,
    freeze_embedding=True,
    filter_sizes=[3],
    num_filters=[100],
    stride_sizes=[1],
    optimizer="SGD_Momentum",
    learning_rate=0.01,
    dropout=0,
)
training_9.train(train_data_loader_2, val_data_loader_2, epochs=100)
# Kept as the cell's final expression so the notebook displays its return value.
training_9.predict_accuracy(test_data_loader_2)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
  10    |   0.401124   |  0.397634  |  0.8511   |   1.76   
  20    |   0.262505   |  0.374344  |  0.8457   |   1.79   
  30    |   0.144861   |  0.366655  |  0.8564   |   1.72   
  40    |   0.081940   |  0.381896  |  0.8564   |   1.73   
  50    |   0.049589   |  0.389657  |  0.8617   |   1.78   
  60    |   0.041792   |  0.407975  |  0.8617   |   1.76   
  70    |   0.028881   |  0.434633  |  0.8617   |   1.75   
  80    |   0.027297   |  0.399348  |  0.8670   |   1.75   
  90    |   0.019540   |  0.447802  |  0.8670   |   1.74   
  100   |   0.018514   |  0.420162  |  0.8670   |   1.77   


Training complete! Best accuracy: 0.8670.
test accuracy: 0.8443 and test f1 score: 0.2933




0.8443465491923642