### 5. Short-list promising models
We expect you to do some additional research and train at **least one model per team member**.

1. Train mainly quick and dirty models from different categories (e.g. linear, SVM, Random Forests etc) using default parameters
2. Measure and compare their performance
3. Analyse the most significant variables for each algorithm
4. Analyse the types of errors the models make
5. Have a quick round of feature selection and engineering if necessary
6. Have one or two more quick iterations of the five previous steps
7. Short-list the top three to five most promising models, preferring models that make different types of errors

In [6]:
# installing dependencies
!pip3 install pandas keras torch tqdm

# !pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
!pip3 install torch torchvision torchaudio

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [7]:
# importing the data as dataframes
import pandas as pd

emotions_df_train_ready = pd.read_csv('../data/emotions_train_ready.csv')

pd.set_option('display.max_columns', None)

##### just to not forget the mapping

- 0: 'sadness'
- 1: 'joy'
- 2: 'love'
- 3: 'anger'
- 4: 'fear'
- 5: 'surprise'
- 6: 'neutral'

In [8]:
emotions_df_train_ready.value_counts()

Unnamed: 0  text                                                                                                                                                                                     label
0           Yeah, but nobody takes McD's hamburgers seriously.                                                                                                                                       1        1
399325      That's no hero to me.                                                                                                                                                                    0        1
399309      i didnt grab anything too small and i could still feel how tender it was but positive movement is a very good thing                                                                      2        1
399310      i feel pretty good considering                                                                                                                                   

In [9]:
import re

# Custom tokenization using regular expressions
def custom_tokenize(text):
    # Regular expression for splitting on whitespace and keeping punctuation
    tokens = re.findall(r"\b\w+\b|[!?.]", text)
    return tokens

# Applying custom tokenization to the entire dataset
tokenized_texts = [custom_tokenize(text) for text in emotions_df_train_ready['text']]

# Example of tokenized text
tokenized_texts_example = tokenized_texts[0]
tokenized_texts_example

['Yeah', 'but', 'nobody', 'takes', 'McD', 's', 'hamburgers', 'seriously', '.']

In [10]:
from collections import Counter

# Flatten the list of tokenized texts
all_tokens = [token for text in tokenized_texts for token in text]
vocabulary = set(all_tokens)

# Mapping tokens to integers
token_to_int = {token: i+1 for i, token in enumerate(vocabulary)}


In [11]:
# Define the maximum vocabulary size
max_vocab_size = 10000  # You can adjust this number as needed

# Count the frequency of each token
token_counts = Counter(all_tokens)

# Keep only the most frequent tokens
most_common_tokens = [token for token, count in token_counts.most_common(max_vocab_size)]

# Create a new token-to-integer mapping, including a token for unknown words
token_to_int = {token: i+1 for i, token in enumerate(most_common_tokens)}
token_to_int["<UNK>"] = len(most_common_tokens) + 1

In [12]:
# Encode the tokenized texts using the updated mapping
encoded_sequences = []
for text in tokenized_texts:
    encoded_text = [token_to_int.get(token, token_to_int["<UNK>"]) for token in text]
    encoded_sequences.append(encoded_text)

In [13]:
import numpy as np

# Finding the maximum sequence length
max_seq_length = max(len(seq) for seq in encoded_sequences)

# Padding the sequences
padded_sequences = np.array([seq + [0]*(max_seq_length - len(seq)) for seq in encoded_sequences])

In [14]:
# Extracting the labels
labels = emotions_df_train_ready['label'].values

# One-hot encoding the labels
num_classes = len(np.unique(labels))
one_hot_labels = np.eye(num_classes)[labels]


In [15]:
from sklearn.model_selection import train_test_split

# Splitting the dataset into training and the rest
X_temp, X_test, y_temp, y_test = train_test_split(
    padded_sequences, one_hot_labels, test_size=0.2, stratify=labels, random_state=42)

# Further splitting the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42)  # 0.25 x 0.8 = 0.2



_____

# Training the model

In [24]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

In [25]:
class TransformerBlock(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        src2 = self.multihead_attn(src, src, src, attn_mask=src_mask,
                                   key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(F.relu(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src


In [26]:
import math


class TransformerModel(nn.Module):
    def __init__(self, ntoken, d_model, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.transformer_layers = nn.ModuleList([TransformerBlock(d_model, nhead, nhid, dropout) for _ in range(nlayers)])
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, num_classes)
        

        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        for mod in self.transformer_layers:
            src = mod(src, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        output = self.decoder(src)
        return output


In [27]:
ntokens = len(most_common_tokens) + 2 # size of vocabulary
d_model = 128 # embedding dimension
nhead = 4     # number of heads in multi-head attention models
nhid = 512   # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 4   # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
dropout = 0.1 # dropout probability

model = TransformerModel(ntokens, d_model, nhead, nhid, nlayers, dropout)


In [28]:
from torch.utils.data import Dataset, DataLoader

# Create a custom dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Instantiate the dataset
dataset = TextDataset(X_train, y_train)

# Create data loaders
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)


In [29]:
import torch.optim as optim

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [30]:
## Switch to M! GPU acceleration 

import torch

# Check if MPS (Apple's M1 GPU) is available and use it; otherwise, use CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

model = model.to(device).float()


Using device: mps


In [31]:
# Training

num_epochs = 15

from tqdm import tqdm


# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False):
        nputs = inputs.float()  # Ensure inputs are float32
        labels = labels.long()   # Ensure labels are long

        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Calculate average loss
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}')
    

# Save the model's state dictionary
torch.save(model.state_dict(), 'path_to_save_model.pth')



                                                                 

Epoch 1/15, Average Loss: 4.5185


                                                                 

Epoch 2/15, Average Loss: 4.8812


                                                                       

Epoch 3/15, Average Loss: 0.6640


                                                                     

Epoch 4/15, Average Loss: 0.5019


                                                                       

Epoch 5/15, Average Loss: 0.5003


                                                                       

Epoch 6/15, Average Loss: 0.4817


                                                                       

Epoch 7/15, Average Loss: 0.4574


                                                                       

Epoch 8/15, Average Loss: 0.4568


                                                                   

Epoch 9/15, Average Loss: 0.4606


                                                                        

Epoch 10/15, Average Loss: 0.4376


                                                                    

Epoch 11/15, Average Loss: 0.4225


                                                                      

Epoch 12/15, Average Loss: 0.4356


                                                                  

Epoch 13/15, Average Loss: 0.4249


                                                                      

Epoch 14/15, Average Loss: 0.4251


                                                                    

Epoch 15/15, Average Loss: 0.4255
