### 5. Short-list promising models
We expect you to do some additional research and train at **least one model per team member**.

1. Train mainly quick and dirty models from different categories (e.g. linear, SVM, Random Forests etc) using default parameters
2. Measure and compare their performance
3. Analyse the most significant variables for each algorithm
4. Analyse the types of errors the models make
5. Have a quick round of feature selection and engineering if necessary
6. Have one or two more quick iterations of the five previous steps
7. Short-list the top three to five most promising models, preferring models that make different types of errors

In [170]:
# installing dependencies
!pip3 install pandas keras torch tqdm

# !pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
!pip3 install torch torchvision torchaudio

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [171]:
# importing the data as dataframes
import pandas as pd

emotions_df_train_ready = pd.read_csv('../data/emotions_train_ready.csv')

pd.set_option('display.max_columns', None)

##### just to not forget the mapping

- 0: 'sadness'
- 1: 'joy'
- 2: 'love'
- 3: 'anger'
- 4: 'fear'
- 5: 'surprise'
- 6: 'neutral'

In [172]:
emotions_df_train_ready.value_counts()

Unnamed: 0  text                                                                                                                                                                                                            label
15          [NAME] is always the dirtiest psychopath                                                                                                                                                                        3        1
400306      i have found the more i suppress the vampire inside me the more human emotions i feel the days i don t feel like feeling anymore are the days i m dangerous                                                     3        1
400224      i describe his gentle nature and how being in his presence made me feel more tranquil                                                                                                                           1        1
400225      Man, what a very tolerant person he is.                              

In [173]:
import re

# Custom tokenization using regular expressions
def custom_tokenize(text):
    # Regular expression for splitting on whitespace and keeping punctuation
    tokens = re.findall(r"\b\w+\b|[!?.]", text)
    return tokens

# Applying custom tokenization to the entire dataset
tokenized_texts = [custom_tokenize(text) for text in emotions_df_train_ready['text']]

# Example of tokenized text
tokenized_texts_example = tokenized_texts[0]
tokenized_texts_example

['i', 'arent', 'feeling', 'too', 'sleep', 'deprived']

In [174]:
from collections import Counter

# Flatten the list of tokenized texts
all_tokens = [token for text in tokenized_texts for token in text]
vocabulary = set(all_tokens)

# Mapping tokens to integers
token_to_int = {token: i+1 for i, token in enumerate(vocabulary)}


In [175]:
# Define the maximum vocabulary size
max_vocab_size = 10000  # You can adjust this number as needed

# Count the frequency of each token
token_counts = Counter(all_tokens)

# Keep only the most frequent tokens
most_common_tokens = [token for token, count in token_counts.most_common(max_vocab_size)]

# Create a new token-to-integer mapping, including a token for unknown words
token_to_int = {token: i+1 for i, token in enumerate(most_common_tokens)}
token_to_int["<UNK>"] = len(most_common_tokens) + 1

In [176]:
# Encode the tokenized texts using the updated mapping
encoded_sequences = []
for text in tokenized_texts:
    encoded_text = [token_to_int.get(token, token_to_int["<UNK>"]) for token in text]
    encoded_sequences.append(encoded_text)

In [177]:
import numpy as np

# Finding the maximum sequence length
max_seq_length = max(len(seq) for seq in encoded_sequences)

# Padding the sequences
padded_sequences = np.array([seq + [0]*(max_seq_length - len(seq)) for seq in encoded_sequences])

In [178]:
# Extracting the labels
labels = emotions_df_train_ready['label'].values

# One-hot encoding the labels
num_classes = len(np.unique(labels))
one_hot_labels = np.eye(num_classes)[labels]


In [179]:
from sklearn.model_selection import train_test_split

# Splitting the dataset into training and the rest
X_temp, X_test, y_temp, y_test = train_test_split(
    padded_sequences, one_hot_labels, test_size=0.2, stratify=labels, random_state=42)

# Further splitting the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42)  # 0.25 x 0.8 = 0.2


X_train, X_val, X_test, y_train, y_val, y_test


(array([[  1,  68,  55, ...,   0,   0,   0],
        [  1,  20,  90, ...,   0,   0,   0],
        [ 11, 235, 286, ...,   0,   0,   0],
        ...,
        [  1, 308,  31, ...,   0,   0,   0],
        [  1,  22,  35, ...,   0,   0,   0],
        [  1,  20, 238, ...,   0,   0,   0]]),
 array([[  17,    2,   15, ...,    0,    0,    0],
        [   1,  156,  326, ...,    0,    0,    0],
        [   1,    2,  217, ...,    0,    0,    0],
        ...,
        [   1,   20,  319, ...,    0,    0,    0],
        [1062,   19, 1107, ...,    0,    0,    0],
        [  26,   18,  310, ...,    0,    0,    0]]),
 array([[   1, 2132,  107, ...,    0,    0,    0],
        [   1,   68,    3, ...,    0,    0,    0],
        [ 366,   19,  223, ...,    0,    0,    0],
        ...,
        [  26,  345,   53, ...,    0,    0,    0],
        [2015,    7,  171, ...,    0,    0,    0],
        [   1,    2,  764, ...,    0,    0,    0]]),
 array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 

In [180]:
# Saving the train val test split so we can work in multiple files:

import pickle

# Save the arrays to a file
with open('../data/emption_dataset_partitions_train_ready.pkl', 'wb') as f:
    pickle.dump((X_train, X_val, X_test, y_train, y_val, y_test), f)

_____

# Defining the model

In [91]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

In [92]:
class TransformerBlock(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        src2 = self.multihead_attn(src, src, src, attn_mask=src_mask,
                                   key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(F.relu(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src


In [139]:
import math


class TransformerModel(nn.Module):
    def __init__(self, ntoken, d_model, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.transformer_layers = nn.ModuleList([TransformerBlock(d_model, nhead, nhid, dropout) for _ in range(nlayers)])
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, num_classes)
        

        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        for mod in self.transformer_layers:
            src = mod(src, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        
       # Select the output of the first token for classification
        src = src[:, 0, :]
        output = self.decoder(src)
        return output


def createTransformer(hyperparams: dict):
    model = TransformerModel(
            hyperparams['ntokens'],
            hyperparams['d_model'],
            hyperparams['nhead'],
            hyperparams['nhid'],
            hyperparams['nlayers'],
            hyperparams['dropout']
        )
    return model

---------------

# Training the model

trying other tokenization

In [198]:
# hyperparams
hyperparams_v1 = {
    'ntokens': len(most_common_tokens) + 2, # size of vocabulary
    'd_model': 128, # embedding dimension
    'nhead': 4    , # number of heads in multi-head attention models
    'nhid': 512   ,# dimension of the feedforward network model in nn.TransformerEncoder
    'nlayers': 4  , # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    'dropout': 0.1, # dropout probability
}

batch_size = 32
vobab_size = 10000

In [190]:
# creating the model

ntokens = len(most_common_tokens) + 2 # size of vocabulary
d_model = 128 # embedding dimension
nhead = 4     # number of heads in multi-head attention models
nhid = 512   # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 4   # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
dropout = 0.1 # dropout probability

model = createTransformer(hyperparams_v1)


In [191]:
# first iteration of loader for old data splits

from torch.utils.data import Dataset, DataLoader

# Create a custom dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Instantiate the dataset
dataset = TextDataset(X_train, y_train)

# Create data loaders
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [199]:
class Tokenizer:
    def __init__(self, texts, max_vocab_size):
        self.vocab = self.build_vocab(texts, max_vocab_size)
        self.word_index = {word: i for i, word in enumerate(self.vocab)}
        self.index_word = {i: word for word, i in self.word_index.items()}

    def build_vocab(self, texts, max_vocab_size):
        # Count word frequencies
        word_freq = {}
        for text in texts:
            for word in text.split():
                word_freq[word] = word_freq.get(word, 0) + 1

        # Sort words by frequency and take the top 'max_vocab_size' words
        vocab = [word for word, freq in sorted(word_freq.items(), key=lambda x: -x[1])]
        vocab = vocab[:max_vocab_size]

        # Add special tokens
        vocab = ['<pad>', '<unk>'] + vocab
        return vocab

    def encode(self, text):
        return [self.word_index.get(word, self.word_index['<unk>']) for word in text.split()]

    def decode(self, sequence):
        return ' '.join([self.index_word.get(index, '<unk>') for index in sequence])


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch
import numpy as np


data = emotions_df_train_ready

tokenizer = Tokenizer(data['text'], max_vocab_size=10000)

# Convert texts to sequences
sequences = [tokenizer.encode(text) for text in data['text']]

# Padding sequences
max_length = 128  # or another value that suits your model
padded_sequences = np.array([np.pad(seq, (0, max_length - len(seq)), mode='constant') for seq in sequences])

# One-hot encode labels
labels = np.array(pd.get_dummies(data['label']))

# Split the data
X_train, X_temp, y_train, y_temp = train_test_split(padded_sequences, labels, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert to PyTorch tensors
X_train, y_train = torch.tensor(X_train), torch.tensor(y_train)
X_val, y_val = torch.tensor(X_val), torch.tensor(y_val)
X_test, y_test = torch.tensor(X_test), torch.tensor(y_test)

# Create DataLoader for each dataset
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=32, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=32, shuffle=False)

In [209]:
X_train, y_train

(tensor([[ 25,  55,  55,  ...,   0,   0,   0],
         [ 27,  65,  50,  ...,   0,   0,   0],
         [  2,   3, 664,  ...,   0,   0,   0],
         ...,
         [  2,  65,   7,  ...,   0,   0,   0],
         [  2,   3, 417,  ...,   0,   0,   0],
         [  2,   3,   9,  ...,   0,   0,   0]]),
 tensor([[False, False, False,  ...,  True, False, False],
         [ True, False, False,  ..., False, False, False],
         [False, False, False,  ...,  True, False, False],
         ...,
         [ True, False, False,  ..., False, False, False],
         [ True, False, False,  ..., False, False, False],
         [False,  True, False,  ..., False, False, False]]))

In [201]:
## Switch to M! GPU acceleration 

import torch

# Check if MPS (Apple's M1 GPU) is available and use it; otherwise, use CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

model = model.to(device).float()


Using device: mps


In [202]:
# checking if we have the correct output from the model

sequence_length = 50

dummy_input = torch.randint(0, ntokens, (sequence_length, batch_size))
dummy_output = model(dummy_input.to(device))
print(dummy_output.shape)  # Should be [batch_size, num_classes]

torch.Size([50, 7])


In [203]:
import os
import torch

def save_model_if_not_exists(model: TransformerModel, file_path):
    if os.path.exists(file_path):
        print(f"File '{file_path}' already exists. Model not saved to avoid overwriting.")
    else:
        torch.save(model.state_dict(), file_path)
        print(f"Model saved successfully to '{file_path}'.")

In [204]:
import torch.optim as optim

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [217]:
def train_one_epoch(model, train_loader, optimizer, criterion, device):
    model.train()  # Set the model to training mode
    train_loss = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.long().to(device), torch.argmax(labels.float(), dim=1).long().to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f'Training Loss: {avg_train_loss:.4f}', flush=True)

def validate(model, val_loader, criterion, device):
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    all_predictions = []
    all_true_labels = []

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.long().to(device), torch.argmax(labels.float(), dim=1).long().to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            all_predictions.extend(preds.cpu().numpy())
            all_true_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    print(f'Validation Loss: {avg_val_loss:.4f}', flush=True)
    return all_true_labels, all_predictions

from sklearn.metrics import classification_report

# Example of a function to print classification report
def print_classification_report(true_labels, predictions):
    print(classification_report(true_labels, predictions))


In [219]:
# Training

num_epochs = 8

# Training loop
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    train_one_epoch(model, train_loader, optimizer, criterion, device)
    true_labels, predictions = validate(model, val_loader, criterion, device)
    print_classification_report(true_labels, predictions)

## currently 8 epochs
# Save the model's state dictionary
save_model_if_not_exists(model, './models/emotions_transformer_model_2.pth')

Epoch 1/8


ValueError: too many values to unpack (expected 2)

In [212]:
# Initialize the model (ensure it has the same architecture as the saved one)
loaded_model = createTransformer(hyperparams_v1)


loaded_state_dict = torch.load('./models/emotions_transformer_model_2.pth')

loaded_model.load_state_dict(loaded_state_dict)

# If you are using the MPS device
loaded_model = loaded_model.to(device)

loaded_model.eval()

loaded_model = loaded_model.to(device).float()
# model.eval()

-----------------

## Validation

In [213]:
val_dataset = TextDataset(X_val, y_val)  # Assuming TextDataset is your custom dataset class
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [214]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Lists to store true and predicted labels
true_labels = []
predicted_labels = []

loaded_model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    for inputs, labels in tqdm(val_loader, desc='Validating', leave=False):
        inputs = inputs.to(device).long()
        
        # Assuming labels are one-hot encoded
        true_class_indices = np.argmax(labels.cpu().numpy(), axis=1)
        
        outputs = loaded_model(inputs)
        # print('🚀 ~ file: 5-training-models.ipynb:18 ~ outputs.shape:', outputs.shape)
        
        preds = torch.argmax(outputs, dim=1)  # Convert outputs to class indices
        # print('🚀 ~ file: 5-training-models.ipynb:21 ~ preds.shape:', preds.shape)
        
        add_to_preds = preds.cpu().numpy().tolist()
        # print('🚀 ~ file: 5-training-models.ipynb:21 ~ add_to_preds:', add_to_preds)
        
        add_to_true = true_class_indices.tolist()
        # print('🚀 ~ file: 5-training-models.ipynb:21 ~ add_to_true:', add_to_true)
        
        
        true_labels.extend(add_to_true)  # Convert to list if not already
        predicted_labels.extend(add_to_preds)  # Convert to list
        

                                                             

In [215]:
predicted_labels

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [216]:
# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='macro')
recall = recall_score(true_labels, predicted_labels, average='macro')
f1 = f1_score(true_labels, predicted_labels, average='macro')

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")
print(f"Validation F1 Score: {f1:.4f}")

Validation Accuracy: 0.3336
Validation Precision: 0.0477
Validation Recall: 0.1429
Validation F1 Score: 0.0715


  _warn_prf(average, modifier, msg_start, len(result))
