This notebook contains code for hyperparameter searching for some of the models. Data related to the hyperparm search is saved to "logs/optuna.db". This can be easily visualized and analyzed using the Optuna VSCode Extension or otherwise.

In [1]:
from __future__ import print_function

import torch

import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

from models.LSTMs import LSTMWithWC, LSTM
from models.transformers import Encoder
from models.MLP import MLP

from utils.tokenizers import *
from utils.datasets import *
from utils.utils import clean_original_content
import random
device = "cuda" if torch.cuda.is_available() else "cpu"

import optuna

# Set the seeds
seed = 2024
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [2]:

data = pd.read_csv('data/dataset(clean).csv')
data = shuffle(data, random_state=seed)

# data should be the same as training set in train.py - no peeking at test set
data = data.iloc[:10000]

train_n = round(len(data)*0.8)

train_data = data.iloc[:train_n]
val_data = data.iloc[train_n:]

emotion_to_int = {
    "disappointed": 0,
    "happy": 1,
    "angry": 2,
}
    
labels_train = torch.tensor(list(train_data.Emotion.map(lambda x: emotion_to_int[x])))
labels_val = torch.tensor(list(val_data.Emotion.map(lambda x: emotion_to_int[x])))

tweets_train = list(train_data.OriginalContent)
tweets_val = list(val_data.OriginalContent)

tweets_train = clean_original_content(tweets_train)
tweets_val = clean_original_content(tweets_val)

In [3]:
batch_size = 64
block_size = 128

In [4]:
# using BPE tokenizer for all classifiers here
tokenizer = CustomBPETokenizer(use_existing_tokenizer=True, use_existing_tokens=True)

### MLP
**NOTE:** hyperparameters chosen for MLP were also used for MLPWithFastText

In [5]:
def train_mlp(lr, wd, emb_dim, hidden_dim):
    model = MLP(tokenizer.get_vocab_size(), emb_dim=emb_dim, block_size=block_size, hidden_dim=hidden_dim, num_classes=3).to(device)
    train_dataset = model.get_dataset(tweets_train, labels_train, tokenizer, tag='hparam_search_train')
    val_dataset = model.get_dataset(tweets_val, labels_val, tokenizer, tag='hparam_search_val')
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    train_loss, val_loss, train_acc, val_acc = model.train_model(train_dataset, val_dataset, optimizer, batch_size)
    return val_loss

def objective(trial):
    lr = trial.suggest_float("lr", 1e-4, 1e-1)
    wd = trial.suggest_float("wd", 1e-4, 1e-2)
    emb_dim = 2** trial.suggest_int("emb_dim_exponent", 4, 6)
    hidden_dim = 2** trial.suggest_int("hidden_dim_exponent", 6, 8)
    return train_mlp(lr, wd, emb_dim, hidden_dim)

study = optuna.create_study(direction='minimize', study_name='MLPWithFastText Study',  storage='sqlite:///logs/optuna.db')
study.optimize(objective, n_trials=30)

best_params = study.best_params
print(f'Best params: {best_params}')

[I 2024-05-02 20:52:18,467] A new study created in RDB with name: MLPWithFastText Study
100%|██████████| 125/125 [00:00<00:00, 296.98batch/s]
[I 2024-05-02 20:52:20,169] Trial 0 finished with value: 1.0976 and parameters: {'lr': 0.06606611713357255, 'wd': 0.0005542785283686457, 'emb_dim_exponent': 4, 'hidden_dim_exponent': 6}. Best is trial 0 with value: 1.0976.
100%|██████████| 125/125 [00:00<00:00, 415.57batch/s]
[I 2024-05-02 20:52:21,363] Trial 1 finished with value: 0.9435 and parameters: {'lr': 0.0974274316093112, 'wd': 0.002451206240418282, 'emb_dim_exponent': 4, 'hidden_dim_exponent': 8}. Best is trial 1 with value: 0.9435.
100%|██████████| 125/125 [00:00<00:00, 434.36batch/s]
[I 2024-05-02 20:52:22,568] Trial 2 finished with value: 1.0069 and parameters: {'lr': 0.08551211243929724, 'wd': 0.0016987806218184852, 'emb_dim_exponent': 4, 'hidden_dim_exponent': 7}. Best is trial 1 with value: 0.9435.
100%|██████████| 125/125 [00:00<00:00, 441.20batch/s]
[I 2024-05-02 20:52:23,641] T

Best params: {'lr': 0.0035524933572427322, 'wd': 0.0063867952907142875, 'emb_dim_exponent': 6, 'hidden_dim_exponent': 7}


### Encoder

In [6]:
def train_encoder(lr, wd, emb_dim, heads, blocks):
    model = Encoder(tokenizer.get_vocab_size(), emb_dim=emb_dim, block_size=block_size, num_classes=3, heads=heads, blocks=blocks).to(device)
    train_dataset = model.get_dataset(tweets_train, labels_train, tokenizer, tag='hparam_search_train')
    val_dataset = model.get_dataset(tweets_val, labels_val, tokenizer, tag='hparam_search_val')

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    train_loss, val_loss, train_acc, val_acc = model.train_model(train_dataset, val_dataset, optimizer, batch_size)
    return val_loss

def objective(trial):
    lr = trial.suggest_float("lr", 1e-4, 1e-1)
    wd = trial.suggest_float("wd", 1e-4, 1e-2)
    emb_dim = 2** trial.suggest_int("emb_dim_exponent", 4, 6)
    heads = 2** trial.suggest_int("heads_exponent", 1, 4)
    if heads > emb_dim: # head size = dim/heads
        return float('inf')
    blocks = trial.suggest_int("blocks", 1, 4)
    return train_encoder(lr, wd, emb_dim, heads, blocks)

study = optuna.create_study(direction='minimize', study_name='Encoder Study', storage='sqlite:///logs/optuna.db')
study.optimize(objective, n_trials=30)

best_params = study.best_params
print(f'Best params: {best_params}')

[I 2024-05-02 20:52:55,018] A new study created in RDB with name: Encoder Study
100%|██████████| 125/125 [00:00<00:00, 166.55batch/s]
[I 2024-05-02 20:52:56,788] Trial 0 finished with value: 1.8823 and parameters: {'lr': 0.09432714831791314, 'wd': 0.006101943445563018, 'emb_dim_exponent': 4, 'heads_exponent': 1, 'blocks': 1}. Best is trial 0 with value: 1.8823.
100%|██████████| 125/125 [00:01<00:00, 73.43batch/s]
[I 2024-05-02 20:52:59,507] Trial 1 finished with value: 22.1904 and parameters: {'lr': 0.055598962254645105, 'wd': 0.008006598324454977, 'emb_dim_exponent': 6, 'heads_exponent': 2, 'blocks': 2}. Best is trial 0 with value: 1.8823.
100%|██████████| 125/125 [00:01<00:00, 72.88batch/s]
[I 2024-05-02 20:53:02,364] Trial 2 finished with value: 2.5117 and parameters: {'lr': 0.01677529300702094, 'wd': 0.008657999196414986, 'emb_dim_exponent': 5, 'heads_exponent': 2, 'blocks': 2}. Best is trial 0 with value: 1.8823.
100%|██████████| 125/125 [00:02<00:00, 47.62batch/s]
[I 2024-05-02 2

Best params: {'lr': 0.010405639409764685, 'wd': 0.006273954885390327, 'emb_dim_exponent': 6, 'heads_exponent': 3, 'blocks': 2}


### LSTM With WC

In [7]:
def train_lstm_with_wc(lr, wd, emb_dim, hidden_dim, bidirectional):
    model = LSTMWithWC(tokenizer.get_vocab_size(), emb_dim=emb_dim, hidden_dim=hidden_dim, num_classes=3, bidirectional=bidirectional).to(device)
    train_dataset = model.get_dataset(tweets_train, labels_train, tokenizer, tag='hparam_search_train')
    val_dataset = model.get_dataset(tweets_val, labels_val, tokenizer, tag='hparam_search_val')
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    train_loss, val_loss, train_acc, val_acc = model.train_model(train_dataset, val_dataset, optimizer, batch_size)
    return val_loss

def objective(trial):
    lr = trial.suggest_float("lr", 1e-4, 1e-1)
    wd = trial.suggest_float("wd", 1e-4, 1e-2)
    emb_dim = 2** trial.suggest_int("emb_dim_exponent", 4, 6)
    hidden_dim = 2** trial.suggest_int("hidden_dim_exponent", 6, 8)
    bidirectional = trial.suggest_categorical("bidirectional", [True, False])
    return train_lstm_with_wc(lr, wd, emb_dim, hidden_dim, bidirectional)

study = optuna.create_study(direction='minimize', study_name='LSTMWithWC Study',  storage='sqlite:///logs/optuna.db')
study.optimize(objective, n_trials=30)

best_params = study.best_params
print(f'Best params: {best_params}')

[I 2024-05-02 20:55:18,871] A new study created in RDB with name: LSTMWithWC Study
100%|██████████| 125/125 [00:11<00:00, 10.70batch/s]
[I 2024-05-02 20:55:32,431] Trial 0 finished with value: 1.1344 and parameters: {'lr': 0.056000803353149727, 'wd': 0.0026616276603737363, 'emb_dim_exponent': 6, 'hidden_dim_exponent': 6, 'bidirectional': False}. Best is trial 0 with value: 1.1344.
100%|██████████| 125/125 [00:12<00:00, 10.25batch/s]
[I 2024-05-02 20:55:46,464] Trial 1 finished with value: 1.2229 and parameters: {'lr': 0.08716175456323025, 'wd': 0.004426778899577578, 'emb_dim_exponent': 6, 'hidden_dim_exponent': 8, 'bidirectional': False}. Best is trial 0 with value: 1.1344.
100%|██████████| 125/125 [00:22<00:00,  5.56batch/s]
[I 2024-05-02 20:56:11,850] Trial 2 finished with value: 1.1317 and parameters: {'lr': 0.07018159768774249, 'wd': 0.008108052914573942, 'emb_dim_exponent': 6, 'hidden_dim_exponent': 7, 'bidirectional': True}. Best is trial 2 with value: 1.1317.
100%|██████████| 12

Best params: {'lr': 0.010522219071701554, 'wd': 0.0011826148520623315, 'emb_dim_exponent': 4, 'hidden_dim_exponent': 7, 'bidirectional': True}


### LSTM

In [8]:
def train_lstm_with_wc(lr, wd, emb_dim, hidden_dim, bidirectional):
    model = LSTM(tokenizer.get_vocab_size(), emb_dim=emb_dim, hidden_dim=hidden_dim, num_classes=3, bidirectional=bidirectional).to(device)
    train_dataset = model.get_dataset(tweets_train, labels_train, tokenizer, tag='hparam_search_train')
    val_dataset = model.get_dataset(tweets_val, labels_val, tokenizer, tag='hparam_search_val')

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    train_loss, val_loss, train_acc, val_acc = model.train_model(train_dataset, val_dataset, optimizer, batch_size)
    return val_loss

def objective(trial):
    lr = trial.suggest_float("lr", 1e-4, 1e-1)
    wd = trial.suggest_float("wd", 1e-4, 1e-2)
    emb_dim = 2** trial.suggest_int("emb_dim_exponent", 4, 6)
    hidden_dim = 2** trial.suggest_int("hidden_dim_exponent", 6, 8)
    bidirectional = trial.suggest_categorical("bidirectional", [True, False])
    return train_lstm_with_wc(lr, wd, emb_dim, hidden_dim, bidirectional)

study = optuna.create_study(direction='minimize', study_name='LSTM Study',  storage='sqlite:///logs/optuna.db')
study.optimize(objective, n_trials=30)

best_params = study.best_params
print(f'Best params: {best_params}')

[I 2024-05-02 21:06:04,897] A new study created in RDB with name: LSTM Study
100%|██████████| 125/125 [00:02<00:00, 61.19batch/s]
[I 2024-05-02 21:06:08,061] Trial 0 finished with value: 0.9987 and parameters: {'lr': 0.07506101972874252, 'wd': 0.009470351957146378, 'emb_dim_exponent': 6, 'hidden_dim_exponent': 6, 'bidirectional': True}. Best is trial 0 with value: 0.9987.
100%|██████████| 125/125 [00:02<00:00, 62.03batch/s]
[I 2024-05-02 21:06:10,995] Trial 1 finished with value: 0.8738 and parameters: {'lr': 0.042060929078430495, 'wd': 0.0018129209212587678, 'emb_dim_exponent': 4, 'hidden_dim_exponent': 6, 'bidirectional': True}. Best is trial 1 with value: 0.8738.
100%|██████████| 125/125 [00:01<00:00, 81.54batch/s]
[I 2024-05-02 21:06:13,516] Trial 2 finished with value: 0.9755 and parameters: {'lr': 0.07306580603639927, 'wd': 0.007691253955753859, 'emb_dim_exponent': 4, 'hidden_dim_exponent': 6, 'bidirectional': False}. Best is trial 1 with value: 0.8738.
100%|██████████| 125/125 [

Best params: {'lr': 0.015879729475611434, 'wd': 0.00019051865663229051, 'emb_dim_exponent': 4, 'hidden_dim_exponent': 6, 'bidirectional': False}
