# 0. Setup

In [None]:
# Standard library
import os
import sys
import re
import string
import warnings
import pathlib
from pathlib import Path

# Core / scientific
import numpy as np
import pandas as pd
from tqdm import tqdm

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.utils as nn_utils
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter

# Transformers
from transformers import BertTokenizer, BertModel

# scikit-learn (+ Intel acceleration)
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, precision_recall_fscore_support

# Gensim / NLTK
import gensim
import gensim.downloader
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Serialization
import joblib
from joblib import dump, load

# Pandas display options
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

# Reproducibility
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
rng = np.random.RandomState(SEED)

# Warnings
warnings.simplefilter("ignore", FutureWarning)

# Local packages
sys.path.insert(0, str(Path.cwd() / "src"))
from functions import textprep
from functions import evalkit
from project_paths import DATA_DIR, FILES_DIR, TSB_DIR

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [236]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# PyTorch info
print("PyTorch version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")
print("CUDA Version:", torch.version.cuda)

PyTorch version: 2.5.1+cu124
CUDA Available: True
GPU: NVIDIA GeForce GTX 970
CUDA Version: 12.4


In [237]:
# Download the Word2Vec embeddings
word2vec = gensim.downloader.load('word2vec-google-news-300')
# Load the pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
# Load the data
reddit_data = pd.read_csv(DATA_DIR / "reddit_sentiment_processed.csv")

In [238]:
class_dict_bert_sentiment = {0: "Very negative",
                             1: "Negative",
                             2: "Neutral",
                             3: "Positive",
                             4: "Very positive"}


# I. Tokenization and embeddings

In [239]:
# Words to keep (won't be removed as stopwords)
custom_words_to_keep = [
    'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 
    'doesn', "doesn't", 'don', "don't", 'hadn', "hadn't", 
    'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 
    'mustn', "mustn't", 'wasn', "wasn't", 'weren', "weren't", 
    "won't", 'wouldn', "wouldn't",
    'above', 'below', 'down', 'up'
]

In [284]:
# cache paths in DATA_DIR
preproc_path = FILES_DIR / "pkl_files" / "clean_tokens_and_body.pkl"
emb_path = FILES_DIR / "pkl_files" / "sentence_embedding.pkl"

tqdm.pandas()
def preprocess_tokens(text):
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    toks = textprep.tokenize_function(text)
    toks = textprep.clean_tokens(toks, custom_words_to_keep=custom_words_to_keep)
    toks = textprep.replace_numerical_tokens(toks)
    return toks

# load or compute clean_tokens / clean_body
if preproc_path.exists():
    tmp = pd.read_pickle(preproc_path)
    reddit_data["clean_tokens"] = tmp.loc[reddit_data.index, "clean_tokens"]
    reddit_data["clean_body"] = tmp.loc[reddit_data.index, "clean_body"]
else:
    reddit_data["clean_tokens"] = reddit_data["body"].fillna("").progress_map(preprocess_tokens)
    reddit_data["clean_body"] = reddit_data["body"].fillna("").progress_map(textprep.clean_body)
    pd.DataFrame({"clean_tokens": reddit_data["clean_tokens"], "clean_body": reddit_data["clean_body"]}).to_pickle(preproc_path)

# load or compute sentence_embedding (single post embedding by averaging word embeddings)
if emb_path.exists():
    reddit_data["sentence_embedding"] = pd.read_pickle(emb_path).reindex(reddit_data.index)
else:
    reddit_data["sentence_embedding"] = reddit_data["clean_tokens"].apply(lambda tokens: textprep.post_to_embedding(tokens, word2vec))
    reddit_data["sentence_embedding"].to_pickle(emb_path)

# II. Train and Test sets preparation

In [241]:
# Split: 60% train, 20% validation, 20% test 
train_data, temp_data = train_test_split(reddit_data, test_size=0.4, random_state=42)  
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)  

In [242]:
# Stack embeddings into a single NumPy array
sentence_embeddings_train = np.array(train_data['sentence_embedding'].tolist())  
sentence_embeddings_val = np.array(val_data['sentence_embedding'].tolist()) 
sentence_embeddings_test = np.array(test_data['sentence_embedding'].tolist())  

# Convert sentence embeddings and labels to tensors
X_train = torch.tensor(sentence_embeddings_train, dtype=torch.float32).to(device)
X_val = torch.tensor(sentence_embeddings_val, dtype=torch.float32).to(device)
X_test = torch.tensor(sentence_embeddings_test, dtype=torch.float32).to(device)

In [243]:
# Convert labels to NumPy arrays
labels_train = np.array(train_data['sentiment_label'].tolist())
labels_val = np.array(val_data['sentiment_label'].tolist())
labels_test = np.array(test_data['sentiment_label'].tolist())

y_train = torch.tensor(labels_train, dtype=torch.long).to(device)
y_val = torch.tensor(labels_val, dtype=torch.long).to(device)
y_test = torch.tensor(labels_test, dtype=torch.long).to(device)

In [244]:
# Tokenize the input text for BERT
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def _load_or_tokenize(series, path):
    if path.exists():
        return torch.load(path, map_location='cpu', weights_only=False)
    enc = textprep.bert_tokenize_function(series.fillna('').tolist(), bert_tokenizer)
    torch.save(enc, path)
    return enc

train_path = FILES_DIR / "pt_files" / "X_bert_train.pt"
val_path = FILES_DIR / "pt_files" / "X_bert_val.pt"
test_path = FILES_DIR / "pt_files" / "X_bert_test.pt"

X_bert_train = _load_or_tokenize(train_data["clean_body"], train_path)
X_bert_val = _load_or_tokenize(val_data["clean_body"], val_path)
X_bert_test = _load_or_tokenize(test_data["clean_body"], test_path)

In [245]:
# Load or compute BERT embeddings and save to .npy files
embedding_files = {
    "train": "X_bert_embeddings_train.npy",
    "val": "X_bert_embeddings_val.npy",
    "test": "X_bert_embeddings_test.npy"
}

# Resolve target paths
paths = {k: (Path(v) if Path(v).is_absolute() else (FILES_DIR / "npy_files" / v)) for k, v in embedding_files.items()}
paths["train"].parent.mkdir(parents=True, exist_ok=True)


embeddings = {}
missing = [k for k, p in paths.items() if not p.exists()]

if missing:
    from transformers import BertModel
    bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
    bert_model.eval()

    datasets = {"train": X_bert_train, "val": X_bert_val, "test": X_bert_test}
    for key, dataset in datasets.items():
        if paths[key].exists():
            embeddings[key] = np.load(paths[key], allow_pickle=False)
        else:
            embeddings[key] = np.array([
                textprep.get_bert_embedding({
                    "input_ids": dataset["input_ids"][i].unsqueeze(0),
                    "attention_mask": dataset["attention_mask"][i].unsqueeze(0),
                    device: device
                }) for i in range(len(dataset["input_ids"]))
            ])
            np.save(paths[key], embeddings[key])
else:
    embeddings = {k: np.load(p, allow_pickle=False) for k, p in paths.items()}

X_bert_post_embeddings_train, X_bert_post_embeddings_val, X_bert_post_embeddings_test = embeddings["train"], embeddings["val"], embeddings["test"]

In [246]:
# Create TensorDatasets for Word2Vec based models
train_dataset = TensorDataset(X_train.to(device), y_train.to(device))
val_dataset = TensorDataset(X_val.to(device), y_val.to(device))
test_dataset = TensorDataset(X_test.to(device), y_test.to(device))

# Create DataLoaders for Word2Vec based models
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [247]:
# Create TensorDatasets for BERT based models
train_dataset_bert = TensorDataset(
    X_bert_train["input_ids"].to(device), X_bert_train["attention_mask"].to(device), y_train.to(device))
val_dataset_bert = TensorDataset(
    X_bert_val["input_ids"].to(device), X_bert_val["attention_mask"].to(device), y_val.to(device))
test_dataset_bert = TensorDataset(
    X_bert_test["input_ids"].to(device), X_bert_test["attention_mask"].to(device), y_test.to(device))

# Create DataLoaders for BERT based models
batch_size = 32
train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
val_dataloader_bert = DataLoader(val_dataset_bert, batch_size=batch_size)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size)

# III. Word2Vec + MLP

Model naming convention: 
`<EmbeddingType>_<ArchitectureType>_<LayerConfig>_<Activation>_<Extras>_<Task>`

#### Example Components

- **Embedding Type**: Specify the input representation.  
  Examples: `W2V` for Word2Vec, `BERT`, `GloVe`, `TF-IDF`, etc.

- **Architecture Type**: Include the model type.  
  Examples: `MLP`, `LSTM`, `GRU`, `TF` (Transformer), etc.

- **Layer Configuration**: Use layer sizes or count.  
  Examples: `128-64-32` for layer sizes or `3L` for 3 layers.

- **Activation Function**: Specify the activation function.  
  Examples: `ReLU`, `LeakyReLU`, `Tanh`, etc.

- **Extras**: Include regularization, dropout, or batch normalization (if relevant).  
  Examples: `DO30` for 30% dropout, `BN` for BatchNorm.

- **Task**: Add a suffix to describe the task.  
  Examples: `MC` for multi-class classification, `SC` for single-class classification.

### III.a. Multi-class classification
- target variable: `BERT-Sentiment`
- input: vector dim (300,1); sentence vector representation

In [248]:
class W2V_MLP_1L_ReLU_MC(nn.Module):
    def __init__(self):
        super(W2V_MLP_1L_ReLU_MC, self).__init__()
        self.fc1 = nn.Linear(300, 5).to(device)      # Single-layer network (input to output)
        self.relu = nn.ReLU().to(device)             # ReLU activation
        self.softmax = nn.Softmax(dim=1).to(device)  # Softmax for multi-class output

    def forward(self, x):
        x = x.to(device)
        x = self.fc1(x)             # Direct pass to output layer
        x = self.softmax(x)         # Softmax activation for probabilities
        return x

In [249]:
class W2V_MLP_3L_ReLU_MC(nn.Module):
    def __init__(self):
        super(W2V_MLP_3L_ReLU_MC, self).__init__()
        self.fc1 = nn.Linear(300, 128).to(device)  # Input layer to hidden layer 1
        self.fc2 = nn.Linear(128, 64).to(device)   # Hidden layer 1 to hidden layer 2
        self.fc3 = nn.Linear(64, 5).to(device)     # Hidden layer 2 to output layer (5 classes)
        self.relu = nn.ReLU().to(device)           # ReLU activation
        self.softmax = nn.Softmax(dim=1).to(device)  # Softmax activation for multi-class output

    def forward(self, x):
        x = x.to(device)
        x = self.relu(self.fc1(x))  # Layer 1 with ReLU activation
        x = self.relu(self.fc2(x))  # Layer 2 with ReLU activation
        x = self.fc3(x)             # Output layer (logits)
        x = self.softmax(x)         # Softmax activation for probabilities
        return x


In [250]:
class W2V_MLP_3L_ReLU_MC_DO30(nn.Module):
    def __init__(self):
        super(W2V_MLP_3L_ReLU_MC_DO30, self).__init__()
        self.fc1 = nn.Linear(300, 128).to(device)  # Input layer to hidden layer 1
        self.dropout1 = nn.Dropout(0.3).to(device) # Dropout after layer 1 (30%)
        self.fc2 = nn.Linear(128, 64).to(device)   # Hidden layer 1 to hidden layer 2
        self.dropout2 = nn.Dropout(0.3).to(device) # Dropout after layer 2 (30%)
        self.fc3 = nn.Linear(64, 5).to(device)     # Hidden layer 2 to output layer (5 classes)
        self.relu = nn.ReLU().to(device)           # ReLU activation
        self.softmax = nn.Softmax(dim=1).to(device)  # Softmax activation for multi-class output

    def forward(self, x):
        x = x.to(device)
        x = self.relu(self.fc1(x))              # Layer 1 with ReLU
        x = self.dropout1(x)                    # Dropout after layer 1
        x = self.relu(self.fc2(x))              # Layer 2 with ReLU
        x = self.dropout2(x)                    # Dropout after layer 2
        x = self.fc3(x)                         # Output layer (logits)
        x = self.softmax(x)                     # Softmax activation for probabilities
        return x


In [251]:
class W2V_MLP_5L_ReLU_MC(nn.Module):
    def __init__(self):
        super(W2V_MLP_5L_ReLU_MC, self).__init__()
        self.fc1 = nn.Linear(300, 128).to(device)  # Input layer to hidden layer 1
        self.fc2 = nn.Linear(128, 64).to(device)   # Hidden layer 1 to hidden layer 2
        self.fc3 = nn.Linear(64, 32).to(device)    # Hidden layer 2 to hidden layer 3
        self.fc4 = nn.Linear(32, 16).to(device)    # Hidden layer 3 to hidden layer 4
        self.fc5 = nn.Linear(16, 5).to(device)     # Hidden layer 4 to output layer (5 classes)
        self.relu = nn.ReLU().to(device)           # ReLU activation
        self.softmax = nn.Softmax(dim=1).to(device)  # Softmax activation for multi-class output

    def forward(self, x):
        x = x.to(device)
        x = self.relu(self.fc1(x))  # Pass through first layer with ReLU
        x = self.relu(self.fc2(x))  # Pass through second layer with ReLU
        x = self.relu(self.fc3(x))  # Pass through third layer with ReLU
        x = self.relu(self.fc4(x))  # Pass through fourth layer with ReLU
        x = self.fc5(x)             # Output layer (logits)
        x = self.softmax(x)         # Softmax activation for probabilities
        return x

In [252]:
class W2V_MLP_5L_ReLU_MC_BN2(nn.Module):
    def __init__(self):
        super(W2V_MLP_5L_ReLU_MC_BN2, self).__init__()
        self.fc1 = nn.Linear(300, 128).to(device)  # Input layer to hidden layer 1
        self.fc2 = nn.Linear(128, 64).to(device)   # Hidden layer 1 to hidden layer 2
        self.bn2 = nn.BatchNorm1d(64).to(device)   # BatchNorm after layer 2
        self.fc3 = nn.Linear(64, 32).to(device)    # Hidden layer 2 to hidden layer 3
        self.fc4 = nn.Linear(32, 16).to(device)    # Hidden layer 3 to hidden layer 4
        self.bn4 = nn.BatchNorm1d(16).to(device)   # BatchNorm after layer 4
        self.fc5 = nn.Linear(16, 5).to(device)     # Hidden layer 4 to output layer (5 classes)
        self.relu = nn.ReLU().to(device)           # ReLU activation
        self.softmax = nn.Softmax(dim=1).to(device)  # Softmax activation for multi-class output

    def forward(self, x):
        x = x.to(device)
        x = self.relu(self.fc1(x))                 # Layer 1 with ReLU
        x = self.relu(self.bn2(self.fc2(x)))       # Layer 2 with BatchNorm and ReLU
        x = self.relu(self.fc3(x))                 # Layer 3 with ReLU
        x = self.relu(self.bn4(self.fc4(x)))       # Layer 4 with BatchNorm and ReLU
        x = self.fc5(x)                            # Output layer (logits)
        x = self.softmax(x)                        # Softmax activation for probabilities
        return x

In [253]:
class W2V_MLP_5L_ReLU_MC_BN4(nn.Module):
    def __init__(self):
        super(W2V_MLP_5L_ReLU_MC_BN4, self).__init__()
        self.fc1 = nn.Linear(300, 128).to(device)  # Input layer to hidden layer 1
        self.bn1 = nn.BatchNorm1d(128).to(device)  # BatchNorm after layer 1
        self.fc2 = nn.Linear(128, 64).to(device)   # Hidden layer 1 to hidden layer 2
        self.bn2 = nn.BatchNorm1d(64).to(device)   # BatchNorm after layer 2
        self.fc3 = nn.Linear(64, 32).to(device)    # Hidden layer 2 to hidden layer 3
        self.bn3 = nn.BatchNorm1d(32).to(device)   # BatchNorm after layer 3
        self.fc4 = nn.Linear(32, 16).to(device)    # Hidden layer 3 to hidden layer 4
        self.bn4 = nn.BatchNorm1d(16).to(device)   # BatchNorm after layer 4
        self.fc5 = nn.Linear(16, 5).to(device)     # Hidden layer 4 to output layer (5 classes)
        self.relu = nn.ReLU().to(device)           # ReLU activation
        self.softmax = nn.Softmax(dim=1).to(device)  # Softmax activation for multi-class output

    def forward(self, x):
        x = x.to(device)
        x = self.relu(self.bn1(self.fc1(x)))  # Layer 1 with BatchNorm and ReLU
        x = self.relu(self.bn2(self.fc2(x)))  # Layer 2 with BatchNorm and ReLU
        x = self.relu(self.bn3(self.fc3(x)))  # Layer 3 with BatchNorm and ReLU
        x = self.relu(self.bn4(self.fc4(x)))  # Layer 4 with BatchNorm and ReLU
        x = self.fc5(x)                       # Output layer (logits)
        x = self.softmax(x)                   # Softmax activation for probabilities
        return x


### III.b. Model training

In [254]:
''' Training loop for W2V based models '''

# Models to train
models = [
    # W2V_MLP_1L_ReLU_MC().to(device),
    # W2V_MLP_3L_ReLU_MC().to(device),
    # W2V_MLP_3L_ReLU_MC_DO30().to(device),
    # W2V_MLP_5L_ReLU_MC().to(device),
    # W2V_MLP_5L_ReLU_MC_BN2().to(device),
    # W2V_MLP_5L_ReLU_MC_BN4().to(device)
]

# Define loss and optimizer
criterion = nn.CrossEntropyLoss().to(device)

def get_optimizer(model):
    return optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

def validate_model(model, val_dataloader, criterion, device):
    model.eval()
    val_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    return val_loss / len(val_dataloader), correct / total

num_epochs = 20
patience = 3

# Directory setup
(FILES_DIR / "pth_files").mkdir(parents=True, exist_ok=True)
(TSB_DIR / "W2V").mkdir(parents=True, exist_ok=True)

writer = SummaryWriter(log_dir=str(TSB_DIR / "W2V"))

for model in models:
    model = model.to(device)
    optimizer = get_optimizer(model)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)
    
    best_val_loss = float('inf')
    no_improvement_count = 0
    
    print(f"Training {model.__class__.__name__}")
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        
        for inputs, labels_tensor in train_dataloader:
            inputs, labels_tensor = inputs.to(device), labels_tensor.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels_tensor)
            loss.backward()
            nn_utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            running_loss += loss.item()
        
        avg_train_loss = running_loss / len(train_dataloader)
        
        val_loss, val_accuracy = validate_model(model, val_dataloader, criterion, device)
        scheduler.step(val_loss)
        
        writer.add_scalar(f'{model.__class__.__name__}/Train_Loss', avg_train_loss, epoch)
        writer.add_scalar(f'{model.__class__.__name__}/Val_Loss', val_loss, epoch)
        writer.add_scalar(f'{model.__class__.__name__}/Val_Accuracy', val_accuracy, epoch)
        
        print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {avg_train_loss:.4f} Val Loss: {val_loss:.4f} Val Accuracy: {val_accuracy:.4f}")
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improvement_count = 0
            torch.save(model.state_dict(), FILES_DIR / "pth_files" / f"{model.__class__.__name__}_best_weights.pth")
        else:
            no_improvement_count += 1
            if no_improvement_count >= patience:
                print("Early stopping triggered.")
                break

    print(f"Finished training {model.__class__.__name__}.\n")

writer.close()

### III.c. Testing the models

In [255]:
''' CHOOSE MODEL TO TEST '''
models = [
    W2V_MLP_1L_ReLU_MC().to(device),
    W2V_MLP_3L_ReLU_MC().to(device),
    # W2V_MLP_3L_ReLU_MC_DO30().to(device),
    # W2V_MLP_5L_ReLU_MC().to(device),
    # W2V_MLP_5L_ReLU_MC_BN2().to(device),
    # W2V_MLP_5L_ReLU_MC_BN4().to(device)
]

# Load the weights
for model in models:
    _best = FILES_DIR / "pth_files" / f"{model.__class__.__name__}_best_weights.pth"
    _std = FILES_DIR / "pth_files" / f"{model.__class__.__name__}_weights.pth"
    _path = _best if _best.exists() else _std
    model.load_state_dict(torch.load(_path, map_location=device, weights_only=False))
    model.eval()

In [256]:
''' TESTING ON VALIDATION AND TEST SETS '''
for model in models:
    print(f"\n--- {model.__class__.__name__} ---")
    evalkit.evaluate(model, X_val, y_val, dataset_name="Validation", device=device)
    evalkit.evaluate(model, X_test, y_test, dataset_name="Test", device=device)


--- W2V_MLP_1L_ReLU_MC ---
Validation set:  Accuracy: 0.6737  F1(macro): 0.2925  F1(weighted): 0.6169
Test set:  Accuracy: 0.6798  F1(macro): 0.2951  F1(weighted): 0.6226

--- W2V_MLP_3L_ReLU_MC ---
Validation set:  Accuracy: 0.6998  F1(macro): 0.3037  F1(weighted): 0.6404
Test set:  Accuracy: 0.7048  F1(macro): 0.3058  F1(weighted): 0.6452


In [257]:
''' TESTING ON INPUT SENTENCE'''
test_sentence = "This is very negative sentence, omg, I hate it"
# test_sentence = "The price is going down, you will lose all your money"
# test_sentence = "Bitcoin is going up guys, we will be rich, nice"
# test_sentence = "omg bro this can't be real, we lost, price is very low"

test_sentence_token_words = preprocess_tokens(test_sentence)
test_sentence_embedding = textprep.post_to_embedding(test_sentence_token_words, word2vec)
test_sentence_tensor = torch.tensor(test_sentence_embedding, dtype=torch.float32).unsqueeze(0).to(device)

for model in models:
    print(f"\n--- {model.__class__.__name__} ---")
    model.eval()
    with torch.no_grad():
        out = model(test_sentence_tensor)
        pred = torch.argmax(torch.softmax(out, dim=1), dim=1).item()
    sentiment = class_dict_bert_sentiment[pred]
    print(f"Predicted Sentiment: {sentiment}")



--- W2V_MLP_1L_ReLU_MC ---
Predicted Sentiment: Very negative

--- W2V_MLP_3L_ReLU_MC ---
Predicted Sentiment: Very negative


# IV. Word2Vec + SVM

### IV.a. Hyperparameters tuning

In [258]:
pkl_dir = FILES_DIR / "pkl_files"
pkl_dir.mkdir(parents=True, exist_ok=True)
grid_path = pkl_dir / "svm_W2V_grid_search.pkl"

# Find the best hyperparameters for the W2V SVM model (load or compute)
if grid_path.exists():
    grid_search = joblib.load(grid_path)
else:
    svm_W2V = make_pipeline(StandardScaler(), SVC())
    param_grid = {
        "svc__kernel": ["linear", "rbf"],
        "svc__C": [0.1, 1, 10],
        "svc__gamma": [0.001, 0.01, 0.1, 1]
    }
    grid_search = GridSearchCV(svm_W2V, param_grid, cv=5, scoring='f1_macro')
    grid_search.fit(sentence_embeddings_train, labels_train)
    joblib.dump(grid_search, grid_path)

print(grid_search.best_params_, grid_search.best_score_)

{'svc__C': 10, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'} 0.5342595557194253


### IV.b. Model training

In [259]:
(FILES_DIR / "joblib_files").mkdir(parents=True, exist_ok=True)
model_path = FILES_DIR / "joblib_files" / "svm_W2V_model.joblib"

if model_path.exists():
    svm_W2V = load(model_path)
else:
    svm_W2V = make_pipeline(
        StandardScaler(),
        SVC(C=10, gamma=0.001, kernel='rbf', decision_function_shape='ovr')
    )
    svm_W2V.fit(sentence_embeddings_train, labels_train)
    dump(svm_W2V, model_path)

### IV.c. Model testing

In [260]:
# Load the saved SVM model
svm_W2V = load(FILES_DIR / "joblib_files" / "svm_W2V_model.joblib")

In [261]:
# TESTING ON TEST SET
y_pred = svm_W2V.predict(sentence_embeddings_test)
y_test_np = y_test.cpu().numpy()
report_dict_W2V = classification_report(y_test_np, y_pred, output_dict=True)
report_W2V_SVM = pd.DataFrame(report_dict_W2V).transpose()

print("Classification Report for SVM_W2V:")
evalkit.display_report(report_W2V_SVM, sort_col='f1-score')

Classification Report for SVM_W2V:


Unnamed: 0,precision,recall,f1-score,support
0,0.8463,0.8955,0.8702,2958.0
1,0.8386,0.8482,0.8434,2984.0
2,0.8254,0.8254,0.8254,0.8254
3,0.8218,0.8254,0.8216,7040.0
4,0.7635,0.676,0.7104,7040.0
5,0.6868,0.6502,0.668,506.0
6,0.7448,0.5476,0.6311,389.0
7,0.7008,0.4384,0.5394,203.0


In [262]:
''' TESTING ON INPUT SENTENCE '''
test_sentence = "This is very negative sentence, omg, I hate it"
# test_sentence = "The price is going down, you will lose all your money"
# test_sentence = "Bitcoin price is increasing, well we have it guys, great job"
# test_sentence = "omg bro this can't be real, we lost, price is very low"

# Tokenize, clean, and convert the test sentence to a numpy array
test_sentence_token_words = preprocess_tokens(test_sentence)
test_sentence_embedding = textprep.post_to_embedding(test_sentence_token_words, word2vec)
# Predict the class
predicted_class = svm_W2V.predict([test_sentence_embedding])[0]
print(f"Input: {test_sentence}")
print(f"Predicted Class: {class_dict_bert_sentiment[predicted_class]}\n")

Input: This is very negative sentence, omg, I hate it
Predicted Class: Very positive



# V. BERT + MLP

### V.a. Multi-class classification
- target variable: `BERT-Sentiment`
- input: vector dim (128,1); sentence vector representation, BertTokenizer

In [263]:
class BERT_MLP_1L_ReLU_MC_FT(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BERT_MLP_1L_ReLU_MC_FT, self).__init__()
        self.bert = bert_model.to(device)  # Pre-trained BERT model
        hidden_size = bert_model.config.hidden_size  # Typically 768 for BERT-base

        # Define a 1-layer feedforward neural network
        self.fc1 = nn.Linear(hidden_size, num_classes).to(device)  # Output layer only

        # Activation function only, no dropout or batch normalization
        self.activation = nn.ReLU().to(device)

    def forward(self, input_ids, attention_mask):
        # Fine-tuning BERT by allowing gradient updates
        outputs = self.bert(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        pooled_output = outputs.pooler_output  # [CLS] token representation

        # Pass through the single-layer MLP
        logits = self.fc1(pooled_output)  # Direct output without dropout

        return logits

In [264]:
class BERT_MLP_3L_ReLU_MC_FT(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BERT_MLP_3L_ReLU_MC_FT, self).__init__()
        self.bert = bert_model.to(device)  # Pre-trained BERT model
        hidden_size = bert_model.config.hidden_size  # Typically 768 for BERT-base

        # Define a 3-layer feedforward neural network
        self.fc1 = nn.Linear(hidden_size, 512).to(device)  # Layer 1
        self.fc2 = nn.Linear(512, 256).to(device)         # Layer 2
        self.fc3 = nn.Linear(256, num_classes).to(device)  # Output Layer

        # Activation function and dropout
        self.activation = nn.ReLU().to(device)
        self.dropout = nn.Dropout(0.3).to(device)  # Dropout for regularization

    def forward(self, input_ids, attention_mask):
        # Fine-tuning BERT by allowing gradient updates
        outputs = self.bert(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        pooled_output = outputs.pooler_output  # [CLS] token representation

        # Pass through the 3-layer MLP
        x = self.dropout(self.activation(self.fc1(pooled_output)))  # Layer 1
        x = self.dropout(self.activation(self.fc2(x)))              # Layer 2
        logits = self.fc3(x)                                        # Output Layer

        return logits

In [265]:
class BERT_MLP_3L_ReLU_MC_BN_FT(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BERT_MLP_3L_ReLU_MC_BN_FT, self).__init__()
        self.bert = bert_model.to(device)  # Pre-trained BERT model
        hidden_size = bert_model.config.hidden_size  # Typically 768 for BERT-base

        # Define a 3-layer feedforward neural network with BatchNorm
        self.fc1 = nn.Linear(hidden_size, 512).to(device)  # Layer 1
        self.bn1 = nn.BatchNorm1d(512).to(device)          # BatchNorm after Layer 1
        self.fc2 = nn.Linear(512, 256).to(device)          # Layer 2
        self.bn2 = nn.BatchNorm1d(256).to(device)          # BatchNorm after Layer 2
        self.fc3 = nn.Linear(256, num_classes).to(device)  # Output Layer

        # Activation function and dropout
        self.activation = nn.ReLU().to(device)
        self.dropout = nn.Dropout(0.3).to(device)  # Dropout for regularization

    def forward(self, input_ids, attention_mask):
        # Fine-tuning BERT by allowing gradient updates
        outputs = self.bert(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        pooled_output = outputs.pooler_output  # [CLS] token representation

        # Pass through the 3-layer MLP with BatchNorm
        x = self.dropout(self.activation(self.bn1(self.fc1(pooled_output))))  # Layer 1 with BatchNorm
        x = self.dropout(self.activation(self.bn2(self.fc2(x))))              # Layer 2 with BatchNorm
        logits = self.fc3(x)                                                 # Output Layer

        return logits


In [266]:
class BERT_MLP_5L_ReLU_MC_BN_FT(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BERT_MLP_5L_ReLU_MC_BN_FT, self).__init__()
        self.bert = bert_model.to(device)  # Pre-trained BERT model
        hidden_size = bert_model.config.hidden_size  # Typically 768 for BERT-base

        # Define a 5-layer feedforward neural network
        self.fc1 = nn.Linear(hidden_size, 512).to(device)  # Layer 1
        self.bn1 = nn.BatchNorm1d(512).to(device)
        self.fc2 = nn.Linear(512, 256).to(device)         # Layer 2
        self.bn2 = nn.BatchNorm1d(256).to(device)
        self.fc3 = nn.Linear(256, 128).to(device)         # Layer 3
        self.bn3 = nn.BatchNorm1d(128).to(device)
        self.fc4 = nn.Linear(128, 64).to(device)          # Layer 4
        self.bn4 = nn.BatchNorm1d(64).to(device)
        self.fc5 = nn.Linear(64, num_classes).to(device)  # Output Layer

        # Activation function
        self.activation = nn.ReLU().to(device)

    def forward(self, input_ids, attention_mask):
        # Fine-tuning BERT by allowing gradient updates
        outputs = self.bert(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        pooled_output = outputs.pooler_output  # [CLS] token representation

        # Pass through the 5-layer MLP with Batch Normalization
        x = self.activation(self.bn1(self.fc1(pooled_output)))  # Layer 1 with BN
        x = self.activation(self.bn2(self.fc2(x)))              # Layer 2 with BN
        x = self.activation(self.bn3(self.fc3(x)))              # Layer 3 with BN
        x = self.activation(self.bn4(self.fc4(x)))              # Layer 4 with BN
        logits = self.fc5(x)                                    # Output Layer (logits)

        return logits


In [267]:
class BERT_MLP_5L_ReLU_MC_DO30_FT(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BERT_MLP_5L_ReLU_MC_DO30_FT, self).__init__()
        self.bert = bert_model.to(device)  # Pre-trained BERT model
        hidden_size = bert_model.config.hidden_size  # Typically 768 for BERT-base

        # Define a 5-layer feedforward neural network
        self.fc1 = nn.Linear(hidden_size, 512).to(device)  # Layer 1
        self.fc2 = nn.Linear(512, 256).to(device)         # Layer 2
        self.fc3 = nn.Linear(256, 128).to(device)         # Layer 3
        self.fc4 = nn.Linear(128, 64).to(device)          # Layer 4
        self.fc5 = nn.Linear(64, num_classes).to(device)  # Output Layer

        # Activation function and dropout
        self.activation = nn.ReLU().to(device)
        self.dropout = nn.Dropout(0.3).to(device)  # Dropout for regularization

    def forward(self, input_ids, attention_mask):
        # Fine-tuning BERT by allowing gradient updates
        outputs = self.bert(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        pooled_output = outputs.pooler_output  # [CLS] token representation

        # Pass through the 5-layer MLP
        x = self.dropout(self.activation(self.fc1(pooled_output)))  # Layer 1
        x = self.dropout(self.activation(self.fc2(x)))              # Layer 2
        x = self.dropout(self.activation(self.fc3(x)))              # Layer 3
        x = self.dropout(self.activation(self.fc4(x)))              # Layer 4
        logits = self.fc5(x)                                        # Output Layer

        return logits


### V.b. Model training

In [268]:
# Define models
models = [
    # BERT_MLP_1L_ReLU_MC_FT(bert_model, num_classes=5).to(device),
    # BERT_MLP_3L_ReLU_MC_FT(bert_model, num_classes=5).to(device),
    # BERT_MLP_3L_ReLU_MC_BN_FT(bert_model, num_classes=5).to(device),
    # BERT_MLP_5L_ReLU_MC_BN_FT(bert_model, num_classes=5).to(device),
    # BERT_MLP_5L_ReLU_MC_DO30_FT(bert_model, num_classes=5).to(device),
]

for param in bert_model.parameters():
    param.requires_grad = True

# Initialize TensorBoard writer
(FILES_DIR / "pth_files").mkdir(parents=True, exist_ok=True)
(TSB_DIR / "BERT_FT").mkdir(parents=True, exist_ok=True)
writer = SummaryWriter(log_dir=str(TSB_DIR / "BERT_FT"))

# Define loss function
criterion = nn.CrossEntropyLoss().to(device)

# Unfreeze BERT parameters and set parameter groups
def get_optimizer(model):
    """
    Creates parameter groups so BERT layers get a smaller learning rate,
    while the classifier layers get a slightly higher learning rate.
    """
    for name, param in model.bert.named_parameters():
        param.requires_grad = True

    bert_params = []
    classifier_params = []
    for name, param in model.named_parameters():
        if "bert" in name:
            bert_params.append(param)
        else:
            classifier_params.append(param)

    optimizer = optim.AdamW(
        [
            {"params": bert_params, "lr": 2e-5, "weight_decay": 1e-5},
            {"params": classifier_params, "lr": 1e-4, "weight_decay": 1e-5},
        ]
    )
    return optimizer

# Validation function
def validate_model(model, dataloader, criterion, device):
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            val_loss += loss.item()
            _, predicted = torch.max(logits, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    avg_val_loss = val_loss / len(dataloader)
    val_accuracy = correct / total
    return avg_val_loss, val_accuracy

# Training loop with TensorBoard and early stopping
epochs = 5
patience = 2

for model in models:
    model_name = model.__class__.__name__
    print(f"Training {model_name}")
    optimizer = get_optimizer(model)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)
    best_val_loss = float('inf')
    no_improvement_count = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for input_ids, attention_mask, labels in tqdm(train_dataloader_bert, desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            nn_utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dataloader_bert)
        val_loss, val_accuracy = validate_model(model, val_dataloader_bert, criterion, device)
        scheduler.step(val_loss)

        print(f"Epoch [{epoch+1}/{epochs}] Train loss: {avg_train_loss:.4f}  Validation loss: {val_loss:.4f}  Validation accuracy: {val_accuracy:.4f}")
        writer.add_scalar(f"{model_name}/Train_Loss", avg_train_loss, epoch)
        writer.add_scalar(f"{model_name}/Val_Loss", val_loss, epoch)
        writer.add_scalar(f"{model_name}/Val_Accuracy", val_accuracy, epoch)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improvement_count = 0
            torch.save(model.state_dict(), FILES_DIR / "pth_files" / f"{model_name}_best_weights.pth")
        else:
            no_improvement_count += 1
            if no_improvement_count >= patience:
                print("Early stopping triggered.")
                break

    model.load_state_dict(torch.load(FILES_DIR / "pth_files" / f"{model_name}_best_weights.pth", map_location=device))
    test_loss, test_accuracy = validate_model(model, test_dataloader_bert, criterion, device)
    print(f"Test loss: {test_loss:.4f}, test accuracy: {test_accuracy:.4f}\n")
    torch.save(model.state_dict(), FILES_DIR / "pth_files" / f"{model_name}_final_weights.pth")
    print(f"Saved final weights for {model_name}.")

writer.close()


### V.c. Model testing

In [269]:
''' CHOOSE MODEL TO TEST '''
models = [
    # BERT_MLP_1L_ReLU_MC_FT(bert_model, num_classes=5).to(device),
    # BERT_MLP_3L_ReLU_MC_FT(bert_model, num_classes=5).to(device),
    # BERT_MLP_3L_ReLU_MC_BN_FT(bert_model, num_classes=5).to(device),
    # BERT_MLP_5L_ReLU_MC_BN_FT(bert_model, num_classes=5).to(device),
    # BERT_MLP_5L_ReLU_MC_DO30_FT(bert_model, num_classes=5).to(device),
]

# Load the weights
for model in models:
    _best = FILES_DIR / "pth_files" / f"{model.__class__.__name__}_best_weights.pth"
    _std  = FILES_DIR / "pth_files" / f"{model.__class__.__name__}_weights.pth"
    _path = _best if _best.exists() else _std
    model.load_state_dict(torch.load(_path, map_location=device, weights_only=False))
    model.eval()

In [270]:
''' TESTING ON VALIDATION AND TEST SETS '''

# Pack inputs for BERT models (expects dict with input_ids & attention_mask)
X_val = {"input_ids": X_bert_val["input_ids"], "attention_mask": X_bert_val["attention_mask"]}
X_test = {"input_ids": X_bert_test["input_ids"], "attention_mask": X_bert_test["attention_mask"]}

for model in models:
    print(f"\n--- {model.__class__.__name__} ---")
    evalkit.evaluate(model, X_val, y_val, dataset_name="Validation", device=device)
    evalkit.evaluate(model, X_test, y_test, dataset_name="Test", device=device)

In [271]:
''' TESTING ON INPUT SENTENCE'''
test_sentence = "This is very negative sentence, omg, I hate it"
# test_sentence = "The price is going down, you will lose all your money"
# test_sentence = "Bitcoin is going up guys, we will be rich, nice"
# test_sentence = "omg bro this can't be real, we lost, price is very low"

encoded_input = textprep.bert_tokenize_function(test_sentence, bert_tokenizer)
input_ids = encoded_input["input_ids"].to(device)
attention_mask = encoded_input["attention_mask"].to(device)

for model in models:
    print(f"\n--- {model.__class__.__name__} ---")
    model.eval()
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        pred = torch.argmax(torch.softmax(logits, dim=1), dim=1).item()
    sentiment = class_dict_bert_sentiment[pred]
    print(f"Predicted Sentiment: {sentiment}")


# V. BERT + SVM

In [272]:
# Load the pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

### V.a. Hyperparameters tuning

In [273]:
pkl_dir = FILES_DIR / "pkl_files"
pkl_dir.mkdir(parents=True, exist_ok=True)
grid_path = pkl_dir / "svm_BERT_grid_search.pkl"

# Find the best hyperparameters for the BERT SVM model (load or compute)
if grid_path.exists():
    grid_search_BERT = joblib.load(grid_path)
else:
    svm_BERT = make_pipeline(StandardScaler(), SVC())
    param_grid = {
        "svc__kernel": ["rbf"],
        "svc__C": [0.1, 1, 10],
        "svc__gamma": [0.001, 0.01, 0.1, 1],
    }
    grid_search_BERT = GridSearchCV(svm_BERT, param_grid, cv=5, scoring="f1_macro")
    grid_search_BERT.fit(X_bert_post_embeddings_train, y_train.cpu().numpy())
    joblib.dump(grid_search_BERT, grid_path)

print(grid_search_BERT.best_params_, grid_search_BERT.best_score_)

{'svc__C': 10, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'} 0.2034304271348117


### V.b. Model training

In [274]:
(FILES_DIR / "joblib_files").mkdir(parents=True, exist_ok=True)
model_path = FILES_DIR / "joblib_files" / "svm_BERT_model.joblib"

if model_path.exists():
    svm_BERT = load(model_path)
else:
    svm_BERT = make_pipeline(
        StandardScaler(),
        SVC(C=10, gamma=0.001, kernel='rbf', decision_function_shape='ovr')
    )
    svm_BERT.fit(X_bert_post_embeddings_train, y_train.cpu().numpy())
    dump(svm_BERT, model_path)

### V.c. Model testing

In [275]:
# Load the saved SVM model
svm_BERT = load(FILES_DIR / "joblib_files" / "svm_BERT_model.joblib")

In [276]:
# TESTING ON TEST SET
y_pred = svm_BERT.predict(X_bert_post_embeddings_test)
y_test_np = y_test.cpu().numpy()
report_dict_BERT = classification_report(y_test_np, y_pred, output_dict=True)
report_BERT_SVM = pd.DataFrame(report_dict_BERT).transpose()

print("Classification Report for SVM_BERT:")
evalkit.display_report(report_BERT_SVM, sort_col='f1-score')

Classification Report for SVM_BERT:


Unnamed: 0,precision,recall,f1-score,support
0,0.4563,0.5235,0.4876,2984.0
1,0.4513,0.5237,0.4848,2958.0
2,0.4433,0.4433,0.4433,0.4433
3,0.3914,0.4433,0.4128,7040.0
4,0.2108,0.2142,0.2027,7040.0
5,0.0778,0.0138,0.0235,506.0
6,0.04,0.0049,0.0088,203.0
7,0.0286,0.0051,0.0087,389.0


In [277]:
''' TESTING ON INPUT SENTENCE '''
test_sentence = "This is very negative sentence, omg, I hate it"
# test_sentence = "The price is going down, you will lose all your money"
# test_sentence = "Bitcoin price is increasing, well we have it guys, great job"
# test_sentence = "omg bro this can't be real, we lost, price is very low"

# Get BERT embeddings for the test sentence
test_sentence_token_words = textprep.bert_tokenize_function(test_sentence, bert_tokenizer)
test_sentence_embedding = textprep.get_bert_embedding(test_sentence_token_words, model=bert_model, device=device)

# Predict the class
predicted_class = svm_BERT.predict(test_sentence_embedding)[0]
print(f"Input: {test_sentence}")
print(f"Predicted Class: {class_dict_bert_sentiment[predicted_class]}\n")

Input: This is very negative sentence, omg, I hate it
Predicted Class: Very negative



# VI. Evaluation

### VI.a. Classification reports generation 

In [278]:
''' GENERATE CLASSIFICATION REPORTS FOR ALL W2V SVM MODELS '''
reports_dir = FILES_DIR / "reports" / "classification_reports_w2v_svm"
reports_dir.mkdir(parents=True, exist_ok=True)
grid_search = joblib.load(FILES_DIR / "pkl_files" / "svm_W2V_grid_search.pkl")

summary_csv = reports_dir / "W2V_SVM_summary_classification_report.csv"
if summary_csv.exists():
    summary_df = pd.read_csv(summary_csv)
else:
    summary_df = evalkit.generate_svm_reports(
        X_train=sentence_embeddings_train,
        y_train=y_train,
        X_test=sentence_embeddings_test,
        y_test=y_test,
        output_dir=reports_dir,
        prefix="W2V_SVM",
        grid_search=grid_search
    )

evalkit.display_report(summary_df, sort_col="Weighted Avg F1-Score")

Unnamed: 0,Model,Accuracy,Macro Avg F1-Score,Weighted Avg F1-Score
0,W2V_SVM_C_10_gamma_0.001_kernel_rbf.csv,0.7679,0.5677,0.7613
1,W2V_SVM_C_10_gamma_0.01_kernel_rbf.csv,0.7744,0.5551,0.7579
2,W2V_SVM_C_1_gamma_0.01_kernel_rbf.csv,0.7591,0.5183,0.7387
3,W2V_SVM_C_1_gamma_0.001_kernel_rbf.csv,0.7438,0.488,0.7247
4,W2V_SVM_C_0.1_gamma_1_kernel_linear.csv,0.7236,0.5152,0.7151
5,W2V_SVM_C_0.1_gamma_0.001_kernel_linear.csv,0.7236,0.5152,0.7151
6,W2V_SVM_C_0.1_gamma_0.1_kernel_linear.csv,0.7236,0.5152,0.7151
7,W2V_SVM_C_0.1_gamma_0.01_kernel_linear.csv,0.7236,0.5152,0.7151
8,W2V_SVM_C_1_gamma_0.001_kernel_linear.csv,0.7182,0.5058,0.7103
9,W2V_SVM_C_1_gamma_0.01_kernel_linear.csv,0.7182,0.5058,0.7103


In [279]:
''' GENERATE CLASSIFICATION REPORTS FOR ALL BERT SVM MODELS '''
reports_dir = FILES_DIR / "reports" / "classification_reports_bert_svm"
reports_dir.mkdir(parents=True, exist_ok=True)
grid_search = joblib.load(FILES_DIR / "pkl_files" / "svm_BERT_grid_search.pkl")

summary_csv = reports_dir / "BERT_SVM_summary_classification_report.csv"
if summary_csv.exists():
    summary_df = pd.read_csv(summary_csv)
else:
    summary_df = evalkit.generate_svm_reports(
        X_train=X_bert_post_embeddings_train,
        y_train=y_train,
        X_test=X_bert_post_embeddings_test,
        y_test=y_test,
        output_dir=reports_dir,
        prefix="BERT_SVM",
        grid_search=grid_search
    )

evalkit.display_report(summary_df, sort_col="Weighted Avg F1-Score")

Unnamed: 0,Model,Accuracy,Macro Avg F1-Score,Weighted Avg F1-Score
0,BERT_SVM_C_1_gamma_0.001_kernel_rbf.csv,0.4538,0.1968,0.4152
1,BERT_SVM_C_10_gamma_0.001_kernel_rbf.csv,0.4433,0.2027,0.4128
2,BERT_SVM_C_1_gamma_0.01_kernel_rbf.csv,0.4536,0.1953,0.4024
3,BERT_SVM_C_10_gamma_0.01_kernel_rbf.csv,0.4449,0.1976,0.4006
4,BERT_SVM_C_0.1_gamma_0.001_kernel_rbf.csv,0.455,0.1883,0.3976
5,BERT_SVM_C_10_gamma_0.1_kernel_rbf.csv,0.452,0.1924,0.3893
6,BERT_SVM_C_1_gamma_0.1_kernel_rbf.csv,0.4516,0.1911,0.3877
7,BERT_SVM_C_1_gamma_1_kernel_rbf.csv,0.4501,0.1889,0.3832
8,BERT_SVM_C_10_gamma_1_kernel_rbf.csv,0.4497,0.1889,0.3832
9,BERT_SVM_C_0.1_gamma_0.1_kernel_rbf.csv,0.4547,0.1656,0.35


In [280]:
''' GENERATE CLASSIFICATION REPORTS FOR ALL W2V MLP MODELS '''
models = [
    ("W2V_MLP_1L_ReLU_MC", W2V_MLP_1L_ReLU_MC().to(device)),
    ("W2V_MLP_3L_ReLU_MC", W2V_MLP_3L_ReLU_MC().to(device)),
    ("W2V_MLP_3L_ReLU_MC_DO30", W2V_MLP_3L_ReLU_MC_DO30().to(device)),
    ("W2V_MLP_5L_ReLU_MC", W2V_MLP_5L_ReLU_MC().to(device)),
    ("W2V_MLP_5L_ReLU_MC_BN2", W2V_MLP_5L_ReLU_MC_BN2().to(device)),
    ("W2V_MLP_5L_ReLU_MC_BN4", W2V_MLP_5L_ReLU_MC_BN4().to(device)),
]

reports_dir = FILES_DIR / "reports" / "classification_reports_w2v_mlp"
reports_dir.mkdir(parents=True, exist_ok=True)
summary_csv = reports_dir.parent / "W2V_MLP_summary_classification_report.csv"

if summary_csv.exists():
    summary_df = pd.read_csv(summary_csv)
else:
    summary_df = evalkit.generate_mlp_reports(
        models=models,
        dataloader=test_dataloader,
        device=device,
        output_dir=reports_dir,
        prefix="W2V_MLP",
        weight_dir=FILES_DIR / "pth_files"
    )

evalkit.display_report(summary_df, sort_col="Weighted Avg F1-Score")  

Unnamed: 0,Model,Accuracy,Macro Avg F1-Score,Weighted Avg F1-Score
0,W2V_MLP_5L_ReLU_MC_BN4,0.7747,0.5215,0.7579
1,W2V_MLP_5L_ReLU_MC_BN2,0.752,0.5044,0.7373
2,W2V_MLP_3L_ReLU_MC_DO30,0.7138,0.3097,0.6535
3,W2V_MLP_5L_ReLU_MC,0.706,0.3063,0.6463
4,W2V_MLP_3L_ReLU_MC,0.7048,0.3058,0.6452
5,W2V_MLP_1L_ReLU_MC,0.6798,0.2951,0.6226


In [281]:
''' GENERATE CLASSIFICATION REPORTS FOR ALL BERT MLP MODELS '''
models = [
    ("BERT_MLP_1L_ReLU_MC_FT",  BERT_MLP_1L_ReLU_MC_FT(bert_model, num_classes=5).to(device)),
    ("BERT_MLP_3L_ReLU_MC_FT",  BERT_MLP_3L_ReLU_MC_FT(bert_model, num_classes=5).to(device)),
    ("BERT_MLP_3L_ReLU_MC_BN_FT", BERT_MLP_3L_ReLU_MC_BN_FT(bert_model, num_classes=5).to(device)),
    ("BERT_MLP_5L_ReLU_MC_BN_FT", BERT_MLP_5L_ReLU_MC_BN_FT(bert_model, num_classes=5).to(device)),
]

reports_dir = FILES_DIR / "reports" / "classification_reports_bert_mlp"
reports_dir.mkdir(parents=True, exist_ok=True)
summary_csv = reports_dir / "BERT_MLP_summary_classification_report.csv"

if summary_csv.exists():
    summary_df = pd.read_csv(summary_csv)
else:
    summary_df = evalkit.generate_mlp_reports(
        models=models,
        dataloader=test_dataloader_bert,
        device=device,
        output_dir=reports_dir,
        prefix="BERT_MLP",
        weight_dir=FILES_DIR / "pth_files"
    )

evalkit.display_report(summary_df, sort_col="Weighted Avg F1-Score")  

Unnamed: 0,Model,Accuracy,Macro Avg F1-Score,Weighted Avg F1-Score
0,BERT_MLP_3L_ReLU_MC_FT,0.8513,0.7324,0.8498
1,BERT_MLP_5L_ReLU_MC_BN_FT,0.8528,0.7268,0.8494
2,BERT_MLP_3L_ReLU_MC_BN_FT,0.8322,0.7193,0.8304
3,BERT_MLP_1L_ReLU_MC_FT,0.822,0.6279,0.8102


### VI.b. Architectures comparison

In [283]:
# Build & display the table using your FILES_DIR
comparison_df = evalkit.build_best_models_comparison(FILES_DIR, headers="simplified")
evalkit.display_comparison(comparison_df, center_headers=True)

# Save the comparison DataFrame to CSV
(comparison_df.to_csv(FILES_DIR / "reports" / "best_models_per_class_comparison.csv"))

Unnamed: 0_level_0,Support,W2V_SVM,W2V_SVM,W2V_SVM,W2V_MLP,W2V_MLP,W2V_MLP,BERT_SVM,BERT_SVM,BERT_SVM,BERT_MLP,BERT_MLP,BERT_MLP
Unnamed: 0_level_1,Unnamed: 1_level_1,Precision,Recall,F1-Score,Precision,Recall,F1-Score,Precision,Recall,F1-Score,Precision,Recall,F1-Score
0,2984.0,0.8,0.83,0.81,0.81,0.82,0.82,0.45,0.56,0.5,0.85,0.9,0.88
1,506.0,0.48,0.44,0.46,0.51,0.53,0.52,0.0,0.0,0.0,0.67,0.73,0.7
2,203.0,0.32,0.2,0.24,0.0,0.0,0.0,0.0,0.0,0.0,0.65,0.45,0.53
3,389.0,0.57,0.43,0.49,0.65,0.33,0.44,0.0,0.0,0.0,0.71,0.62,0.66
4,2958.0,0.82,0.85,0.83,0.79,0.88,0.83,0.45,0.51,0.48,0.91,0.88,0.89
Average,,0.6,0.55,0.57,0.55,0.51,0.52,0.18,0.22,0.2,0.76,0.72,0.73
Weighted Average,,0.76,0.77,0.76,0.75,0.77,0.76,0.38,0.45,0.42,0.85,0.85,0.85
