# Model Training & Hyperparameter Tuning

This notebook focuses on training different models for sentiment classification:

1. Baseline Models:
   - Logistic Regression
   - Support Vector Machine (SVM)
   - Random Forest

2. Deep Learning Models:
   - BiLSTM
   - BERT fine-tuning

3. Hyperparameter Tuning:
   - Grid Search for baseline models
   - Optuna for deep learning models

We will train each model with different feature representations and save the best models.

## Setup and Imports

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.2.1


In [None]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

# Machine learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_auc_score
import scipy.sparse as sp

# Deep learning libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW

# Hyperparameter optimization
import optuna

# Visualization settings
plt.style.use('ggplot')
sns.set(style='whitegrid')
%matplotlib inline

## Load Features and Prepare Data

In [None]:
# Mount Google Drive so the project folders below are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Project folders on Drive: cleaned CSVs, pre-computed feature matrices,
# saved models, and pickled result summaries.
data_dir = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data'
features_dir = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features'
models_dir = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/models'
results_dir = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/results'

# Load pre-split datasets (train / validation / test CSVs under data_dir)
train_df = pd.read_csv(os.path.join(data_dir, 'twitter_training_clean.csv'))
val_df = pd.read_csv(os.path.join(data_dir, 'twitter_validation_clean.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'twitter_testing_clean.csv'))

# Extract labels: the 'sentiment' column of each split as a NumPy array
y_train = train_df['sentiment'].values
y_val = val_df['sentiment'].values
y_test = test_df['sentiment'].values

# Define a function to load features
def load_features(feature_type, feature_dir):
  """Load the train/val/test feature matrices for one feature representation.

  Files are expected under ``feature_dir`` with the names
  ``<feature_type>_features_<split>.<ext>``. 'bow' and 'tfidf' are read as
  SciPy sparse ``.npz`` files; every other feature type is read as a NumPy
  ``.npy`` file.

  Args:
    feature_type: name of the representation (e.g. 'bow', 'tfidf', 'glove').
    feature_dir: directory that contains the saved feature files.

  Returns:
    A 3-tuple ``(train, val, test)`` of loaded matrices.
  """
  stored_sparse = feature_type in ('bow', 'tfidf')
  reader, extension = (sp.load_npz, 'npz') if stored_sparse else (np.load, 'npy')
  return tuple(
    reader(os.path.join(feature_dir, f'{feature_type}_features_{split}.{extension}'))
    for split in ('train', 'val', 'test')
  )

# Load the train/val/test matrices for each of the five feature representations
X_bow_train, X_bow_val, X_bow_test = load_features('bow', features_dir)
X_tfidf_train, X_tfidf_val, X_tfidf_test = load_features('tfidf', features_dir)
X_word2vec_train, X_word2vec_val, X_word2vec_test = load_features('word2vec', features_dir)
X_glove_train, X_glove_val, X_glove_test = load_features('glove', features_dir)
X_bert_train, X_bert_val, X_bert_test = load_features('bert', features_dir)
print("Features loaded successfully.")


# Load the pickled label encoder from models_dir; the PyTorch dataset classes in
# this notebook call its transform() to turn the sentiment labels into integer ids.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open(os.path.join(models_dir, 'label_encoder.pkl'), 'rb') as f:
    label_encoder = pickle.load(f)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Features loaded successfully.


## Baseline Models
For the baseline models, we compare all five feature representations (BoW, TF-IDF, Word2Vec, GloVe, and BERT embeddings) to understand how they differ. We also use grid search (`GridSearchCV`) to try out different sets of hyperparameters.

In [None]:
# Train and evaluate a model
def train_evaluate_model(model, X_train, X_val, y_train, y_val, model_name, feature_name):
    """Fit ``model`` on the training split and score it on the validation split.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        X_train, y_train: training features and labels passed to ``fit``.
        X_val, y_val: validation features and labels used for scoring.
        model_name: display name of the model, used in the log line and result.
        feature_name: display name of the feature set, used likewise.

    Returns:
        dict with the fitted ``'model'``, its validation ``'accuracy'`` and
        weighted ``'f1_score'``, plus ``'model_name'`` and ``'feature_name'``.
    """
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)

    acc = accuracy_score(y_val, val_predictions)
    weighted_f1 = f1_score(y_val, val_predictions, average='weighted')

    print(f"{model_name} with {feature_name} - Accuracy: {acc:.4f}, F1 Score: {weighted_f1:.4f}")

    summary = dict(model=model, accuracy=acc, f1_score=weighted_f1)
    summary['model_name'] = model_name
    summary['feature_name'] = feature_name
    return summary

# Function to run GridSearchCV
def grid_search_model(model, param_grid, X_train, y_train, model_name):
    """Run a 5-fold stratified grid search and return the best estimator.

    Candidates are ranked by weighted F1 (``scoring='f1_weighted'``); the
    folds are shuffled with a fixed seed and all cores are used.

    Args:
        model: unfitted estimator to tune.
        param_grid: parameter grid handed to ``GridSearchCV``.
        X_train, y_train: data the search is fitted on.
        model_name: label used in the printed summary.

    Returns:
        ``best_estimator_`` of the fitted search.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=folds,
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    # Report the winning configuration and its cross-validated score
    print(f"Best parameters for {model_name}: {search.best_params_}")
    print(f"Best cross-validation score: {search.best_score_:.4f}")

    return search.best_estimator_

### Logistic Regression

In [None]:
# List of result dicts, one per feature representation
lr_result = []

# Logistic Regression parameters kept from an earlier grid search:
# - a larger C weakens regularization, which reduced the convergence problems
# - liblinear performed better than saga
# - a large max_iter is used to further limit convergence problems
lr_param_grid = {
    'C': [100],
    'solver': ['liblinear'],
    'max_iter': [2000]
}

# For BoW features
print("Training Logistic Regression with BoW features...")
lr_bow = grid_search_model(LogisticRegression(), lr_param_grid, X_bow_train, y_train, "Logistic Regression (BoW)")
lr_bow_result = train_evaluate_model(lr_bow, X_bow_train, X_bow_val, y_train, y_val, "Logistic Regression", "BoW")
lr_result.append(lr_bow_result)

# For TF-IDF features
print("\nTraining Logistic Regression with TF-IDF features...")
lr_tfidf = grid_search_model(LogisticRegression(), lr_param_grid, X_tfidf_train, y_train, "Logistic Regression (TF-IDF)")
lr_tfidf_result = train_evaluate_model(lr_tfidf, X_tfidf_train, X_tfidf_val, y_train, y_val, "Logistic Regression", "TF-IDF")
lr_result.append(lr_tfidf_result)

# For Word2Vec features
print("\nTraining Logistic Regression with Word2Vec features...")
lr_w2v = grid_search_model(LogisticRegression(), lr_param_grid, X_word2vec_train, y_train, "Logistic Regression (Word2Vec)")
lr_w2v_result = train_evaluate_model(lr_w2v, X_word2vec_train, X_word2vec_val, y_train, y_val, "Logistic Regression", "Word2Vec")
lr_result.append(lr_w2v_result)

# For GloVe features
print("\nTraining Logistic Regression with GloVe features...")
lr_glove = grid_search_model(LogisticRegression(), lr_param_grid, X_glove_train, y_train, "Logistic Regression (GloVe)")
lr_glove_result = train_evaluate_model(lr_glove, X_glove_train, X_glove_val, y_train, y_val, "Logistic Regression", "GloVe")
lr_result.append(lr_glove_result)

# For BERT features
print("\nTraining Logistic Regression with BERT features...")
lr_bert = grid_search_model(LogisticRegression(), lr_param_grid, X_bert_train, y_train, "Logistic Regression (BERT)")
lr_bert_result = train_evaluate_model(lr_bert, X_bert_train, X_bert_val, y_train, y_val, "Logistic Regression", "BERT")
lr_result.append(lr_bert_result)

# Save the results to a file
lr_results_file = os.path.join(results_dir, 'logistic_regression_results.pkl')
with open(lr_results_file, 'wb') as f:
    pickle.dump(lr_result, f)
print(f"Logistic Regression results saved to {lr_results_file}")

# Find and persist the best-performing Logistic Regression model.
# This must read lr_result: svm_result only exists after the SVM cell has run,
# so referencing it here fails on a fresh kernel and would mislabel the file.
best_lr_model = max(lr_result, key=lambda x: x['f1_score'])
best_lr_model_file = os.path.join(models_dir, f"best_lr_{best_lr_model['feature_name'].lower()}.pkl")
with open(best_lr_model_file, 'wb') as f:
    pickle.dump(best_lr_model['model'], f)
print(f"Best Logistic Regression model saved to {best_lr_model_file}")

Training Logistic Regression with BoW features...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters for Logistic Regression (BoW): {'C': 100, 'max_iter': 2000, 'solver': 'liblinear'}
Best cross-validation score: 0.7674
Logistic Regression with BoW - Accuracy: 0.7573, F1 Score: 0.7574

Training Logistic Regression with TF-IDF features...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters for Logistic Regression (TF-IDF): {'C': 100, 'max_iter': 2000, 'solver': 'liblinear'}
Best cross-validation score: 0.7641
Logistic Regression with TF-IDF - Accuracy: 0.7554, F1 Score: 0.7556

Training Logistic Regression with Word2Vec features...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters for Logistic Regression (Word2Vec): {'C': 100, 'max_iter': 2000, 'solver': 'liblinear'}
Best cross-validation score: 0.4074
Logistic Regression with Word2Vec - Accuracy: 0.4119, F1 Score: 0.4132

Training Logistic Regression with GloVe fea

### Support Vector Machine (SVM)

In [None]:
# Accumulates one result dict per feature representation
svm_result = []

# SVM hyperparameters kept from an earlier grid search (single-point grid)
svm_param_grid = dict(C=[1], kernel=['linear'], gamma=['scale'])


def _fit_svm(feature_label, X_tr, X_va):
    """Grid-search an SVC on one feature set, score it on validation, and record the result."""
    estimator = grid_search_model(SVC(), svm_param_grid, X_tr, y_train, f"SVM ({feature_label})")
    outcome = train_evaluate_model(estimator, X_tr, X_va, y_train, y_val, "SVM", feature_label)
    svm_result.append(outcome)
    return estimator, outcome


# BoW
print("Training SVM with BoW features...")
svm_bow, svm_bow_result = _fit_svm("BoW", X_bow_train, X_bow_val)

# TF-IDF
print("\nTraining SVM with TF-IDF features...")
svm_tfidf, svm_tfidf_result = _fit_svm("TF-IDF", X_tfidf_train, X_tfidf_val)

# Word2Vec
print("\nTraining SVM with Word2Vec features...")
svm_w2v, svm_w2v_result = _fit_svm("Word2Vec", X_word2vec_train, X_word2vec_val)

# GloVe
print("\nTraining SVM with GloVe features...")
svm_glove, svm_glove_result = _fit_svm("GloVe", X_glove_train, X_glove_val)

# BERT
print("\nTraining SVM with BERT features...")
svm_bert, svm_bert_result = _fit_svm("BERT", X_bert_train, X_bert_val)


# Persist every SVM result dict
svm_results_file = os.path.join(results_dir, 'svm_results.pkl')
with open(svm_results_file, 'wb') as f:
    pickle.dump(svm_result, f)
print(f"SVM results saved to {svm_results_file}")

# Pick the entry with the highest validation F1 and pickle its fitted model
best_svm_model = max(svm_result, key=lambda entry: entry['f1_score'])
best_svm_model_file = os.path.join(
    models_dir,
    f"best_svm_{best_svm_model['feature_name'].lower()}.pkl",
)
with open(best_svm_model_file, 'wb') as f:
    pickle.dump(best_svm_model['model'], f)
print(f"Best SVM model saved to {best_svm_model_file}")

### Random Forest

In [None]:
# List of result dicts, one per feature representation
rf_result = []

# RF parameters kept from an earlier grid search
best_params = {
    'n_estimators': 200,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1
}


def _new_random_forest():
    """Return a fresh, unfitted RandomForestClassifier built from best_params (fixed seed, all cores)."""
    return RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)


# BoW features
print("Training Random Forest with BoW features...")
rf_model_bow = _new_random_forest()
rf_bow_result = train_evaluate_model(
    model=rf_model_bow,
    X_train=X_bow_train,
    X_val=X_bow_val,
    y_train=y_train,
    y_val=y_val,
    model_name="Random Forest",
    feature_name="BoW"
)
rf_result.append(rf_bow_result)


# TF-IDF features
print("\nTraining Random Forest with TF-IDF features...")
rf_model_tfidf = _new_random_forest()
rf_tfidf_result = train_evaluate_model(
    model=rf_model_tfidf,
    X_train=X_tfidf_train,
    X_val=X_tfidf_val,
    y_train=y_train,
    y_val=y_val,
    model_name="Random Forest",
    feature_name="TF-IDF"
)
rf_result.append(rf_tfidf_result)


# Word2Vec features
print("\nTraining Random Forest with Word2Vec features...")
rf_model_w2v = _new_random_forest()
rf_w2v_result = train_evaluate_model(
    model=rf_model_w2v,
    X_train=X_word2vec_train,
    X_val=X_word2vec_val,
    y_train=y_train,
    y_val=y_val,
    model_name="Random Forest",
    feature_name="Word2Vec"
)
rf_result.append(rf_w2v_result)


# GloVe features
print("\nTraining Random Forest with GloVe features...")
rf_model_glove = _new_random_forest()
rf_glove_result = train_evaluate_model(
    model=rf_model_glove,
    X_train=X_glove_train,
    X_val=X_glove_val,
    y_train=y_train,
    y_val=y_val,
    model_name="Random Forest",
    feature_name="GloVe"
)
rf_result.append(rf_glove_result)


# BERT features
print("\nTraining Random Forest with BERT features...")
rf_model_bert = _new_random_forest()
rf_bert_result = train_evaluate_model(
    model=rf_model_bert,
    X_train=X_bert_train,
    X_val=X_bert_val,
    y_train=y_train,
    y_val=y_val,
    model_name="Random Forest",
    feature_name="BERT"
)
rf_result.append(rf_bert_result)

print("\nFinished training Random Forest models for all feature types.")

# Save the results to a file
rf_results_file = os.path.join(results_dir, 'rf_results.pkl')
with open(rf_results_file, 'wb') as f:
    pickle.dump(rf_result, f)
print(f"Random Forest results saved to {rf_results_file}")

# Find the best performing model (highest validation F1) and pickle it
best_rf_model = max(rf_result, key=lambda x: x['f1_score'])
best_rf_model_file = os.path.join(models_dir, f"best_rf_{best_rf_model['feature_name'].lower()}.pkl")
with open(best_rf_model_file, 'wb') as f:
    pickle.dump(best_rf_model['model'], f)
print(f"Best Random Forest model saved to {best_rf_model_file}")

## Deep Learning Models

### BiLSTM Model

In [None]:
# Define BiLSTM model
class BiLSTMClassifier(nn.Module):
    """Bidirectional LSTM followed by a linear classification head.

    Each input row is treated as a sequence of length one: ``forward`` inserts
    a sequence axis at position 1 before calling the LSTM, then feeds the
    concatenated final forward/backward hidden states to the linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=True,
            dropout=dropout,
            batch_first=True,
        )
        # Two directions are concatenated, so the head sees 2 * hidden_dim features.
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Return class logits for a batch of ``[batch size, input dim]`` vectors."""
        # Add the sequence-length axis the LSTM expects: [batch size, 1, input dim]
        sequence = torch.unsqueeze(text, 1)

        _, (final_hidden, _) = self.lstm(sequence)

        # The last two entries of the hidden stack are the top layer's
        # forward and backward states.
        top_forward = final_hidden[-2]
        top_backward = final_hidden[-1]
        features = torch.cat([top_forward, top_backward], dim=1)

        return self.fc(features)

# Dataset class for PyTorch
class EmbeddingDataset(Dataset):
    """Torch dataset pairing pre-computed embedding vectors with encoded labels."""

    def __init__(self, embeddings, labels):
        """Store embeddings as a float tensor and labels as integer class ids.

        Labels are mapped to integers with the notebook-level ``label_encoder``.
        """
        encoded = label_encoder.transform(labels)
        self.embeddings = torch.FloatTensor(embeddings)
        self.labels = torch.LongTensor(encoded)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` pair at position ``idx``."""
        vector = self.embeddings[idx]
        target = self.labels[idx]
        return vector, target

# Function to train BiLSTM
def train_bilstm(X_train, X_val, y_train, y_val, embedding_type, n_epochs=10,
                 hidden_dim=128, n_layers=2, dropout=0.5, learning_rate=0.001, batch_size=64):
    """Train a BiLSTMClassifier on pre-computed embeddings and score it on validation.

    The weights of the epoch with the lowest validation loss are restored
    before the final evaluation; the resulting state dict is saved to
    ``models_dir`` as ``bilstm_<embedding_type lower-cased>.pt``.

    Args:
        X_train, X_val: 2-D embedding arrays; ``X_train.shape[1]`` is the LSTM input size.
        y_train, y_val: labels understood by the notebook-level ``label_encoder``.
        embedding_type: name used in log lines, the saved file name and the result dict.
        n_epochs: number of training epochs.
        hidden_dim, n_layers, dropout, learning_rate, batch_size: model and
            optimizer settings; the defaults are the values that used to be
            hard-coded in this function.

    Returns:
        dict with keys ``'model'``, ``'accuracy'``, ``'f1_score'``,
        ``'model_name'`` (always ``'BiLSTM'``) and ``'feature_name'``.
    """
    # Check if CUDA is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Create datasets and data loaders
    train_dataset = EmbeddingDataset(X_train, y_train)
    val_dataset = EmbeddingDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Create model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y_train))

    model = BiLSTMClassifier(input_dim, hidden_dim, output_dim, n_layers, dropout)
    model = model.to(device)

    # Define optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_val_loss = float('inf')
    best_state = None

    for epoch in range(n_epochs):
        # Training
        model.train()
        train_loss_sum, train_correct, train_seen = 0.0, 0, 0

        for embeddings, labels in train_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)

            optimizer.zero_grad()
            predictions = model(embeddings)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

            # Weight by batch size so a short final batch does not skew the epoch averages.
            n_batch = labels.size(0)
            train_loss_sum += loss.item() * n_batch
            _, predicted = torch.max(predictions, 1)
            train_correct += (predicted == labels).sum().item()
            train_seen += n_batch

        train_loss = train_loss_sum / max(train_seen, 1)
        train_acc = train_correct / max(train_seen, 1)

        # Validation
        model.eval()
        val_loss_sum, val_correct, val_seen = 0.0, 0, 0

        with torch.no_grad():
            for embeddings, labels in val_loader:
                embeddings, labels = embeddings.to(device), labels.to(device)
                predictions = model(embeddings)
                loss = criterion(predictions, labels)

                n_batch = labels.size(0)
                val_loss_sum += loss.item() * n_batch
                _, predicted = torch.max(predictions, 1)
                val_correct += (predicted == labels).sum().item()
                val_seen += n_batch

        val_loss = val_loss_sum / max(val_seen, 1)
        val_acc = val_correct / max(val_seen, 1)

        print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        # Snapshot the best weights. state_dict() hands back tensors that share
        # storage with the live parameters, so a plain dict .copy() would keep
        # changing with further training; clone every tensor instead.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    # Restore the best weights (absent if no epoch ran or the loss never improved, e.g. NaN)
    if best_state is not None:
        model.load_state_dict(best_state)

    # Evaluate on validation set
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for embeddings, labels in val_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            predictions = model(embeddings)
            _, predicted = torch.max(predictions, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"BiLSTM with {embedding_type} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

    # Save model
    torch.save(model.state_dict(), os.path.join(models_dir, f'bilstm_{embedding_type.lower()}.pt'))

    return {
        'model': model,
        'accuracy': accuracy,
        'f1_score': f1,
        'model_name': 'BiLSTM',
        'feature_name': embedding_type
    }

In [None]:
# BiLSTM on Word2Vec embeddings
print("Training BiLSTM with Word2Vec embeddings...")
bilstm_w2v_result = train_bilstm(
    X_train=X_word2vec_train,
    X_val=X_word2vec_val,
    y_train=y_train,
    y_val=y_val,
    embedding_type="Word2Vec",
)

# BiLSTM on GloVe embeddings
print("\nTraining BiLSTM with GloVe embeddings...")
bilstm_glove_result = train_bilstm(
    X_train=X_glove_train,
    X_val=X_glove_val,
    y_train=y_train,
    y_val=y_val,
    embedding_type="GloVe",
)

# BiLSTM on BERT embeddings
print("\nTraining BiLSTM with BERT embeddings...")
bilstm_bert_result = train_bilstm(
    X_train=X_bert_train,
    X_val=X_bert_val,
    y_train=y_train,
    y_val=y_val,
    embedding_type="BERT",
)

### Hyperparameter Tuning with Optuna for BiLSTM (BERT Embeddings only)

In [None]:
# Hyperparameter tuning for BiLSTM using Optuna
def objective(trial, X_train, X_val, y_train, y_val):
    """Optuna objective: train one BiLSTM configuration and return its validation F1.

    Samples hidden size, layer count, dropout (only when more than one layer),
    learning rate and batch size from ``trial``, trains for at most 10 epochs
    with early stopping on validation loss (patience 3), reports the weighted
    F1 after every epoch and honours Optuna's pruning signal.

    Returns:
        Weighted validation F1 of the last epoch that was run.

    Raises:
        optuna.exceptions.TrialPruned: when the pruner asks to stop the trial.
    """
    # Sample the search space
    hidden_dim = trial.suggest_int('hidden_dim', 64, 256, step=64)
    n_layers = trial.suggest_int('n_layers', 1, 3)
    if n_layers > 1:
        dropout = trial.suggest_float('dropout', 0.1, 0.5, step=0.1)
    else:
        dropout = 0.0  # dropout is not sampled for a single-layer LSTM
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Data pipelines
    train_dataset = EmbeddingDataset(X_train, y_train)
    val_dataset = EmbeddingDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Model, optimizer, loss
    n_features = X_train.shape[1]
    n_classes = len(np.unique(y_train))
    model = BiLSTMClassifier(n_features, hidden_dim, n_classes, n_layers, dropout).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Early-stopping bookkeeping
    patience = 3
    lowest_val_loss = float('inf')
    stale_epochs = 0

    max_epochs = 10  # Max epochs for tuning

    for epoch in range(max_epochs):
        # --- training pass ---
        model.train()
        epoch_train_loss = 0  # accumulated but not reported

        for embeddings, labels in train_loader:
            embeddings = embeddings.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            loss = criterion(model(embeddings), labels)
            loss.backward()
            optimizer.step()

            epoch_train_loss += loss.item()

        # --- validation pass ---
        model.eval()
        val_loss_total = 0
        val_preds, val_targets = [], []

        with torch.no_grad():
            for embeddings, labels in val_loader:
                embeddings = embeddings.to(device)
                labels = labels.to(device)
                logits = model(embeddings)

                val_loss_total += criterion(logits, labels).item()
                val_preds.extend(torch.max(logits, 1)[1].cpu().numpy())
                val_targets.extend(labels.cpu().numpy())

        val_loss = val_loss_total / len(val_loader)
        f1 = f1_score(val_targets, val_preds, average='weighted')

        # Let Optuna see the intermediate score
        trial.report(f1, epoch)

        # Early stopping on validation loss
        if val_loss < lowest_val_loss:
            lowest_val_loss = val_loss
            stale_epochs = 0
        else:
            stale_epochs += 1
        if stale_epochs >= patience:
            break

        # Pruning is only checked when early stopping did not trigger
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return f1

# Run an Optuna search over the BiLSTM hyperparameters for a given embedding type
def optimize_bilstm(X_train, X_val, y_train, y_val, embedding_type, n_trials=20):
    """Tune the BiLSTM with Optuna and return the best hyperparameters.

    Creates a maximizing study with a median pruner, runs ``n_trials`` trials
    of ``objective`` on the given data, and prints the best trial.

    Args:
        X_train, X_val, y_train, y_val: data forwarded unchanged to ``objective``.
        embedding_type: name used only in the printed messages.
        n_trials: number of Optuna trials to run.

    Returns:
        ``study.best_params`` (dict of parameter name to value).
    """
    print(f"\nOptimizing BiLSTM with {embedding_type} embeddings...")

    study = optuna.create_study(
        direction='maximize',
        pruner=optuna.pruners.MedianPruner(),
    )

    def _run_trial(trial):
        # Bind the datasets so Optuna only has to supply the trial object.
        return objective(trial, X_train, X_val, y_train, y_val)

    study.optimize(_run_trial, n_trials=n_trials)

    # Summarize the winning trial
    winner = study.best_trial
    print(f"\nBest trial for BiLSTM with {embedding_type}:")
    print(f"  F1 Score: {winner.value:.4f}")
    print("  Params:")
    chosen = winner.params
    for name in chosen:
        print(f"    {name}: {chosen[name]}")

    return study.best_params

# Tune the BiLSTM on the BERT embeddings (10 Optuna trials)
best_params_bert = optimize_bilstm(
    X_train=X_bert_train,
    X_val=X_bert_val,
    y_train=y_train,
    y_val=y_val,
    embedding_type="BERT",
    n_trials=10,
)

In [None]:
# Function to train BiLSTM with optimized hyperparameters
def train_bilstm_optimized(X_train, X_val, y_train, y_val, embedding_type, params, n_epochs=15):
    """Train a BiLSTMClassifier with tuned hyperparameters and score it on validation.

    The weights of the epoch with the lowest validation loss are restored
    before the final evaluation; the resulting state dict is saved to
    ``models_dir`` as ``bilstm_<embedding_type lower-cased>_optimized.pt``.

    Args:
        X_train, X_val: 2-D embedding arrays; ``X_train.shape[1]`` is the LSTM input size.
        y_train, y_val: labels understood by the notebook-level ``label_encoder``.
        embedding_type: name used in log lines, the saved file name and the result dict.
        params: dict with ``'hidden_dim'``, ``'n_layers'``, ``'learning_rate'``,
            ``'batch_size'`` and, when ``n_layers > 1``, ``'dropout'``
            (treated as 0.0 if absent).
        n_epochs: number of training epochs.

    Returns:
        dict with keys ``'model'``, ``'accuracy'``, ``'f1_score'``,
        ``'model_name'`` (always ``'BiLSTM (Optimized)'``) and ``'feature_name'``.
    """
    # Check if CUDA is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Extract parameters
    hidden_dim = params['hidden_dim']
    n_layers = params['n_layers']
    # Dropout between LSTM layers only applies when there is more than one layer.
    dropout = params.get('dropout', 0.0) if n_layers > 1 else 0.0
    learning_rate = params['learning_rate']
    batch_size = params['batch_size']

    # Create datasets and data loaders
    train_dataset = EmbeddingDataset(X_train, y_train)
    val_dataset = EmbeddingDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Create model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y_train))

    model = BiLSTMClassifier(input_dim, hidden_dim, output_dim, n_layers, dropout)
    model = model.to(device)

    # Define optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_val_loss = float('inf')
    best_state = None

    for epoch in range(n_epochs):
        # Training
        model.train()
        train_loss_sum, train_correct, train_seen = 0.0, 0, 0

        for embeddings, labels in train_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)

            optimizer.zero_grad()
            predictions = model(embeddings)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

            # Weight by batch size so a short final batch does not skew the epoch averages.
            n_batch = labels.size(0)
            train_loss_sum += loss.item() * n_batch
            _, predicted = torch.max(predictions, 1)
            train_correct += (predicted == labels).sum().item()
            train_seen += n_batch

        train_loss = train_loss_sum / max(train_seen, 1)
        train_acc = train_correct / max(train_seen, 1)

        # Validation
        model.eval()
        val_loss_sum, val_correct, val_seen = 0.0, 0, 0

        with torch.no_grad():
            for embeddings, labels in val_loader:
                embeddings, labels = embeddings.to(device), labels.to(device)
                predictions = model(embeddings)
                loss = criterion(predictions, labels)

                n_batch = labels.size(0)
                val_loss_sum += loss.item() * n_batch
                _, predicted = torch.max(predictions, 1)
                val_correct += (predicted == labels).sum().item()
                val_seen += n_batch

        val_loss = val_loss_sum / max(val_seen, 1)
        val_acc = val_correct / max(val_seen, 1)

        print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        # Snapshot the best weights. state_dict() hands back tensors that share
        # storage with the live parameters, so a plain dict .copy() would keep
        # changing with further training; clone every tensor instead.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    # Restore the best weights (absent if no epoch ran or the loss never improved, e.g. NaN)
    if best_state is not None:
        model.load_state_dict(best_state)

    # Evaluate on validation set
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for embeddings, labels in val_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            predictions = model(embeddings)
            _, predicted = torch.max(predictions, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Optimized BiLSTM with {embedding_type} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

    # Save model
    torch.save(model.state_dict(), os.path.join(models_dir, f'bilstm_{embedding_type.lower()}_optimized.pt'))

    return {
        'model': model,
        'accuracy': accuracy,
        'f1_score': f1,
        'model_name': 'BiLSTM (Optimized)',
        'feature_name': embedding_type
    }

In [None]:
# Result dicts for the tuned BiLSTM runs
bilstm_result = []

# Hyperparameters reported as best by the Optuna search, fixed here
best_params_bert = dict(
    hidden_dim=256,
    n_layers=1,
    learning_rate=0.0012202088236504737,
    batch_size=64,
)

# Tuned BiLSTM on BERT embeddings
print("\nTraining BiLSTM with BERT embeddings...")
bilstm_bert_result = train_bilstm_optimized(
    X_train=X_bert_train,
    X_val=X_bert_val,
    y_train=y_train,
    y_val=y_val,
    embedding_type="BERT",
    params=best_params_bert,
)
bilstm_result.append(bilstm_bert_result)

# Persist every BiLSTM result dict
bilstm_results_file = os.path.join(results_dir, 'bilstm_results.pkl')
with open(bilstm_results_file, 'wb') as f:
    pickle.dump(bilstm_result, f)
print(f"BiLSTM results saved to {bilstm_results_file}")

# Pick the entry with the highest validation F1 and save its weights
best_bilstm_model = max(bilstm_result, key=lambda entry: entry['f1_score'])
best_bilstm_model_file = os.path.join(
    models_dir,
    f"best_bilstm_{best_bilstm_model['feature_name'].lower()}.pt",
)
torch.save(best_bilstm_model['model'].state_dict(), best_bilstm_model_file)
print(f"Best BiLSTM model saved to {best_bilstm_model_file}")

### BERT Fine-tuning with Hyperparameter Optimization

In [None]:
# BERT Dataset class
class BERTDataset(Dataset):
    """Torch dataset that tokenizes raw texts on the fly for BERT fine-tuning.

    Args:
        texts: indexable collection of texts (list, array or pandas Series).
        labels: labels understood by the notebook-level ``label_encoder``.
        tokenizer: tokenizer object exposing ``encode_plus``.
        max_len: length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        # Convert string labels to numeric using the saved encoder
        numeric_labels = label_encoder.transform(labels)
        self.labels = torch.LongTensor(numeric_labels)

        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tokenized sample at position ``idx``.

        Returns:
            dict with 1-D ``'input_ids'``, ``'attention_mask'`` and
            ``'token_type_ids'`` tensors plus a scalar long ``'label'`` tensor.
        """
        texts = self.texts
        # A DataLoader supplies positions 0..len-1. Plain [] on a pandas Series
        # is label-based and fails for a non-default index, so use .iloc when
        # the container offers it.
        raw_text = texts.iloc[idx] if hasattr(texts, 'iloc') else texts[idx]
        text = str(raw_text)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' yields a leading batch axis of 1; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            # self.labels is already a LongTensor; clone instead of re-wrapping
            # it in torch.tensor(), which warns on tensor input.
            'label': self.labels[idx].clone()
        }

In [None]:
def train_epoch(model, data_loader, optimizer, device):
    """Run one training epoch and return ``(avg_loss, accuracy, f1)``.

    Each batch is a dict with ``'input_ids'``, ``'attention_mask'``,
    ``'token_type_ids'`` and ``'label'`` tensors. The model is called with
    ``labels`` and its ``.loss`` / ``.logits`` outputs are used. The loss is
    averaged over batches; accuracy and weighted F1 are computed over all
    predictions collected during the epoch.
    """
    model.train()
    running_loss = 0
    seen_preds, seen_labels = [], []
    input_keys = ('input_ids', 'attention_mask', 'token_type_ids')

    for batch in data_loader:
        model_inputs = {key: batch[key].to(device) for key in input_keys}
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(**model_inputs, labels=labels)

        batch_loss = outputs.loss
        running_loss += batch_loss.item()

        # Predicted class = index of the largest logit
        batch_preds = torch.max(outputs.logits, dim=1).indices
        seen_preds.extend(batch_preds.cpu().numpy())
        seen_labels.extend(labels.cpu().numpy())

        batch_loss.backward()
        optimizer.step()

    avg_loss = running_loss / len(data_loader)
    f1 = f1_score(seen_labels, seen_preds, average='weighted')
    accuracy = accuracy_score(seen_labels, seen_preds)

    return avg_loss, accuracy, f1

def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Classifier whose forward call accepts ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels`` and returns
            an object exposing ``loss`` and ``logits``.
        data_loader: Iterable of batches keyed by ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``label``.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(mean batch loss, accuracy, weighted F1)``.
    """
    model.eval()
    loss_sum = 0
    all_preds, all_labels = [], []

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in data_loader:
            targets = batch['label'].to(device)
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device),
                labels=targets,
            )

            loss_sum += outputs.loss.item()
            # Predicted class = index of the largest logit in each row.
            predicted = torch.max(outputs.logits, dim=1).indices
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    mean_loss = loss_sum / len(data_loader)
    weighted_f1 = f1_score(all_labels, all_preds, average='weighted')
    eval_accuracy = accuracy_score(all_labels, all_preds)

    return mean_loss, eval_accuracy, weighted_f1

In [None]:
from tqdm.auto import tqdm

def optimize_bert(tweets_train, tweets_val, y_train, y_val, num_labels, n_trials=3,
                  epochs=2, max_len=64):
    """Tune BERT fine-tuning hyperparameters with an Optuna study.

    Each trial samples a learning rate (1e-5..5e-5, log scale), a batch size
    (16 or 32) and a weight decay (1e-4..1e-2, log scale), fine-tunes a fresh
    ``bert-base-uncased`` classifier for ``epochs`` epochs and is scored by the
    best weighted validation F1 seen across those epochs.

    Args:
        tweets_train: Training texts handed to ``BERTDataset``.
        tweets_val: Validation texts handed to ``BERTDataset``.
        y_train: Training labels handed to ``BERTDataset``.
        y_val: Validation labels handed to ``BERTDataset``.
        num_labels: Number of output classes of the classification head.
        n_trials: Number of Optuna trials to run.
        epochs: Fine-tuning epochs per trial (one pruning report per epoch).
        max_len: ``max_len`` passed to ``BERTDataset``. It is fixed for the
            whole study and therefore not part of the returned parameters.

    Returns:
        ``study.best_params`` of the finished study.
    """
    # Nothing below depends on the sampled hyperparameters, so build it once
    # instead of once per trial.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_dataset = BERTDataset(tweets_train, y_train, tokenizer, max_len=max_len)
    val_dataset = BERTDataset(tweets_val, y_val, tokenizer, max_len=max_len)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def objective(trial):
        print(f"\nStarting trial {trial.number + 1}/{n_trials}")

        # Simplified hyperparameter space
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
        batch_size = trial.suggest_categorical('batch_size', [16, 32])
        weight_decay = trial.suggest_float('weight_decay', 1e-4, 1e-2, log=True)

        # Every trial starts from the same pretrained weights.
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=num_labels
        )
        model = model.to(device)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

        best_val_f1 = 0

        for epoch in range(epochs):
            # Training
            model.train()
            train_progress = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} - Training')
            for batch in train_progress:
                optimizer.zero_grad()
                inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
                labels = batch['label'].to(device)

                outputs = model(**inputs, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()

                train_progress.set_postfix({'loss': f'{loss.item():.4f}'})

            # Validation
            model.eval()
            val_preds = []
            val_labels = []

            with torch.no_grad():
                for batch in tqdm(val_loader, desc='Validation'):
                    inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
                    labels = batch['label'].to(device)
                    outputs = model(**inputs)
                    _, preds = torch.max(outputs.logits, dim=1)
                    val_preds.extend(preds.cpu().numpy())
                    val_labels.extend(labels.cpu().numpy())

            val_f1 = f1_score(val_labels, val_preds, average='weighted')
            print(f'Epoch {epoch+1}/{epochs} - Validation F1: {val_f1:.4f}')

            best_val_f1 = max(best_val_f1, val_f1)

            # Early pruning: report this epoch's score and stop if the pruner says so.
            trial.report(val_f1, epoch)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

        return best_val_f1

    # Create study with aggressive pruning. Only steps 0..epochs-1 are ever
    # reported, so the warm-up must be 0: the previous value of 5 exceeded the
    # number of reported steps and kept the pruner from ever firing.
    study = optuna.create_study(
        direction='maximize',
        pruner=optuna.pruners.MedianPruner(
            n_startup_trials=1,
            n_warmup_steps=0,
            interval_steps=1
        )
    )

    # gc_after_trial releases each trial's model before the next one is built.
    study.optimize(objective, n_trials=n_trials, gc_after_trial=True)

    print("\nOptimization completed!")
    print(f"Best trial F1 score: {study.best_trial.value:.4f}")
    print("Best parameters:", study.best_params)

    return study.best_params

In [None]:
def train_bert(tweets_train, tweets_val, y_train, y_val, params, num_labels, epochs=3):
    """Fine-tune ``bert-base-uncased`` and keep the weights of the best epoch.

    The model is trained for ``epochs`` epochs. After each epoch it is scored
    with ``evaluate`` on the validation data; the weights of the epoch with the
    highest weighted validation F1 are restored at the end and written to
    ``models_dir/best_bert.pt``.

    Args:
        tweets_train: Training texts handed to ``BERTDataset``.
        tweets_val: Validation texts handed to ``BERTDataset``.
        y_train: Training labels handed to ``BERTDataset``.
        y_val: Validation labels handed to ``BERTDataset``.
        params: Dict with ``learning_rate``, ``batch_size`` and
            ``weight_decay``. An optional ``max_len`` entry is forwarded to
            ``BERTDataset`` so training can use the same sequence length as
            tuning; without it the dataset's own default applies.
        num_labels: Number of output classes of the classification head.
        epochs: Number of fine-tuning epochs (at least 1).

    Returns:
        Dict with ``model`` (carrying the best-epoch weights), ``accuracy`` and
        ``f1_score`` (both measured on the validation set at that best epoch),
        ``model_name`` and ``feature_name``.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    # Initialize tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels
    )

    # Create datasets; only override the dataset's max_len when the caller asks for it.
    dataset_kwargs = {'max_len': params['max_len']} if 'max_len' in params else {}
    train_dataset = BERTDataset(tweets_train, y_train, tokenizer, **dataset_kwargs)
    val_dataset = BERTDataset(tweets_val, y_val, tokenizer, **dataset_kwargs)

    train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=params['batch_size'])

    # Setup training
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])

    # Training loop
    best_val_f1 = 0
    best_val_acc = None
    best_state = None

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')

        # Train
        model.train()
        train_loss = 0
        train_preds = []
        train_labels = []

        train_progress = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs} - Training')
        for batch in train_progress:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )

            loss = outputs.loss
            train_loss += loss.item()
            loss.backward()
            optimizer.step()

            _, preds = torch.max(outputs.logits, dim=1)
            train_preds.extend(preds.cpu().numpy())
            train_labels.extend(labels.cpu().numpy())

            train_progress.set_postfix({'loss': f'{loss.item():.4f}'})

        train_loss /= len(train_loader)
        train_f1 = f1_score(train_labels, train_preds, average='weighted')
        train_acc = accuracy_score(train_labels, train_preds)
        print(f'Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}, F1: {train_f1:.4f}')

        # Validate
        val_loss, val_acc, val_f1 = evaluate(model, val_loader, device)
        print(f'Val Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}, F1: {val_f1:.4f}')

        # The first epoch is always recorded so a checkpoint exists even when
        # the validation F1 never rises above 0.
        if best_state is None or val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_val_acc = val_acc
            # state_dict() hands out references to the live parameters and
            # dict.copy() is shallow, so the tensors themselves must be copied;
            # otherwise later epochs overwrite this snapshot. Copies go to CPU
            # to avoid holding a second model on the accelerator.
            best_state = {
                name: tensor.detach().to('cpu', copy=True)
                for name, tensor in model.state_dict().items()
            }

    # Save best model
    model.load_state_dict(best_state)
    torch.save(model.state_dict(), os.path.join(models_dir, 'best_bert.pt'))

    return {
        'model': model,
        # Accuracy of the same epoch whose weights and F1 are returned.
        'accuracy': best_val_acc,
        'f1_score': best_val_f1,
        'model_name': 'BERT',
        'feature_name': 'Fine-tuned'
    }

In [None]:
# Collects one result dict per trained BERT variant (a single one is trained here).
bert_result = []

# --- Hyperparameter search ---------------------------------------------------
print("Starting BERT hyperparameter optimization...")
best_params = optimize_bert(
    train_df['cleaned_content'],
    val_df['cleaned_content'],
    y_train,
    y_val,
    len(np.unique(y_train)),
    n_trials=1,
)
print("Best parameters:", best_params)

# --- Final training ----------------------------------------------------------
# NOTE(review): the fit below uses these fixed values, not `best_params`
# returned by the search above, which is only printed.
best_params_bert = {
    'learning_rate': 2.081063824861354e-05,
    'batch_size': 32,
    'weight_decay': 0.0019994876077473584
}

print("\nTraining BERT with optimized parameters...")
bert_results = train_bert(
    train_df['cleaned_content'],
    val_df['cleaned_content'],
    y_train,
    y_val,
    best_params_bert,
    len(np.unique(y_train)),
)
bert_result.append(bert_results)

print(f"\nBERT Training Complete - Validation F1: {bert_results['f1_score']:.4f}")

# --- Persist results ---------------------------------------------------------
# The pickled list holds the full result dicts returned by train_bert,
# including the trained model object under 'model'.
bert_results_file = os.path.join(results_dir, 'bert_results.pkl')
with open(bert_results_file, 'wb') as f:
    pickle.dump(bert_result, f)
print(f"BERT results saved to {bert_results_file}")

# --- Persist the weights of the best entry (highest validation F1) -----------
best_bert_model = max(bert_result, key=lambda entry: entry['f1_score'])
best_bert_model_file = os.path.join(
    models_dir,
    f"best_bert_{best_bert_model['feature_name'].lower()}.pt",
)
torch.save(best_bert_model['model'].state_dict(), best_bert_model_file)
print(f"Best BERT model saved to {best_bert_model_file}")

## Compare Model Performance

In [None]:
# Load results from saved files
def _load_results(filename):
    """Unpickle one list of result dicts stored under ``results_dir``."""
    # pickle can execute arbitrary code while loading: only read files that
    # were written by this project.
    with open(os.path.join(results_dir, filename), 'rb') as results_file:
        return pickle.load(results_file)


lr_results = _load_results('logistic_regression_results.pkl')
svm_results = _load_results('svm_results.pkl')
rf_results = _load_results('rf_results.pkl')
bilstm_results = _load_results('bilstm_results.pkl')
bert_results = _load_results('bert_results.pkl')

# Combine all results into a single list. The SVM results were loaded but
# previously left out of this sum, so SVM never appeared in the comparison.
results = lr_results + svm_results + rf_results + bilstm_results + bert_results
results_df = pd.DataFrame([
    {'Model': r['model_name'],
     'Feature': r['feature_name'],
     'Accuracy': r['accuracy'],
     'F1 Score': r['f1_score']} for r in results
])

# Sort by F1 score (descending)
results_df = results_df.sort_values(by='F1 Score', ascending=False).reset_index(drop=True)

# Display results
print("Model Performance on Validation Set:")
results_df

In [None]:
# Grouped bar chart: one group per model, one bar per feature representation.
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=results_df, x='Model', y='F1 Score', hue='Feature', ax=ax)
ax.set_title('Model Performance Comparison (F1 Score)', fontsize=15)
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('F1 Score', fontsize=12)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
# Keep a copy of the figure next to the other result artefacts.
fig.savefig(os.path.join(results_dir, 'model_comparison.png'))
plt.show()

## 7. Save Best Models

In [None]:
# First three rows of the comparison table (the table is sorted by F1 Score, best first).
top_models = results_df.iloc[:3]
print("Top 3 performing models:")
top_models

In [None]:
# Export the comparison table as CSV (without the index column) and report the location.
results_df.to_csv(
    os.path.join(results_dir, 'model_comparison_results.csv'),
    index=False,
)
print(f"Model comparison results saved to {os.path.join(results_dir, 'model_comparison_results.csv')}")