# Twitter Sentiment Analysis - Model Training & Hyperparameter Tuning

This notebook focuses on training different models for sentiment classification:

1. Baseline Models:
   - Logistic Regression
   - Support Vector Machine (SVM)
   - Random Forest

2. Deep Learning Models:
   - BiLSTM
   - BERT fine-tuning

3. Hyperparameter Tuning:
   - Grid Search for baseline models
   - Optuna for deep learning models

We will train each model with different feature representations and save the best models.

## 1. Setup and Imports

In [0]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

# Machine learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_auc_score
import scipy.sparse as sp

# Deep learning libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Hyperparameter optimization
import optuna

# Visualization settings
plt.style.use('ggplot')
sns.set(style='whitegrid')
%matplotlib inline

## 2. Load Features and Prepare Data

In [0]:
# For Google Colab, uncomment these lines to mount Google Drive.
# NOTE: the local-development assignments below run afterwards and overwrite
# these values, so comment them out when using the Colab paths.
# from google.colab import drive
# drive.mount('/content/drive')
# features_dir = '/content/drive/MyDrive/path/to/features'
# models_dir = '/content/drive/MyDrive/path/to/models'

# For local development
features_dir = '../data/features'
models_dir = '../models'
results_dir = '../results'

# Create every output directory up front. The BiLSTM training functions call
# torch.save() into models_dir, which fails if the folder does not exist yet.
for output_dir in (models_dir, results_dir):
    os.makedirs(output_dir, exist_ok=True)

# Load labels
y = np.load(os.path.join(features_dir, 'labels.npy'))

# Load features (sparse matrices for BoW / TF-IDF, dense arrays for the embeddings)
X_bow = sp.load_npz(os.path.join(features_dir, 'bow_features.npz'))
X_tfidf = sp.load_npz(os.path.join(features_dir, 'tfidf_features.npz'))
X_word2vec = np.load(os.path.join(features_dir, 'word2vec_features.npy'))
X_glove = np.load(os.path.join(features_dir, 'glove_features.npy'))
X_bert = np.load(os.path.join(features_dir, 'bert_features.npy'))

print("Features loaded successfully.")

## 3. Data Splitting

In [0]:
# Stratified train / validation / test split (70% / 15% / 15% with the defaults)
def split_data(X, y, test_size=0.15, val_size=0.15, random_state=42):
    """Split features and labels into stratified train, validation and test parts.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Labels aligned with the rows of ``X``; also used for stratification.
        test_size: Fraction of the full data set held out for testing.
        val_size: Fraction of the full data set held out for validation.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # Step 1: carve the test set off the full data.
    X_remainder, X_test, y_remainder, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    # Step 2: the validation set is taken from what is left, so its share has to
    # be rescaled from "fraction of everything" to "fraction of the remainder".
    val_share_of_remainder = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_remainder,
        y_remainder,
        test_size=val_share_of_remainder,
        random_state=random_state,
        stratify=y_remainder,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# Create train/val/test splits for each feature type.
# Every call uses the same labels and the same seed, so the label splits returned
# by the first call (BoW) are kept and the later ones are dropped.
(bow_train, bow_val, bow_test,
 y_train, y_val, y_test) = split_data(X_bow, y)

# Only the three feature parts are kept for the remaining representations
tfidf_train, tfidf_val, tfidf_test = split_data(X_tfidf, y, random_state=42)[:3]
w2v_train, w2v_val, w2v_test = split_data(X_word2vec, y, random_state=42)[:3]
glove_train, glove_val, glove_test = split_data(X_glove, y, random_state=42)[:3]
bert_train, bert_val, bert_test = split_data(X_bert, y, random_state=42)[:3]

# Report the size of each partition
for partition_name, partition_labels in (
    ("Train", y_train),
    ("Validation", y_val),
    ("Test", y_test),
):
    print(f"{partition_name} set: {partition_labels.shape[0]} samples")

## 4. Baseline Models

In [0]:
# Fit a model and score it on the validation split
def train_evaluate_model(model, X_train, X_val, y_train, y_val, model_name, feature_name):
    """Fit ``model`` on the training data and report validation accuracy and weighted F1.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        model_name: Model label used in the log line and the result dict.
        feature_name: Feature label used in the log line and the result dict.

    Returns:
        dict with keys 'model', 'accuracy', 'f1_score', 'model_name', 'feature_name'.
    """
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)

    scores = {
        'accuracy': accuracy_score(y_val, val_predictions),
        'f1_score': f1_score(y_val, val_predictions, average='weighted'),
    }

    print(
        f"{model_name} with {feature_name} - "
        f"Accuracy: {scores['accuracy']:.4f}, F1 Score: {scores['f1_score']:.4f}"
    )

    return {
        'model': model,
        **scores,
        'model_name': model_name,
        'feature_name': feature_name,
    }

# Exhaustive hyperparameter search with stratified 5-fold cross-validation
def grid_search_model(model, param_grid, X_train, y_train, model_name):
    """Run ``GridSearchCV`` over ``param_grid`` and return the best estimator.

    Candidates are ranked by weighted F1 across 5 shuffled, stratified folds
    (seed 42); the search runs on all available cores.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Parameter grid passed straight to ``GridSearchCV``.
        X_train, y_train: Data the search is fitted on.
        model_name: Label used in the printed summary.

    Returns:
        ``best_estimator_`` of the fitted search.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=folds,
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    print(f"Best parameters for {model_name}: {search.best_params_}")
    print(f"Best cross-validation score: {search.best_score_:.4f}")

    return search.best_estimator_

In [0]:
# List that collects one result dict per trained model
results = []

### 4.1 Logistic Regression

In [0]:
# Logistic Regression parameters
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000]
}


def _tune_logistic_regression(X_tr, X_va, feature_name, lead=""):
    """Grid-search, validate and record Logistic Regression for one feature set.

    The estimator is seeded because both solvers in the grid ('liblinear' and
    'saga') shuffle the data; without a seed repeated runs can pick different
    winners. 42 matches the seed used by the splits, the CV folds and the
    Random Forest.

    Args:
        X_tr, X_va: Training and validation features.
        feature_name: Feature label used in log messages and the result dict.
        lead: Prefix for the progress message (used to insert a blank line).

    Returns:
        Tuple ``(fitted_model, result_dict)``; the result is also appended to ``results``.
    """
    print(f"{lead}Training Logistic Regression with {feature_name} features...")
    tuned = grid_search_model(
        LogisticRegression(random_state=42),
        lr_param_grid,
        X_tr,
        y_train,
        f"Logistic Regression ({feature_name})",
    )
    outcome = train_evaluate_model(
        tuned, X_tr, X_va, y_train, y_val, "Logistic Regression", feature_name
    )
    results.append(outcome)
    return tuned, outcome


# For BoW features
lr_bow, lr_bow_result = _tune_logistic_regression(bow_train, bow_val, "BoW")

# For TF-IDF features
lr_tfidf, lr_tfidf_result = _tune_logistic_regression(tfidf_train, tfidf_val, "TF-IDF", lead="\n")

# For Word2Vec features
lr_w2v, lr_w2v_result = _tune_logistic_regression(w2v_train, w2v_val, "Word2Vec", lead="\n")

# For GloVe features
lr_glove, lr_glove_result = _tune_logistic_regression(glove_train, glove_val, "GloVe", lead="\n")

# For BERT features
lr_bert, lr_bert_result = _tune_logistic_regression(bert_train, bert_val, "BERT", lead="\n")

### 4.2 Support Vector Machine (SVM)

In [0]:
# SVM parameters - limited for computational efficiency.
# gamma is a coefficient of the RBF kernel only; the linear kernel ignores it.
# Crossing gamma with kernel='linear' would therefore fit every linear candidate
# twice for no gain, so each kernel gets its own sub-grid.
svm_param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]


def _tune_svm(X_tr, X_va, feature_name, lead=""):
    """Grid-search, validate and record an SVM for one feature set.

    Args:
        X_tr, X_va: Training and validation features.
        feature_name: Feature label used in log messages and the result dict.
        lead: Prefix for the progress message (used to insert a blank line).

    Returns:
        Tuple ``(fitted_model, result_dict)``; the result is also appended to ``results``.
    """
    print(f"{lead}Training SVM with {feature_name} features...")
    tuned = grid_search_model(SVC(), svm_param_grid, X_tr, y_train, f"SVM ({feature_name})")
    outcome = train_evaluate_model(tuned, X_tr, X_va, y_train, y_val, "SVM", feature_name)
    results.append(outcome)
    return tuned, outcome


# For BoW features
svm_bow, svm_bow_result = _tune_svm(bow_train, bow_val, "BoW")

# For TF-IDF features
svm_tfidf, svm_tfidf_result = _tune_svm(tfidf_train, tfidf_val, "TF-IDF", lead="\n")

# For Word2Vec features
svm_w2v, svm_w2v_result = _tune_svm(w2v_train, w2v_val, "Word2Vec", lead="\n")

# For GloVe features
svm_glove, svm_glove_result = _tune_svm(glove_train, glove_val, "GloVe", lead="\n")

# For BERT features
svm_bert, svm_bert_result = _tune_svm(bert_train, bert_val, "BERT", lead="\n")

### 4.3 Random Forest

In [0]:
# Random Forest search space
rf_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)


def _tune_random_forest(X_tr, X_va, feature_name, lead=""):
    """Grid-search, validate and record a Random Forest for one feature set.

    Args:
        X_tr, X_va: Training and validation features.
        feature_name: Feature label used in log messages and the result dict.
        lead: Prefix for the progress message (used to insert a blank line).

    Returns:
        Tuple ``(fitted_model, result_dict)``; the result is also appended to ``results``.
    """
    print(f"{lead}Training Random Forest with {feature_name} features...")
    tuned = grid_search_model(
        RandomForestClassifier(random_state=42),
        rf_param_grid,
        X_tr,
        y_train,
        f"Random Forest ({feature_name})",
    )
    outcome = train_evaluate_model(
        tuned, X_tr, X_va, y_train, y_val, "Random Forest", feature_name
    )
    results.append(outcome)
    return tuned, outcome


# BoW
rf_bow, rf_bow_result = _tune_random_forest(bow_train, bow_val, "BoW")

# TF-IDF
rf_tfidf, rf_tfidf_result = _tune_random_forest(tfidf_train, tfidf_val, "TF-IDF", lead="\n")

# Word2Vec
rf_w2v, rf_w2v_result = _tune_random_forest(w2v_train, w2v_val, "Word2Vec", lead="\n")

# GloVe
rf_glove, rf_glove_result = _tune_random_forest(glove_train, glove_val, "GloVe", lead="\n")

# BERT
rf_bert, rf_bert_result = _tune_random_forest(bert_train, bert_val, "BERT", lead="\n")

## 5. Deep Learning Models

### 5.1 BiLSTM Model

In [0]:
# Define BiLSTM model
class BiLSTMClassifier(nn.Module):
    """Bidirectional LSTM followed by a linear layer, applied to one vector per sample.

    Each input row is treated as a sequence of length 1, so the LSTM sees a single
    time step per sample.

    Args:
        input_dim: Size of each input vector.
        hidden_dim: Hidden size of each LSTM direction.
        output_dim: Number of output logits (classes).
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is passed with a single layer, so it is disabled there.
        inter_layer_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=True,
            dropout=inter_layer_dropout,
            batch_first=True,
        )
        # Forward and backward final states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        """Return class logits of shape [batch size, output dim] for input [batch size, input dim]."""
        # Add the sequence-length dimension expected by the LSTM: [batch size, 1, input dim]
        text = text.unsqueeze(1)

        output, (hidden, cell) = self.lstm(text)

        # hidden[-2] / hidden[-1] are the last layer's forward / backward final states
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        return self.fc(hidden)

# PyTorch dataset over precomputed embedding vectors
class EmbeddingDataset(Dataset):
    """Pairs each embedding row (float32) with its class label (int64)."""

    def __init__(self, embeddings, labels):
        # Convert once up front so item access is a plain tensor index.
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return self.labels.shape[0]

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` pair at position ``idx``."""
        sample = self.embeddings[idx]
        target = self.labels[idx]
        return sample, target

# Function to train BiLSTM
def train_bilstm(X_train, X_val, y_train, y_val, embedding_type, n_epochs=10):
    """Train a BiLSTM classifier with fixed hyperparameters and score it on the validation split.

    The network uses hidden_dim=128, 2 layers, dropout=0.5, Adam (lr=0.001) and a
    batch size of 64. The weights from the epoch with the lowest validation loss
    are restored before the final evaluation and written to
    ``models_dir/bilstm_<embedding_type.lower()>.pt``.

    Args:
        X_train, X_val: Feature arrays with one row per sample (``shape[1]`` is the input size).
        y_train, y_val: Class labels for the two splits.
        embedding_type: Label used in log messages, the result dict and the file name.
        n_epochs: Number of training epochs.

    Returns:
        dict with keys 'model', 'accuracy', 'f1_score', 'model_name', 'feature_name'.
    """
    # Check if CUDA is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Create datasets and data loaders
    train_dataset = EmbeddingDataset(X_train, y_train)
    val_dataset = EmbeddingDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)

    # Create model
    input_dim = X_train.shape[1]
    hidden_dim = 128
    output_dim = len(np.unique(y_train))
    n_layers = 2
    dropout = 0.5

    model = BiLSTMClassifier(input_dim, hidden_dim, output_dim, n_layers, dropout)
    model = model.to(device)

    # Define optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    best_val_loss = float('inf')
    best_model = None

    for epoch in range(n_epochs):
        # Training
        model.train()
        train_loss_sum, train_correct, train_seen = 0.0, 0, 0

        for embeddings, labels in train_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)

            optimizer.zero_grad()
            predictions = model(embeddings)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

            # Accumulate per-sample totals so a smaller last batch is not over-weighted
            batch_n = labels.size(0)
            train_loss_sum += loss.item() * batch_n
            _, predicted = torch.max(predictions, 1)
            train_correct += (predicted == labels).sum().item()
            train_seen += batch_n

        train_loss = train_loss_sum / max(train_seen, 1)
        train_acc = train_correct / max(train_seen, 1)

        # Validation
        model.eval()
        val_loss_sum, val_correct, val_seen = 0.0, 0, 0

        with torch.no_grad():
            for embeddings, labels in val_loader:
                embeddings, labels = embeddings.to(device), labels.to(device)
                predictions = model(embeddings)
                loss = criterion(predictions, labels)

                batch_n = labels.size(0)
                val_loss_sum += loss.item() * batch_n
                _, predicted = torch.max(predictions, 1)
                val_correct += (predicted == labels).sum().item()
                val_seen += batch_n

        val_loss = val_loss_sum / max(val_seen, 1)
        val_acc = val_correct / max(val_seen, 1)

        print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back references to the live parameter tensors, which
            # the optimizer keeps updating in place. Clone them so the snapshot really
            # holds this epoch's weights (a dict .copy() would not).
            best_model = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    # Load best model (skipped if no epoch ever improved on the initial loss)
    if best_model is not None:
        model.load_state_dict(best_model)

    # Evaluate on validation set
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for embeddings, labels in val_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            predictions = model(embeddings)
            _, predicted = torch.max(predictions, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"BiLSTM with {embedding_type} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

    # Save model (make sure the target folder exists first)
    os.makedirs(models_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(models_dir, f'bilstm_{embedding_type.lower()}.pt'))

    return {
        'model': model,
        'accuracy': accuracy,
        'f1_score': f1,
        'model_name': 'BiLSTM',
        'feature_name': embedding_type
    }

In [0]:
def _run_bilstm(X_tr, X_va, embedding_name, lead=""):
    """Announce, train and record a default-hyperparameter BiLSTM for one embedding type.

    Returns the result dict produced by ``train_bilstm`` after appending it to ``results``.
    """
    print(f"{lead}Training BiLSTM with {embedding_name} embeddings...")
    outcome = train_bilstm(X_tr, X_va, y_train, y_val, embedding_name)
    results.append(outcome)
    return outcome


# Word2Vec embeddings
bilstm_w2v_result = _run_bilstm(w2v_train, w2v_val, "Word2Vec")

# GloVe embeddings
bilstm_glove_result = _run_bilstm(glove_train, glove_val, "GloVe", lead="\n")

# BERT embeddings
bilstm_bert_result = _run_bilstm(bert_train, bert_val, "BERT", lead="\n")

### 5.2 BERT Fine-tuning

The BERT fine-tuning code is set up together with its hyperparameter search in Section 5.4 below.

### 5.3 Hyperparameter Tuning with Optuna for BiLSTM

In [0]:
# Hyperparameter tuning for BiLSTM using Optuna
def objective(trial, X_train, X_val, y_train, y_val):
    """Optuna objective: train one BiLSTM configuration and return its validation F1.

    Searches hidden_dim, n_layers, dropout, learning_rate and batch_size. Training
    runs for at most 10 epochs and stops early after 3 epochs without a
    validation-loss improvement. The weighted F1 of every epoch is reported to the
    trial so the study's pruner can stop unpromising trials.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.
        X_train, X_val: Feature arrays with one row per sample.
        y_train, y_val: Class labels for the two splits.

    Returns:
        Weighted validation F1 at the epoch with the lowest validation loss. The
        last epoch's F1 is returned only if the loss never improved.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner asks to stop the trial.
    """
    # Define hyperparameters to optimize
    hidden_dim = trial.suggest_int('hidden_dim', 64, 256, step=64)
    n_layers = trial.suggest_int('n_layers', 1, 3)
    dropout = trial.suggest_float('dropout', 0.1, 0.5, step=0.1)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])

    # Check if CUDA is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Create datasets and data loaders
    train_dataset = EmbeddingDataset(X_train, y_train)
    val_dataset = EmbeddingDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Create model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y_train))

    model = BiLSTMClassifier(input_dim, hidden_dim, output_dim, n_layers, dropout)
    model = model.to(device)

    # Define optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Early stopping parameters
    patience = 3
    best_val_loss = float('inf')
    best_f1 = None  # F1 at the epoch with the lowest validation loss
    counter = 0

    # Training loop
    n_epochs = 10  # Max epochs for tuning
    f1 = 0.0

    for epoch in range(n_epochs):
        # Training
        model.train()

        for embeddings, labels in train_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)

            optimizer.zero_grad()
            predictions = model(embeddings)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for embeddings, labels in val_loader:
                embeddings, labels = embeddings.to(device), labels.to(device)
                predictions = model(embeddings)
                loss = criterion(predictions, labels)

                val_loss += loss.item()
                _, predicted = torch.max(predictions, 1)
                all_preds.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        val_loss /= len(val_loader)
        f1 = f1_score(all_labels, all_preds, average='weighted')

        # Report score to Optuna
        trial.report(f1, epoch)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_f1 = f1
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                break

        # Handle pruning
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # After early stopping the last epoch is `patience` epochs past the best one.
    # Score the trial by the best-loss epoch instead, since that is the checkpoint
    # a final training run would keep.
    return best_f1 if best_f1 is not None else f1

# Run the Optuna search for a BiLSTM on one embedding type
def optimize_bilstm(X_train, X_val, y_train, y_val, embedding_type, n_trials=20):
    """Search BiLSTM hyperparameters with Optuna and return the best parameter set.

    The study maximises the value returned by ``objective`` and uses a median pruner.

    Args:
        X_train, X_val, y_train, y_val: Data forwarded unchanged to ``objective``.
        embedding_type: Label used in the printed messages.
        n_trials: Number of Optuna trials to run.

    Returns:
        ``study.best_params`` (dict of the best trial's hyperparameters).
    """
    print(f"\nOptimizing BiLSTM with {embedding_type} embeddings...")

    def _evaluate(trial):
        # Bind the data so Optuna only has to supply the trial
        return objective(trial, X_train, X_val, y_train, y_val)

    study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
    study.optimize(_evaluate, n_trials=n_trials)

    # Summarise the winning trial
    winner = study.best_trial
    summary_lines = [
        f"\nBest trial for BiLSTM with {embedding_type}:",
        f"  F1 Score: {winner.value:.4f}",
        "  Params:",
    ]
    summary_lines.extend(f"    {name}: {setting}" for name, setting in winner.params.items())
    for line in summary_lines:
        print(line)

    return study.best_params

In [0]:
# Uncomment to run hyperparameter optimization (this can take a long time)
# best_params_w2v = optimize_bilstm(w2v_train, w2v_val, y_train, y_val, "Word2Vec", n_trials=10)
# best_params_glove = optimize_bilstm(glove_train, glove_val, y_train, y_val, "GloVe", n_trials=10)
# best_params_bert = optimize_bilstm(bert_train, bert_val, y_train, y_val, "BERT", n_trials=10)

In [0]:
# Function to train BiLSTM with optimized hyperparameters
def train_bilstm_optimized(X_train, X_val, y_train, y_val, embedding_type, params, n_epochs=15):
    """Train a BiLSTM with the given hyperparameters and score it on the validation split.

    The weights from the epoch with the lowest validation loss are restored before
    the final evaluation and written to
    ``models_dir/bilstm_<embedding_type.lower()>_optimized.pt``.

    Args:
        X_train, X_val: Feature arrays with one row per sample (``shape[1]`` is the input size).
        y_train, y_val: Class labels for the two splits.
        embedding_type: Label used in log messages, the result dict and the file name.
        params: dict with keys 'hidden_dim', 'n_layers', 'dropout', 'learning_rate'
            and 'batch_size'.
        n_epochs: Number of training epochs.

    Returns:
        dict with keys 'model', 'accuracy', 'f1_score', 'model_name', 'feature_name'.

    Raises:
        KeyError: If ``params`` lacks one of the required keys.
    """
    # Check if CUDA is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Extract parameters
    hidden_dim = params['hidden_dim']
    n_layers = params['n_layers']
    dropout = params['dropout']
    learning_rate = params['learning_rate']
    batch_size = params['batch_size']

    # Create datasets and data loaders
    train_dataset = EmbeddingDataset(X_train, y_train)
    val_dataset = EmbeddingDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Create model
    input_dim = X_train.shape[1]
    output_dim = len(np.unique(y_train))

    model = BiLSTMClassifier(input_dim, hidden_dim, output_dim, n_layers, dropout)
    model = model.to(device)

    # Define optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    best_val_loss = float('inf')
    best_model = None

    for epoch in range(n_epochs):
        # Training
        model.train()
        train_loss_sum, train_correct, train_seen = 0.0, 0, 0

        for embeddings, labels in train_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)

            optimizer.zero_grad()
            predictions = model(embeddings)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

            # Accumulate per-sample totals so a smaller last batch is not over-weighted
            batch_n = labels.size(0)
            train_loss_sum += loss.item() * batch_n
            _, predicted = torch.max(predictions, 1)
            train_correct += (predicted == labels).sum().item()
            train_seen += batch_n

        train_loss = train_loss_sum / max(train_seen, 1)
        train_acc = train_correct / max(train_seen, 1)

        # Validation
        model.eval()
        val_loss_sum, val_correct, val_seen = 0.0, 0, 0

        with torch.no_grad():
            for embeddings, labels in val_loader:
                embeddings, labels = embeddings.to(device), labels.to(device)
                predictions = model(embeddings)
                loss = criterion(predictions, labels)

                batch_n = labels.size(0)
                val_loss_sum += loss.item() * batch_n
                _, predicted = torch.max(predictions, 1)
                val_correct += (predicted == labels).sum().item()
                val_seen += batch_n

        val_loss = val_loss_sum / max(val_seen, 1)
        val_acc = val_correct / max(val_seen, 1)

        print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back references to the live parameter tensors, which
            # the optimizer keeps updating in place. Clone them so the snapshot really
            # holds this epoch's weights (a dict .copy() would not).
            best_model = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    # Load best model (skipped if no epoch ever improved on the initial loss)
    if best_model is not None:
        model.load_state_dict(best_model)

    # Evaluate on validation set
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for embeddings, labels in val_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            predictions = model(embeddings)
            _, predicted = torch.max(predictions, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Optimized BiLSTM with {embedding_type} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

    # Save model (make sure the target folder exists first)
    os.makedirs(models_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(models_dir, f'bilstm_{embedding_type.lower()}_optimized.pt'))

    return {
        'model': model,
        'accuracy': accuracy,
        'f1_score': f1,
        'model_name': 'BiLSTM (Optimized)',
        'feature_name': embedding_type
    }

### 5.4 BERT Fine-tuning with Hyperparameter Optimization

In [0]:
# BERT Dataset class
class BERTDataset(Dataset):
    """Tokenises raw texts on access and pairs them with their labels.

    Args:
        texts: Iterable of raw texts (list, NumPy array, pandas Series, ...).
        labels: Iterable of integer class labels aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Store plain lists so __getitem__ always indexes by position. A pandas
        # Series coming out of train_test_split keeps its shuffled index, and
        # series[idx] would then look up by label (KeyError or the wrong row).
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of 1-D tensors: input_ids, attention_mask, token_type_ids and label."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # flatten() drops the leading dimension of size 1 added by return_tensors='pt'
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [0]:
# Placeholder for BERT fine-tuning with Optuna hyperparameter optimization
# This would involve:
# 1. Loading the processed tweets text from cleaned data
# 2. Setting up BERT tokenizer and dataset
# 3. Defining the objective function for Optuna (optimizing learning rate, batch size, etc.)
# 4. Training BERT with the optimized hyperparameters

# For demonstration, we'll include a simplified version
def objective_bert(trial, texts, labels, num_labels):
    """Optuna objective: fine-tune ``bert-base-uncased`` once and return its best validation F1.

    Searches learning_rate, batch_size and epochs. The data is split 80/20
    (stratified, seed 42) inside the objective, so every trial sees the same split.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.
        texts: Raw texts to classify.
        labels: Class labels aligned with ``texts``.
        num_labels: Number of output classes for the classification head.

    Returns:
        Highest weighted validation F1 observed across the trial's epochs.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner asks to stop the trial.
    """
    # Define hyperparameters to optimize
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])
    epochs = trial.suggest_int('epochs', 2, 4)  # Limited for computation time

    # Load pre-trained BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels
    )

    # Split data (assuming texts and labels are already processed)
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Create datasets
    train_dataset = BERTDataset(train_texts, train_labels, tokenizer)
    val_dataset = BERTDataset(val_texts, val_labels, tokenizer)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Setup device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Setup optimizer. The AdamW class shipped with transformers is deprecated in
    # favour of the PyTorch implementation. eps and weight_decay are set explicitly
    # to mirror the defaults of the transformers version so the optimisation setup
    # stays the same.
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

    # Training loop
    best_val_f1 = 0

    for epoch in range(epochs):
        # Training
        model.train()
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            # Separate name so the `labels` argument is not overwritten by a batch tensor
            batch_labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=batch_labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_preds = []
        val_true = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)

                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids)

                _, preds = torch.max(outputs.logits, dim=1)
                val_preds.extend(preds.cpu().numpy())
                # The targets are only compared on the CPU, so they never need to reach the device
                val_true.extend(batch['label'].numpy())

        # Calculate F1 score
        val_f1 = f1_score(val_true, val_preds, average='weighted')

        # Report to Optuna
        trial.report(val_f1, epoch)

        # Handle pruning
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        # Update best score
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1

    return best_val_f1

# Placeholder for the BERT hyperparameter search
# In practice, this would load your Twitter dataset text and run the optimization
def optimize_bert(n_trials=10):
    """Print a placeholder notice and return example BERT hyperparameters.

    No study is run here; ``n_trials`` is accepted for the future implementation
    sketched in the comments below but is not used yet.

    Returns:
        dict with example values for 'learning_rate', 'batch_size' and 'epochs'.
    """
    for notice in (
        "Note: For a real implementation, load your processed tweet texts here.",
        "This is a placeholder to show the structure of BERT hyperparameter tuning.",
    ):
        print(notice)

    # In practice, you would:
    # 1. Load your text data and labels
    # 2. Define num_labels based on your sentiment classes
    # 3. Run the study as shown below

    # Create study
    # study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
    # study.optimize(lambda trial: objective_bert(trial, texts, labels, num_labels), n_trials=n_trials)

    # Example best parameters
    example_best_params = dict(learning_rate=2e-5, batch_size=16, epochs=3)
    return example_best_params

In [0]:
# Uncomment to run BERT hyperparameter optimization (requires original tweet texts)
# best_params_bert = optimize_bert(n_trials=5)

In [0]:
# This section will be a simplified version of BERT fine-tuning
# For the full implementation, please see the complete notebook

## 6. Compare Model Performance

In [0]:
# Build the comparison table: one row per trained model, metrics only
# (the fitted model objects stay in `results`)
_report_columns = {
    'model_name': 'Model',
    'feature_name': 'Feature',
    'accuracy': 'Accuracy',
    'f1_score': 'F1 Score',
}
_report_rows = [
    {column: entry[key] for key, column in _report_columns.items()}
    for entry in results
]

# Best F1 first, with a fresh 0..n-1 index
results_df = (
    pd.DataFrame(_report_rows)
    .sort_values(by='F1 Score', ascending=False)
    .reset_index(drop=True)
)

# Display results
print("Model Performance on Validation Set:")
results_df

In [0]:
# Grouped bar chart: F1 score per model, one bar per feature representation
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=results_df, x='Model', y='F1 Score', hue='Feature', ax=ax)
ax.set_title('Model Performance Comparison (F1 Score)', fontsize=15)
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('F1 Score', fontsize=12)
# Tilt the model names so long labels do not overlap
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
fig.tight_layout()
fig.savefig(os.path.join(results_dir, 'model_comparison.png'))
plt.show()

## 7. Save Best Models

In [0]:
# results_df is already sorted by F1 score, so the first three rows are the top performers
top_models = results_df.iloc[:3]
print("Top 3 performing models:")
top_models

In [0]:
# Write the comparison table to disk
comparison_csv_path = os.path.join(results_dir, 'model_comparison_results.csv')
results_df.to_csv(comparison_csv_path, index=False)
print(f"Model comparison results saved to {comparison_csv_path}")

## 8. Summary and Next Steps

In this notebook, we have:
1. Loaded different feature representations of the Twitter dataset
2. Split the data into train, validation, and test sets
3. Trained baseline models (Logistic Regression, SVM, Random Forest) with Grid Search
4. Trained deep learning models (BiLSTM) with different embeddings
5. Compared model performance across different feature representations
6. Saved the trained BiLSTM weights, the model comparison table (CSV) and the comparison figure for further evaluation

**Next Steps:**
- Move to the evaluation notebook
- Evaluate the best models on the test set
- Apply interpretability techniques (SHAP values)
- Generate comprehensive evaluation reports