# Determining Sentence Sentiment using NLP

In [None]:
import torch
import pandas as pd

# My libraries
from src import Trainer, SSTDataPipeline, Plotter, DynamicRNN

# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Running experiments on hardware type: {}".format(DEVICE))

In [None]:
# Shared hyperparameters used by every experiment in this notebook.

# --- Data loading ---
BATCH_SIZE = 64
VECTOR_PATH = "./data/vector.txt"  # Pre-trained vector file handed to the data pipeline

# --- Model architecture ---
EMBEDDING_DIM = 300  # presumably must match the width of the pre-trained vectors -- TODO confirm
HIDDEN_DIM = 256
OUTPUT_DIM = 5  # presumably the number of sentiment classes -- TODO confirm
DROPOUT = 0.5

# --- Optimization ---
EPOCHS = 15
LEARNING_RATE = 1e-3

In [None]:
# Build the data pipeline and obtain the train / validation / test iterators.
print("Initializing Data Pipeline...")
data_pipe = SSTDataPipeline(
    vector_path=VECTOR_PATH,
    batch_size=BATCH_SIZE,
    device=DEVICE,
)
train_iter, val_iter, test_iter = data_pipe.run()

# Values every model constructor below needs; they are read from the pipeline
# only after run() has been called.
VOCAB_SIZE = data_pipe.vocab_size
PAD_IDX = data_pipe.get_pad_idx()
# May be None -- the training cells check for that before copying the vectors.
PRETRAINED_VECTORS = data_pipe.get_embeddings()

print("Data ready. Vocab Size: {}, Pad Index: {}".format(VOCAB_SIZE, PAD_IDX))

### Part 1: Find the best model

In [None]:
# Phase 1/2 experiment grid. Each row of the table below becomes one config
# dict with the keys: name, rnn_type, n_layers, bidirectional, use_pretrained.
experiments_config = [
    {
        "name": name,
        "rnn_type": rnn_type,
        "n_layers": n_layers,
        "bidirectional": bidirectional,
        "use_pretrained": use_pretrained,
    }
    for name, rnn_type, n_layers, bidirectional, use_pretrained in (
        # --- Naive RNN ---
        ("1_RNN_Random", "rnn", 1, False, False),
        ("2_RNN_Pretrained", "rnn", 1, False, True),
        # --- Naive LSTM ---
        ("3_LSTM_Random", "lstm", 1, False, False),
        ("4_LSTM_Pretrained", "lstm", 1, False, True),
        # --- Naive GRU ---
        ("5_GRU_Random", "gru", 1, False, False),
        ("6_GRU_Pretrained", "gru", 1, False, True),
        # --- Better LSTM (Bi-Directional, 2 Layers) ---
        ("7_BiLSTM_Deep_Random", "lstm", 2, True, False),
        ("8_BiLSTM_Deep_Pretrained", "lstm", 2, True, True),
        # --- Better GRU (Bi-Directional, 2 Layers) ---
        ("9_BiGRU_Deep_Random", "gru", 2, True, False),
        ("10_BiGRU_Deep_Pretrained", "gru", 2, True, True),
    )
]

In [None]:
# Train every configuration in `experiments_config` and collect the results.
all_histories = {}  # Key: Model Name, Value: History Dict
all_summaries = []  # List of Summary Dicts for DataFrame

for config in experiments_config:
    print(f"\n{'='*20} Running: {config['name']} {'='*20}")

    # 1. Init Model: architecture knobs come from the config entry, everything
    #    else from the shared notebook-level constants.
    model = DynamicRNN(
        vocab_size=VOCAB_SIZE,
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        output_dim=OUTPUT_DIM,
        n_layers=config["n_layers"],
        bidirectional=config["bidirectional"],
        dropout=DROPOUT,
        pad_idx=PAD_IDX,
        rnn_type=config["rnn_type"],
    )

    # 2. Embeddings: overwrite the embedding matrix only when the config asks
    #    for pretrained vectors AND the pipeline actually provided them.
    vectors_loaded = config["use_pretrained"] and PRETRAINED_VECTORS is not None
    if vectors_loaded:
        print("Loading vectors...")
        model.embedding.weight.data.copy_(PRETRAINED_VECTORS)
    elif config["use_pretrained"]:
        print("Vectors not found, using random.")

    # 3. Hyperparams for record keeping. "Embeds" records what the model
    #    really trains with: a run that fell back to random initialisation
    #    must not be reported as "Pretrained" in the leaderboard.
    hyperparams = {
        "Type": config["rnn_type"].upper(),
        "BiDir": config["bidirectional"],
        "Embeds": "Pretrained" if vectors_loaded else "Random",
        "Layers": config["n_layers"],
    }

    # 4. Train: run_experiment hands back a (history, summary) pair.
    trainer = Trainer(model, DEVICE)
    history, summary = trainer.run_experiment(
        train_iter,
        val_iter,
        epochs=EPOCHS,
        lr=LEARNING_RATE,
        name=config["name"],
        hyperparameters=hyperparams,
        save_weights=True,
    )

    # 5. Store Data: histories are keyed by run name for plotting, summaries
    #    are accumulated as rows for the results DataFrame.
    all_histories[config["name"]] = history
    all_summaries.append(summary)

print("\nAll models trained successfully.")

In [None]:
# Draw one training-history figure per trained model, titled with the run name.
print("Generating plots for {} models...\n".format(len(all_histories)))

for model_name in all_histories:
    Plotter.plot_history(all_histories[model_name], title=model_name)

In [None]:
# Build the leaderboard table from the collected per-run summaries.
df_results = pd.DataFrame(all_summaries)

# Preferred leading columns, kept only when the summaries actually contain
# them; every remaining column follows in its existing order.
cols = [
    name
    for name in (
        "Model",
        "Type",
        "BiDir",
        "Embeds",
        "Best Val Acc",
        "Best Val Loss",
        "Time (s)",
        "Parameters",
    )
    if name in df_results.columns
]
df_results = df_results[
    cols + [name for name in df_results.columns if name not in cols]
]

# Show the runs ranked by validation accuracy, best first.
display(df_results.sort_values("Best Val Acc", ascending=False))

# Persist the (unsorted) table next to the notebook.
csv_path = "sst_experiment_results.csv"
df_results.to_csv(csv_path, index=False)
print("Results saved to {}".format(csv_path))

## How can we improve our model?

In [None]:
# Phase 3 ablation grid. Every entry is the same 2-layer bidirectional LSTM
# with pretrained vectors; only the attention and embedding-freeze flags vary.
phase3_experiments = [
    {
        "name": name,
        "rnn_type": "lstm",
        "n_layers": 2,
        "bidirectional": True,
        "use_pretrained": True,
        "use_attention": use_attention,
        "freeze_embeddings": freeze_embeddings,
    }
    for name, use_attention, freeze_embeddings in (
        # The winner from Phase 2 (Frozen, No Attention)
        ("Baseline_Winner", False, True),
        # Optimization 1: Unfreeze Embeddings (Fine-Tuning)
        ("FineTuned_Embeddings", False, False),
        # Optimization 2: Add Attention (with frozen embeddings)
        ("Attention_Frozen", True, True),
        # Optimization 3: The "Kitchen Sink" (Attention + Fine-Tuning)
        ("Best_Model_Full_Opt", True, False),
    )
]

In [None]:
# Re-run the Phase 1/2 grid with attention disabled and embeddings frozen.
# Both containers are reset here, so results from any earlier run are dropped.
all_histories = {}  # Key: Model Name, Value: History Dict
all_summaries = []  # List of Summary Dicts for DataFrame

# NOTE(review): this loop iterates `experiments_config`; the
# `phase3_experiments` list defined earlier is not consumed by this cell.
# The note below says that is intentional -- confirm.
for config in experiments_config:
    print(f"\n{'='*20} Running: {config['name']} {'='*20}")

    # 1. Init Model
    # Note: We strictly force attention=False and freeze=True for valid Phase 1/2 comparison
    model = DynamicRNN(
        vocab_size=VOCAB_SIZE,
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        output_dim=OUTPUT_DIM,
        n_layers=config["n_layers"],
        bidirectional=config["bidirectional"],
        dropout=DROPOUT,
        pad_idx=PAD_IDX,
        rnn_type=config["rnn_type"],
        use_attention=False,  # Optimization reserved for Phase 3
        freeze_embeddings=True,  # We want to compare Random vs Static Pretrained first
    )

    # 2. Embeddings: overwrite the embedding matrix only when the config asks
    #    for pretrained vectors AND the pipeline actually provided them.
    vectors_loaded = config["use_pretrained"] and PRETRAINED_VECTORS is not None
    if vectors_loaded:
        print("Loading vectors...")
        model.embedding.weight.data.copy_(PRETRAINED_VECTORS)
    elif config["use_pretrained"]:
        print("Vectors not found, using random.")

    # 3. Hyperparams for record keeping. "Embeds" records what the model
    #    really trains with: a run that fell back to random initialisation
    #    must not be reported as "Pretrained" in the leaderboard.
    hyperparams = {
        "Type": config["rnn_type"].upper(),
        "BiDir": config["bidirectional"],
        "Embeds": "Pretrained" if vectors_loaded else "Random",
        "Layers": config["n_layers"],
    }

    # 4. Train: run_experiment hands back a (history, summary) pair.
    trainer = Trainer(model, DEVICE)
    history, summary = trainer.run_experiment(
        train_iter,
        val_iter,
        epochs=EPOCHS,
        lr=LEARNING_RATE,
        name=config["name"],
        hyperparameters=hyperparams,
        save_weights=True,
    )

    # 5. Store Data: histories are keyed by run name for plotting, summaries
    #    are accumulated as rows for the results DataFrame.
    all_histories[config["name"]] = history
    all_summaries.append(summary)

print("\nAll models trained successfully.")

In [None]:
# --- Cell: Visualize and Report ---
print("Generating plots for {} models...\n".format(len(all_histories)))

# One figure per run, titled with the run name; change the plotting call
# here if a different layout is wanted.
for model_name in all_histories:
    Plotter.plot_history(all_histories[model_name], title=model_name)

In [None]:
# Build the leaderboard table from the collected per-run summaries.
df_results = pd.DataFrame(all_summaries)

# Preferred leading columns, kept only when the summaries actually contain
# them; every remaining column follows in its existing order.
cols = [
    name
    for name in (
        "Model",
        "Type",
        "BiDir",
        "Embeds",
        "Best Val Acc",
        "Best Val Loss",
        "Time (s)",
        "Parameters",
    )
    if name in df_results.columns
]
df_results = df_results[
    cols + [name for name in df_results.columns if name not in cols]
]

# Leaderboard: runs ranked by validation accuracy, best first.
print("\n--- Experiment Leaderboard (Sorted by Validation Accuracy) ---")
display(df_results.sort_values("Best Val Acc", ascending=False))

# Export the (unsorted) table next to the notebook.
csv_path = "sst_phase1_2_results.csv"
df_results.to_csv(csv_path, index=False)
print("Results saved to {}".format(csv_path))