# Determining Sentence Sentiment using NLP

In [1]:
import torch
import pandas as pd

# My libraries
from src import Trainer, Plotter, DynamicRNN
from src import get_sst_data_loaders

# Select the compute device: CUDA when PyTorch reports it as available, else CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Running experiments on hardware type: {DEVICE}")

Running experiments on hardware type: cpu


In [2]:
# --- Experiment configuration: every value a reader might tune lives here ---

# Fixed seed so that a fresh "Restart Kernel -> Run All" starts PyTorch's
# random number generator from the same state on every run.
# NOTE(review): this only seeds PyTorch; any shuffling done with another RNG
# inside the data loaders is not covered - confirm if exact reproducibility matters.
SEED = 42

BATCH_SIZE = 64  # batch size handed to the data loaders
EMBEDDING_DIM = 300  # width of each word embedding
HIDDEN_DIM = 256  # hidden size of the recurrent layers
OUTPUT_DIM = 5  # number of output classes
DROPOUT = 0.5  # dropout setting passed to the model
EPOCHS = 15  # training epochs per experiment
LEARNING_RATE = 0.001  # learning rate passed to the trainer

# Pre-trained Vector Path
VECTOR_PATH = "./data/vector.txt"

torch.manual_seed(SEED)

In [3]:
print("Initializing Data Pipeline...")

# 1. Build the train / validation / test iterators together with the text and
#    label fields returned alongside them.
train_iter, val_iter, test_iter, TEXT_FIELD, LABEL_FIELD = get_sst_data_loaders(
    batch_size=BATCH_SIZE, vector_path=VECTOR_PATH, device=DEVICE
)

# 2. Extract the metadata the model constructor needs from the text vocabulary.
VOCAB_SIZE = len(TEXT_FIELD.vocab)

# The pad token is read from the vocab object, not the field object.
PAD_IDX = TEXT_FIELD.vocab.stoi.get(TEXT_FIELD.vocab.pad_token)
if PAD_IDX is None:
    # .get() returns None for a missing key; stop here instead of silently
    # passing pad_idx=None on to every model built later in the notebook.
    raise ValueError(
        f"Pad token {TEXT_FIELD.vocab.pad_token!r} is missing from the vocabulary."
    )

# May be None when no pre-trained vectors were attached to the vocabulary.
PRETRAINED_VECTORS = TEXT_FIELD.vocab.vectors

print(f"Data ready. Vocab Size: {VOCAB_SIZE}, Pad Index: {PAD_IDX}")
if PRETRAINED_VECTORS is not None:
    print(f"Embeddings shape: {PRETRAINED_VECTORS.shape}")

Initializing Data Pipeline...
--> Initializing Fields...
--> Loading SST Splits...
    Training samples: 8544
--> Building Vocabulary...
    Vocab Size: 18280
    Label Map: {'very negative': 1, 'negative': 2, 'neutral': 3, 'positive': 4, 'very positive': 5}
--> Building Iterators...
Data ready. Vocab Size: 18280, Pad Index: 1
Embeddings shape: torch.Size([18280, 300])


## Part 1: Find the best model

In [4]:
# Experiment grid: every architecture below is trained twice, first with
# randomly initialised embeddings and then with the pre-trained vectors.
# Runs are numbered 1..10 in that order and the number prefixes the run name.
experiments_config = [
    {
        "name": "_".join(
            [
                str(2 * arch_pos + int(pretrained) + 1),
                label,
                "Pretrained" if pretrained else "Random",
            ]
        ),
        "rnn_type": cell_type,
        "n_layers": depth,
        "bidirectional": two_way,
        "use_pretrained": pretrained,
    }
    for arch_pos, (label, cell_type, depth, two_way) in enumerate(
        [
            # --- Naive single-layer, unidirectional models ---
            ("RNN", "rnn", 1, False),
            ("LSTM", "lstm", 1, False),
            ("GRU", "gru", 1, False),
            # --- Better models (Bi-Directional, 2 Layers) ---
            ("BiLSTM_Deep", "lstm", 2, True),
            ("BiGRU_Deep", "gru", 2, True),
        ]
    )
    for pretrained in (False, True)
]

In [5]:
# Train every configuration in `experiments_config` and collect the results.
all_histories = {}  # Key: Model Name, Value: History Dict
all_summaries = []  # List of Summary Dicts for DataFrame

for config in experiments_config:
    print(f"\n{'='*20} Running: {config['name']} {'='*20}")

    # 1. Init Model
    model = DynamicRNN(
        vocab_size=VOCAB_SIZE,
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        output_dim=OUTPUT_DIM,
        n_layers=config["n_layers"],
        bidirectional=config["bidirectional"],
        dropout=DROPOUT,
        pad_idx=PAD_IDX,
        rnn_type=config["rnn_type"],
    )

    # 2. Embeddings
    # Remember what the model actually starts from: a run that asked for
    # pre-trained vectors but had to fall back must not be reported as
    # "Pretrained" in the results table.
    uses_pretrained = config["use_pretrained"] and PRETRAINED_VECTORS is not None
    if uses_pretrained:
        print("Loading vectors...")
        # Overwrite the embedding table in place without recording the copy
        # in autograd (replaces the discouraged `.data` access).
        with torch.no_grad():
            model.embedding.weight.copy_(PRETRAINED_VECTORS)
    elif config["use_pretrained"]:
        print("Vectors not found, using random.")

    # 3. Hyperparams for record keeping
    hyperparams = {
        "Type": config["rnn_type"].upper(),
        "BiDir": config["bidirectional"],
        "Embeds": "Pretrained" if uses_pretrained else "Random",
        "Layers": config["n_layers"],
    }

    # 4. Train
    trainer = Trainer(model, DEVICE)
    history, summary = trainer.run_experiment(
        train_iter,
        val_iter,
        epochs=EPOCHS,
        lr=LEARNING_RATE,
        name=config["name"],
        hyperparameters=hyperparams,
        save_weights=True,
    )

    # 5. Store Data
    all_histories[config["name"]] = history
    all_summaries.append(summary)

print("\nAll models trained successfully.")


Starting 1_RNN_Random | Params: 5,628,133
  Epoch: 01 | Time: 0m 11s | Train Acc: 0.246 | Val Acc: 0.289
  Epoch: 02 | Time: 0m 9s | Train Acc: 0.269 | Val Acc: 0.272
  Epoch: 03 | Time: 0m 9s | Train Acc: 0.302 | Val Acc: 0.301
  Epoch: 04 | Time: 0m 11s | Train Acc: 0.335 | Val Acc: 0.319
  Epoch: 05 | Time: 0m 10s | Train Acc: 0.373 | Val Acc: 0.238
  Epoch: 06 | Time: 0m 10s | Train Acc: 0.393 | Val Acc: 0.338
  Epoch: 07 | Time: 0m 11s | Train Acc: 0.422 | Val Acc: 0.281
  Epoch: 08 | Time: 0m 12s | Train Acc: 0.451 | Val Acc: 0.323
  Epoch: 09 | Time: 0m 12s | Train Acc: 0.477 | Val Acc: 0.302
  Epoch: 10 | Time: 0m 12s | Train Acc: 0.501 | Val Acc: 0.334
  Epoch: 11 | Time: 0m 11s | Train Acc: 0.523 | Val Acc: 0.316
  Epoch: 12 | Time: 0m 11s | Train Acc: 0.545 | Val Acc: 0.343
  Epoch: 13 | Time: 0m 13s | Train Acc: 0.566 | Val Acc: 0.315
  Epoch: 14 | Time: 0m 10s | Train Acc: 0.586 | Val Acc: 0.323
  Epoch: 15 | Time: 0m 10s | Train Acc: 0.601 | Val Acc: 0.332
Experiement Co

KeyboardInterrupt: 

In [None]:
# Draw one training-history figure per experiment recorded in `all_histories`.
print(f"Generating plots for {len(all_histories)} models...\n")

for run_name in all_histories:
    Plotter.plot_history(all_histories[run_name], title=run_name)

In [None]:
# Turn the per-run summaries into a leaderboard and export it as CSV.
df_results = pd.DataFrame(all_summaries)

if df_results.empty:
    # No summaries were collected (e.g. training was interrupted before any
    # run finished). Sorting would raise KeyError, so stop before touching
    # the CSV file.
    print("No experiment summaries to report; run the training cell first.")
else:
    # Preferred column order; columns absent from the summaries are dropped
    # from the list and any extra columns are appended after it.
    cols = [
        "Model",
        "Type",
        "BiDir",
        "Embeds",
        "Best Val Acc",
        "Best Val Loss",
        "Time (s)",
        "Parameters",
    ]
    cols = [c for c in cols if c in df_results.columns]
    df_results = df_results[cols + [c for c in df_results.columns if c not in cols]]

    # Sort by validation accuracy only when that column is actually present.
    sort_key = "Best Val Acc"
    if sort_key in df_results.columns:
        display(df_results.sort_values(by=sort_key, ascending=False))
    else:
        display(df_results)

    csv_path = "sst_experiment_results.csv"
    df_results.to_csv(csv_path, index=False)
    print(f"Results saved to {csv_path}")

## How can we improve our model?

In [None]:
# Phase 3 grid: all runs share the 2-layer bidirectional LSTM with pre-trained
# vectors and differ only in attention and in whether embeddings stay frozen.
phase3_experiments = [
    {
        "name": run_name,
        "rnn_type": "lstm",
        "n_layers": 2,
        "bidirectional": True,
        "use_pretrained": True,
        "use_attention": with_attention,
        "freeze_embeddings": keep_frozen,
    }
    for run_name, with_attention, keep_frozen in (
        # The winner from Phase 2 (Frozen, No Attention)
        ("Baseline_Winner", False, True),
        # Optimization 1: Unfreeze Embeddings (Fine-Tuning)
        ("FineTuned_Embeddings", False, False),
        # Optimization 2: Add Attention (with frozen embeddings)
        ("Attention_Frozen", True, True),
        # Optimization 3: The "Kitchen Sink" (Attention + Fine-Tuning)
        ("Best_Model_Full_Opt", True, False),
    )
]

In [None]:
# Re-run the `experiments_config` grid with attention disabled and embeddings
# frozen, collecting the results of every run.
# NOTE(review): this loop iterates `experiments_config`; `phase3_experiments`
# is not consumed by this cell - confirm that is intended.
# NOTE(review): rebinding these two names discards results collected by any
# earlier training cell in the same kernel session.
all_histories = {}  # Key: Model Name, Value: History Dict
all_summaries = []  # List of Summary Dicts for DataFrame

for config in experiments_config:
    print(f"\n{'='*20} Running: {config['name']} {'='*20}")

    # 1. Init Model
    # Note: We strictly force attention=False and freeze=True for valid Phase 1/2 comparison
    model = DynamicRNN(
        vocab_size=VOCAB_SIZE,
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        output_dim=OUTPUT_DIM,
        n_layers=config["n_layers"],
        bidirectional=config["bidirectional"],
        dropout=DROPOUT,
        pad_idx=PAD_IDX,
        rnn_type=config["rnn_type"],
        use_attention=False,  # Optimization reserved for Phase 3
        freeze_embeddings=True,  # We want to compare Random vs Static Pretrained first
    )

    # 2. Embeddings
    # Remember what the model actually starts from: a run that asked for
    # pre-trained vectors but had to fall back must not be reported as
    # "Pretrained" in the results table.
    uses_pretrained = config["use_pretrained"] and PRETRAINED_VECTORS is not None
    if uses_pretrained:
        print("Loading vectors...")
        # Overwrite the embedding table in place without recording the copy
        # in autograd (replaces the discouraged `.data` access).
        with torch.no_grad():
            model.embedding.weight.copy_(PRETRAINED_VECTORS)
    elif config["use_pretrained"]:
        print("Vectors not found, using random.")

    # 3. Hyperparams for record keeping
    hyperparams = {
        "Type": config["rnn_type"].upper(),
        "BiDir": config["bidirectional"],
        "Embeds": "Pretrained" if uses_pretrained else "Random",
        "Layers": config["n_layers"],
    }

    # 4. Train
    trainer = Trainer(model, DEVICE)
    history, summary = trainer.run_experiment(
        train_iter,
        val_iter,
        epochs=EPOCHS,
        lr=LEARNING_RATE,
        name=config["name"],
        hyperparameters=hyperparams,
        save_weights=True,
    )

    # 5. Store Data
    all_histories[config["name"]] = history
    all_summaries.append(summary)

print("\nAll models trained successfully.")

In [None]:
# --- Cell: Visualize and Report ---
# One training-history figure is drawn per entry in `all_histories`.
print(f"Generating plots for {len(all_histories)} models...\n")

# Grid size or plotting logic can be adjusted here
for run_name in all_histories:
    Plotter.plot_history(all_histories[run_name], title=run_name)

In [None]:
# Create and Display Dataframe
df_results = pd.DataFrame(all_summaries)

if df_results.empty:
    # No summaries were collected (e.g. training was interrupted before any
    # run finished). Sorting would raise KeyError, so stop before touching
    # the CSV file.
    print("No experiment summaries to report; run the training cell first.")
else:
    # Reorder columns for readability
    cols = [
        "Model",
        "Type",
        "BiDir",
        "Embeds",
        "Best Val Acc",
        "Best Val Loss",
        "Time (s)",
        "Parameters",
    ]
    # Filter to ensure columns exist before selecting
    cols = [c for c in cols if c in df_results.columns]
    df_results = df_results[cols + [c for c in df_results.columns if c not in cols]]

    # Display Leaderboard
    print("\n--- Experiment Leaderboard (Sorted by Validation Accuracy) ---")
    # Sort by validation accuracy only when that column is actually present.
    sort_key = "Best Val Acc"
    if sort_key in df_results.columns:
        display(df_results.sort_values(by=sort_key, ascending=False))
    else:
        display(df_results)

    # Export
    csv_path = "sst_phase1_2_results.csv"
    df_results.to_csv(csv_path, index=False)
    print(f"Results saved to {csv_path}")