# Experiment 05: Learning with Noisy Labels (Easter Egg)

**Objective:** Test the robustness of our best model (BiLSTM + FastText) when training labels are unreliable.
**Hypothesis:** Deep learning models tend to fit clean, consistent patterns before they start memorizing random label noise, so the model should still learn useful sentiment signal even when a significant fraction of the training labels is wrong.

In [None]:
import os, sys
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import preprocessor as p

# Add src to path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.dataset import load_dataset
from src.model.BiLSTMClassifier import EnhancedBiLSTM
from src.utils import inject_noise

## 1. Data Setup

In [None]:
# Load Raw Data
# Only the first of the four values returned by load_dataset() is used in this notebook.
train, _, _, _ = load_dataset()
# Cast the text column to a NumPy unicode array so every entry is handled as a str.
X = train["TEXT"].values.astype("U")
y = train["Label"].values

# Split Data (Always validate on CLEAN data)
# Hold out 20% with a fixed seed. Noise is injected later into y_train only,
# so y_val stays the untouched reference for every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Load Precomputed FastText Embeddings
# Note: Ensure you have run the embedding generation in 03_bilstm_model_experiments.ipynb first
embedding_matrix = np.load("../data/embeddings/embedding_matrix_crawl_subword_300.npy")
# NOTE(review): allow_pickle=True unpickles the file contents, which can execute
# arbitrary code -- only load vocab.npy from a trusted source.
# .item() unwraps the stored 0-d object array; presumably a token -> id dict
# (encode() calls vocab.get on it) -- confirm against the generating notebook.
vocab = np.load("../data/embeddings/vocab.npy", allow_pickle=True).item()

# Rows of the matrix are vocabulary entries, columns are embedding dimensions
# (this is how the two values are reported in the message below).
vocab_size, embedding_dim = embedding_matrix.shape
print(f"Loaded Embeddings: {vocab_size} words, {embedding_dim} dimensions")

In [None]:
# Tokenization Helper
tokenizer = TweetTokenizer()
def encode(text, vocab, max_len=50):
    """Convert one raw text into a list of ``max_len`` vocabulary ids.

    The text is first passed through ``p.tokenize`` (presumably replaces
    URLs, mentions and similar items with placeholder tokens -- confirm
    against the ``preprocessor`` package), then lower-cased and split with
    the module-level ``TweetTokenizer``.

    Args:
        text: Raw input string.
        vocab: Mapping from token to integer id, queried via ``.get``.
        max_len: Length of the returned id list.

    Returns:
        A list of ids: longer inputs are truncated to ``max_len``, shorter
        ones are right-padded with the ``"<PAD>"`` id (0 if absent from
        ``vocab``). Unknown tokens map to the ``"<UNK>"`` id (1 if absent).
    """
    unk_id = vocab.get("<UNK>", 1)
    pad_id = vocab.get("<PAD>", 0)

    cleaned = p.tokenize(text)
    words = tokenizer.tokenize(cleaned.lower())

    # Truncate first, then pad: a non-positive shortfall yields no padding.
    ids = [vocab.get(word, unk_id) for word in words][:max_len]
    shortfall = max_len - len(ids)
    return ids + [pad_id] * shortfall

# Prepare Vectors
# Encode both splits with the same vocabulary; encode() returns fixed-length
# id lists, so each result stacks into a 2-D integer array.
X_train_ids = np.array([encode(tweet, vocab) for tweet in X_train])
X_val_ids = np.array([encode(tweet, vocab) for tweet in X_val])

# The validation tensors are built once here and reused by every training run.
X_val_tensor = torch.tensor(X_val_ids, dtype=torch.long)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

## 2. Experiment Loop

In [None]:
def train_robustness_check(noise_rate, *, epochs=12, batch_size=64, lr=1e-3):
    """Train a fresh BiLSTM on noise-corrupted labels and track clean validation accuracy.

    Reads the notebook globals ``X_train_ids``, ``y_train``, ``X_val_tensor``,
    ``y_val_tensor``, ``vocab_size``, ``embedding_dim`` and ``embedding_matrix``.

    Args:
        noise_rate: Value forwarded to ``inject_noise`` as ``noise_rate``
            (presumably the fraction of training labels to corrupt, in
            [0, 1] -- confirm against ``src.utils.inject_noise``).
        epochs: Number of passes over the training set. The default of 12 is
            a short run for demo purposes.
        batch_size: Mini-batch size used by the training ``DataLoader``.
        lr: Learning rate for the Adam optimizer.

    Returns:
        list[float]: Validation accuracy after each epoch, always measured
        against the uncorrupted labels in ``y_val_tensor``.
    """
    # The percent format spec avoids float artefacts in the log line
    # (0.3 * 100 evaluates to 30.000000000000004).
    print(f"\n[Experiment] Training with {noise_rate:.1%} Noisy Labels...")

    # 1. Corrupt the Training Labels (validation labels are never touched)
    y_train_noisy = inject_noise(y_train, noise_rate=noise_rate)

    # 2. Create Loader
    train_inputs = torch.tensor(X_train_ids, dtype=torch.long)
    train_targets = torch.tensor(y_train_noisy, dtype=torch.long)
    train_loader = DataLoader(
        TensorDataset(train_inputs, train_targets),
        batch_size=batch_size,
        shuffle=True,
    )

    # 3. Reset Model: a new instance per call so runs do not share weights
    model = EnhancedBiLSTM(vocab_size, embedding_dim, hidden_dim=128,
                           num_classes=20, embedding_matrix=embedding_matrix)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # 4. Train
    val_accuracies = []
    for epoch in range(epochs):
        model.train()
        for x_b, y_b in train_loader:
            optimizer.zero_grad()
            out = model(x_b)
            loss = criterion(out, y_b)
            loss.backward()
            optimizer.step()

        # Validation (Always checking against GROUND TRUTH clean labels)
        model.eval()
        with torch.no_grad():
            logits = model(X_val_tensor)
            preds = logits.argmax(dim=1)
            acc = (preds == y_val_tensor).float().mean().item()
            val_accuracies.append(acc)

        print(f"  Epoch {epoch+1}: Val Acc = {acc:.4f}")

    return val_accuracies

## 3. Run & Plot

In [None]:
# A single constant drives both the noisy run and its legend label so the
# two cannot drift apart.
NOISE_RATE = 0.3

# Baseline: 0% Noise
hist_clean = train_robustness_check(0.0)

# Hard Mode: 30% Noise
hist_noisy = train_robustness_check(NOISE_RATE)

# Plotting
# Explicit 1-based epoch numbers on the x-axis match the "Epoch N" lines
# printed during training (plotting the lists alone would start at 0).
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(hist_clean) + 1), hist_clean,
         label='0% Noise (Clean)', marker='o', color='green')
plt.plot(range(1, len(hist_noisy) + 1), hist_noisy,
         label=f'{NOISE_RATE:.0%} Noise (Corrupted)', marker='x', color='red', linestyle='--')
plt.title('Model Robustness: Impact of Noisy Labels on Learning')
plt.xlabel('Epochs')
plt.ylabel('Validation Accuracy (on Clean Data)')
plt.legend()
plt.grid(True, alpha=0.3)
# Save before show(): the figure is written to disk while it is still current.
os.makedirs('results', exist_ok=True)
plt.savefig("results/noisy_labels_impact.png")
plt.show()