# LSTM Model Experiments

In [None]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

from collections import Counter
from sklearn.metrics import classification_report

In [None]:
# Hyperparameters for the LSTM experiments.
# NOTE(review): in the code visible in this file BATCH_SIZE and EPOCHS are not
# referenced - the DataLoaders are built with batch_size=32 and the training
# loop iterates over range(50). Confirm which values are intended.
MAX_VOCAB_SIZE = 20000  # Maximum vocabulary size (including the <PAD> and <UNK> entries)
MAX_LEN = 256           # Maximum sequence length, in tokens
EMBED_DIM = 100         # Embedding dimension
HIDDEN_DIM = 128        # Size of the LSTM hidden layers
BATCH_SIZE = 16         # Batch size
EPOCHS = 5              # Number of epochs

In [None]:
# Load datasets
# Read exactly one corpus. Reading the Kaggle CSVs and then immediately
# rebinding train_df/test_df to the ISOT CSVs only costs I/O (and fails if the
# unused files are missing), so the corpus is selected by name instead.
# The default keeps the ISOT data as the effective dataset.
DATASET = "isot"  # sub-directory of data/; "kaggle" is the other corpus
train_df = pd.read_csv(f"data/{DATASET}/preprocessed/train.csv")
test_df = pd.read_csv(f"data/{DATASET}/preprocessed/test.csv")

In [None]:
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

def encode(vocab, text):
    """Turn ``text`` into a list of token ids, keeping at most MAX_LEN tokens.

    Tokens missing from ``vocab`` are mapped to id 1 (the id given to
    "<UNK>" in the vocabulary built in this file).
    """
    kept_tokens = tokenize(text)[:MAX_LEN]
    token_ids = []
    for token in kept_tokens:
        token_ids.append(vocab.get(token, 1))
    return token_ids

In [None]:
# Build vocabulary
# Token frequencies are counted over the training texts only.
counter = Counter()
for text in train_df["text"]:
    counter.update(tokenize(text))

# Ids 0 and 1 are reserved for padding and unknown tokens, so only the
# MAX_VOCAB_SIZE - 2 most frequent tokens receive ids, numbered from 2 upward
# in descending frequency order.
most_common = counter.most_common(MAX_VOCAB_SIZE - 2)
vocab = {"<PAD>": 0, "<UNK>": 1}
vocab.update((word, idx) for idx, (word, _) in enumerate(most_common, start=2))

In [None]:
# Custom Dataset
class TextDataset(Dataset):
    """Dataset of integer-encoded texts paired with their labels.

    Reads the "text" and "label" columns of ``df``. Every text is encoded
    once, at construction time, with the module-level ``vocab`` through
    ``encode``; labels are stored as a single long tensor.
    """

    def __init__(self, df):
        encoded_texts = []
        for raw_text in df["text"]:
            token_ids = encode(vocab, raw_text)
            encoded_texts.append(torch.tensor(token_ids, dtype=torch.long))
        self.texts = encoded_texts
        self.labels = torch.tensor(df["label"].values, dtype=torch.long)

    def __len__(self):
        # One label per sample, so the label tensor defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        sequence = self.texts[idx]
        label = self.labels[idx]
        return sequence, label

In [None]:
def collate_fn(batch):
    """Collate ``(sequence, label)`` samples into a padded batch.

    Sequences are right-padded with 0 to the longest one in the batch and the
    result is cut to at most MAX_LEN columns.

    Returns:
        A ``(padded_sequences, labels)`` pair of tensors.
    """
    sequences, targets = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    # Safety cut in case a sequence is longer than MAX_LEN.
    clipped = padded[:, :MAX_LEN]
    label_tensor = torch.tensor(targets)
    return clipped, label_tensor

In [None]:
from torch.utils.data import DataLoader, random_split

# Hold out a fixed fraction of the training rows for validation
val_ratio = 0.2
total_len = len(train_df)
val_len = int(total_len * val_ratio)
train_len = total_len - val_len

# Encode the full training frame once, then split it with a seeded generator
# so the train/validation partition is the same on every run
full_train_ds = TextDataset(train_df)
split_rng = torch.Generator().manual_seed(42)
train_ds, val_ds = random_split(full_train_ds, [train_len, val_len], generator=split_rng)

# Settings shared by the three loaders; only the training loader shuffles
loader_kwargs = {"batch_size": 32, "collate_fn": collate_fn}
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

# Test data goes through the same encoding and batching
test_ds = TextDataset(test_df)
test_loader = DataLoader(test_ds, **loader_kwargs)


In [None]:
# LSTM Model
class LSTMClassifier(nn.Module):
    """LSTM text classifier: embedding -> LSTM -> dropout -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of logits produced per sample.
        dropout: Dropout probability applied to the final hidden state.
        padding_idx: Token id used for padding. Defaults to 0, which is the
            id of "<PAD>" in this file's vocabulary and the ``padding_value``
            used by ``collate_fn``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.4, padding_idx=0):
        super().__init__()
        # Plain attribute (not a parameter/buffer), so state_dict keys are unchanged.
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Long tensor of token ids, either ``(batch, seq_len)`` or a
                single unbatched ``(seq_len,)`` sequence. Padding is assumed
                to be trailing only (as produced by ``pad_sequence``).

        Returns:
            Logits of shape ``(batch, output_dim)``, or ``(output_dim,)`` for
            an unbatched input.
        """
        if x.dim() == 1:
            # Keep supporting a single unbatched sequence.
            return self.forward(x.unsqueeze(0)).squeeze(0)

        # True length of every sequence. Clamp to 1 because packing rejects
        # zero-length sequences; lengths must live on the CPU.
        lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1).cpu()

        embedded = self.embedding(x)
        # Pack so the LSTM stops at each sequence's last real token. Without
        # this the final hidden state is taken after the trailing padding
        # steps and depends on how much padding the batch happened to add.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)

        # hidden[-1] is the last layer's final state. Apply the dropout layer
        # here; it is a no-op in eval mode.
        return self.fc(self.dropout(hidden[-1]))

In [None]:
import copy

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize model. The layer sizes come from the hyperparameter constants
# (EMBED_DIM, HIDDEN_DIM) instead of repeating their values as literals; the
# number of outputs is the number of distinct labels in the training data.
num_classes = len(train_df["label"].unique())
model = LSTMClassifier(len(vocab), embedding_dim=EMBED_DIM, hidden_dim=HIDDEN_DIM, output_dim=num_classes)
model = model.to(device)

# Optimizer and loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Early stopping parameters
best_val_loss = float('inf')
patience = 10
# Deliberately not called `counter`: that name holds the token-frequency
# Counter used to build the vocabulary and must not be overwritten.
epochs_without_improvement = 0
best_model = None

for epoch in range(50):
    # Training pass
    model.train()
    total_loss = 0  # sum of per-batch losses (not an average)
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0  # sum of per-batch losses over the validation loader
    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()

    print(f"Epoch {epoch+1} - Train Loss: {total_loss:.4f} - Val Loss: {val_loss:.4f}")

    # Check early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        # Snapshot the weights; state_dict() alone would alias the live tensors.
        best_model = copy.deepcopy(model.state_dict())
        print("→ Validation loss improved, model saved.")
    else:
        epochs_without_improvement += 1
        print(f"→ No improvement. Patience: {epochs_without_improvement}/{patience}")
        if epochs_without_improvement >= patience:
            print("Early stopping triggered.")
            break

# Load best model
# best_model is still None when no epoch ever compared below the initial
# infinity (e.g. the validation loss was NaN throughout); in that case keep
# the current weights instead of crashing in load_state_dict(None).
if best_model is not None:
    model.load_state_dict(best_model)


In [None]:
# Evaluation
# Run the model over the test loader without gradient tracking and collect
# predicted and true class ids for the report.
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for inputs, targets in test_loader:
        logits = model(inputs.to(device))
        # The predicted class is the index of the largest logit.
        batch_preds = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(batch_preds)
        all_labels.extend(targets.numpy())

report = classification_report(all_labels, all_preds)
print(report)
