# Premier entraînement sur les données Kaggle (binaire)

In [None]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

from collections import Counter
from sklearn.metrics import classification_report


In [None]:
# Upper bound on the vocabulary size, including the two reserved ids
# (<PAD> and <UNK>).
MAX_VOCAB_SIZE = 20_000

# Maximum number of tokens kept per text when encoding.
MAX_LEN = 256

# Width of the token embeddings.
EMBED_DIM = 100

# Passed as `hidden_dim` to the classifier, where it sets the feed-forward
# width of each Transformer encoder layer.
HIDDEN_DIM = 128

# Batch size. NOTE(review): the DataLoader cells shown here hard-code
# batch_size=32 and do not read this constant.
BATCH_SIZE = 16

# Number of epochs. NOTE(review): the training loops shown here hard-code
# range(20) and do not read this constant.
EPOCHS = 10

In [None]:
# Read the preprocessed Kaggle train and test splits from CSV.
train_df_kaggle, test_df_kaggle = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/kaggle/preprocessed/train.csv",
        "data/kaggle/preprocessed/test.csv",
    )
)

In [None]:
def tokenize(text):
    """Split `text` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

def encode(vocab, text):
    """Map `text` to a list of vocabulary ids.

    The text is tokenized with `tokenize`, cut to at most MAX_LEN tokens, and
    each token is looked up in `vocab`; tokens missing from `vocab` get id 1.
    """
    unknown_id = 1
    kept_tokens = tokenize(text)[:MAX_LEN]
    return [vocab.get(token, unknown_id) for token in kept_tokens]

In [None]:
# Build the Kaggle vocabulary from the Kaggle training texts.
counter_kaggle = Counter()
for text in train_df_kaggle["text"]:
    tokens_kaggle = tokenize(text)
    # Count the tokens of the current text (the counter must be fed the
    # variable assigned on the line above).
    counter_kaggle.update(tokens_kaggle)

# Ids 0 and 1 are reserved for <PAD> and <UNK>, so only MAX_VOCAB_SIZE - 2
# of the most frequent tokens receive their own id (starting at 2).
most_common_kaggle = counter_kaggle.most_common(MAX_VOCAB_SIZE - 2)
vocab_kaggle = {"<PAD>": 0, "<UNK>": 1}
for i, (word, _) in enumerate(most_common_kaggle, start=2):
    vocab_kaggle[word] = i

In [None]:
# Custom Dataset
class TextDataset(Dataset):
    """Dataset yielding fixed-length token-id tensors with attention masks.

    Each entry of ``df["text"]`` is encoded once at construction time with
    ``encode(vocab, text)``; ``df["label"]`` is stored as a ``torch.long``
    tensor. Every item is cut or right-padded with 0 to exactly ``max_len``
    ids.
    """

    def __init__(self, df, vocab, max_len=512):
        encoded_texts = []
        for raw_text in df["text"]:
            ids = encode(vocab, raw_text)
            encoded_texts.append(torch.tensor(ids, dtype=torch.long))
        self.texts = encoded_texts
        self.labels = torch.tensor(df["label"].values, dtype=torch.long)
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels``."""
        # Keep at most max_len ids; shorter texts are padded below.
        token_ids = self.texts[idx][: self.max_len]
        real_count = token_ids.size(0)

        # Start from all-zero buffers (0 is the padding id / "ignore" flag)
        # and fill in the leading positions occupied by real tokens.
        input_ids = torch.zeros(self.max_len, dtype=torch.long)
        input_ids[:real_count] = token_ids

        attention_mask = torch.zeros(self.max_len, dtype=torch.long)
        attention_mask[:real_count] = 1  # 1 for real tokens, 0 for padding

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": self.labels[idx],
        }

In [None]:
def collate_fn(batch):
    """Assemble a list of dataset items into one batch dictionary.

    ``input_ids`` and ``attention_mask`` are right-padded with 0 to the
    longest sequence in the batch and stacked batch-first; ``labels`` are
    gathered into a single tensor.
    """

    def column(key):
        # Collect one field across all items, keeping batch order.
        return [sample[key] for sample in batch]

    padded_ids = pad_sequence(column("input_ids"), batch_first=True, padding_value=0)
    padded_masks = pad_sequence(column("attention_mask"), batch_first=True, padding_value=0)
    label_tensor = torch.tensor(column("labels"))

    return {
        "input_ids": padded_ids,
        "attention_mask": padded_masks,
        "labels": label_tensor,
    }

In [None]:
# DataLoaders
# Encode both Kaggle splits with the Kaggle vocabulary (`vocab_kaggle`); no
# variable named `vocab` is defined by the cells above.
train_ds_kaggle = TextDataset(train_df_kaggle, vocab_kaggle)
test_ds_kaggle = TextDataset(test_df_kaggle, vocab_kaggle)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader_kaggle = DataLoader(train_ds_kaggle, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader_kaggle = DataLoader(test_ds_kaggle, batch_size=32, collate_fn=collate_fn)

In [None]:
class TransformerClassifier(nn.Module):
    """Transformer-encoder classifier that reads the output at position 0.

    Token ids are embedded, summed with fixed sinusoidal positional
    encodings, passed through a stack of ``nn.TransformerEncoderLayer`` and
    the encoder output at the first sequence position is projected to
    ``output_dim`` logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width, also used as the encoder ``d_model``.
        num_heads: Number of attention heads per encoder layer.
        hidden_dim: Feed-forward width inside each encoder layer.
        num_layers: Number of stacked encoder layers.
        output_dim: Number of output logits.
        max_len: Number of positions for which encodings are precomputed;
            inputs must not be longer than this.
        dropout: Dropout probability applied to the embeddings and inside
            the encoder layers.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, output_dim, max_len=512, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Registered as a buffer so that `.to(device)` moves it together with
        # the parameters; non-persistent so the state_dict keys are unchanged.
        self.register_buffer(
            "positional_encoding",
            self._generate_positional_encoding(max_len, embed_dim),
            persistent=False,
        )
        self.dropout = nn.Dropout(dropout)

        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embed_dim, output_dim)

    def forward(self, input_ids, attention_mask=None):
        """Return logits for a batch-first tensor of token ids.

        Args:
            input_ids: Integer tensor indexed as (batch, sequence).
            attention_mask: Optional tensor of the same layout where 0 marks
                padding positions; any other value marks a real token.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        seq_len = input_ids.size(1)
        x = self.embedding(input_ids) + self.positional_encoding[:seq_len].unsqueeze(0)
        x = self.dropout(x)

        if attention_mask is not None:
            # Boolean key-padding mask: True marks positions attention must ignore.
            mask = attention_mask == 0
        else:
            mask = None

        x = self.transformer_encoder(x, src_key_padding_mask=mask)
        # No dedicated [CLS] token is inserted: this is the first input position.
        cls_output = x[:, 0, :]
        return self.fc(cls_output)

    def _generate_positional_encoding(self, max_len, d_model):
        """Build a (max_len, d_model) table of sinusoidal position encodings.

        Even columns hold sines and odd columns hold cosines of the same
        position-dependent angles.
        """
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return pe


In [None]:
# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

In [None]:
# Model: one output logit per distinct label value in the Kaggle training set.
model_kaggle = TransformerClassifier(
    vocab_size=MAX_VOCAB_SIZE,
    embed_dim=EMBED_DIM, 
    hidden_dim=HIDDEN_DIM, 
    num_heads=2,
    num_layers=2,
    output_dim=len(train_df_kaggle["label"].unique())
)
model_kaggle = model_kaggle.to(device)

# Optimizer / Loss
optimizer_kaggle = torch.optim.AdamW(model_kaggle.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Training
# NOTE(review): the epoch count is hard-coded to 20 here; the EPOCHS constant
# defined earlier is not read.
for epoch in range(20):
    model_kaggle.train()
    total_loss = 0
    for batch in train_loader_kaggle:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer_kaggle.zero_grad()
        outputs = model_kaggle(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_kaggle.step()
        total_loss += loss.item()
    # Average once per epoch, after all batches are accumulated; the guard
    # keeps the report well-defined when the loader yields no batch.
    mean_loss = total_loss / max(len(train_loader_kaggle), 1)
    print(f"Epoch {epoch+1} Loss_Total: {total_loss:.4f}; Mean_Loss: {mean_loss:.4f}")


In [None]:
# Evaluate the Kaggle model on the Kaggle test loader and print the report.
model_kaggle.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader_kaggle:
        logits = model_kaggle(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )
        # Predicted class = index of the largest logit in each row.
        batch_preds = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(batch_preds)
        # Labels are not moved to `device`; they are only collected here.
        all_labels.extend(batch['labels'].numpy())

print(classification_report(all_labels, all_preds))



# Deuxième entraînement sur les données ISOT (binaire)
La procédure est exactement la même que pour Kaggle. On donne un autre nom au modèle afin de conserver les deux modèles et de pouvoir les évaluer plus tard pour la généralisation.


In [None]:
# Read the preprocessed ISOT train and test splits from CSV.
train_df_isot, test_df_isot = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/isot/preprocessed/train.csv",
        "data/isot/preprocessed/test.csv",
    )
)

In [None]:
# Build the ISOT vocabulary from token frequencies over the ISOT training texts.
counter_isot = Counter()
for text in train_df_isot["text"]:
    tokens = tokenize(text)
    counter_isot.update(tokens)

# Ids 0 and 1 are reserved for the padding and unknown-token entries; the
# most frequent tokens then receive ids 2, 3, ... in frequency order.
most_common_isot = counter_isot.most_common(MAX_VOCAB_SIZE - 2)
vocab_isot = {"<PAD>": 0, "<UNK>": 1}
vocab_isot.update(
    (word, token_id)
    for token_id, (word, _count) in enumerate(most_common_isot, start=2)
)

In [None]:
# DataLoaders
# Encode both ISOT splits with the ISOT vocabulary (`vocab_isot`); no
# variable named `vocab` is defined by the cells above.
train_ds_isot = TextDataset(train_df_isot, vocab_isot)
test_ds_isot = TextDataset(test_df_isot, vocab_isot)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader_isot = DataLoader(train_ds_isot, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader_isot = DataLoader(test_ds_isot, batch_size=32, collate_fn=collate_fn)

In [None]:
# Model: one output logit per distinct label value in the ISOT training set.
model_isot = TransformerClassifier(
    vocab_size=MAX_VOCAB_SIZE,
    embed_dim=EMBED_DIM, 
    hidden_dim=HIDDEN_DIM, 
    num_heads=2,
    num_layers=2,
    output_dim=len(train_df_isot["label"].unique())
)
model_isot = model_isot.to(device)

# Optimizer / Loss
optimizer_isot = torch.optim.AdamW(model_isot.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Training
# NOTE(review): the epoch count is hard-coded to 20 here; the EPOCHS constant
# defined earlier is not read.
for epoch in range(20):
    model_isot.train()
    total_loss = 0
    for batch in train_loader_isot:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer_isot.zero_grad()
        outputs = model_isot(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_isot.step()
        total_loss += loss.item()
    # Average once per epoch, after all batches are accumulated; the guard
    # keeps the report well-defined when the loader yields no batch.
    mean_loss = total_loss / max(len(train_loader_isot), 1)
    print(f"Epoch {epoch+1} Loss_Total: {total_loss:.4f}; Mean_Loss: {mean_loss:.4f}")


In [None]:
# Evaluate the ISOT model on the ISOT test loader and print the report.
model_isot.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader_isot:
        logits = model_isot(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )
        # Predicted class = index of the largest logit in each row.
        batch_preds = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(batch_preds)
        # Labels are not moved to `device`; they are only collected here.
        all_labels.extend(batch['labels'].numpy())

print(classification_report(all_labels, all_preds))



# Généralisation : Entraînement sur Kaggle et évaluation sur ISOT

In [None]:
# Cross-dataset evaluation: Kaggle-trained model on the ISOT test texts.
# The ISOT texts are encoded with vocab_kaggle, the vocabulary built for the
# Kaggle model, so that token ids line up with model_kaggle's embedding rows.
# NOTE(review): assumes label ids mean the same thing in both datasets —
# confirm against the preprocessing.
test_loader_isot_kaggle_vocab = DataLoader(
    TextDataset(test_df_isot, vocab_kaggle),
    batch_size=32,
    collate_fn=collate_fn,
)

model_kaggle.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader_isot_kaggle_vocab:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        outputs = model_kaggle(input_ids, attention_mask)
        # Predicted class = index of the largest logit in each row.
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

print(classification_report(all_labels, all_preds))

# Généralisation : Entraînement sur ISOT et évaluation sur Kaggle

In [None]:
# Cross-dataset evaluation: ISOT-trained model on the Kaggle test texts.
# The Kaggle texts are encoded with vocab_isot, the vocabulary built for the
# ISOT model, so that token ids line up with model_isot's embedding rows.
# NOTE(review): assumes label ids mean the same thing in both datasets —
# confirm against the preprocessing.
test_loader_kaggle_isot_vocab = DataLoader(
    TextDataset(test_df_kaggle, vocab_isot),
    batch_size=32,
    collate_fn=collate_fn,
)

model_isot.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader_kaggle_isot_vocab:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        outputs = model_isot(input_ids, attention_mask)
        # Predicted class = index of the largest logit in each row.
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

print(classification_report(all_labels, all_preds))