In [None]:
import re
from pathlib import Path
from typing import List, Dict

from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.preprocessing import label_binarize
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DistilBertConfig, DistilBertModel, DistilBertTokenizerFast
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import numpy as np
import pandas as pd
import accelerate
import transformers
import matplotlib.pyplot as plt

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


### UTILS CLASS


In [None]:
from enum import Enum
class Label(Enum):
    """Closed set of class labels; each member's value is its lowercase string form."""
    DOG = "dog"
    BIKE = "bike"
    BALL = "ball"
    WATER = "water"


### UTILS METHODS


In [None]:
def get_label(filename: str):
    return filename.split("_")[0]


def get_uuid(filename: str):
    """Return the first two underscore-separated fields of the file stem, joined by "_".

    The stem is the final path component without its last suffix. A stem with
    fewer than two fields is returned unchanged.
    """
    stem_fields = Path(filename).stem.split("_", 2)
    return "_".join(stem_fields[:2])


def build_augmented_path(img_path: Path, base_dir: Path):
    """Build ``base_dir / <label> / <uuid> / <filename>`` for an image file.

    Args:
        img_path: Path (or path string) of the image; only its file name is used.
        base_dir: Root directory of the target tree (``Path`` or path string).

    Returns:
        A ``Path`` whose label and uuid components are derived from the file
        name by ``get_label`` and ``get_uuid``.
    """
    filename = Path(img_path).name
    # Coerce base_dir so a plain string root works with the "/" operator too.
    return Path(base_dir) / get_label(filename) / get_uuid(filename) / filename




In [None]:
def normalize_label(lbl):
    """Return the canonical lowercase string for a label.

    Accepts an ``Enum`` member (its ``value`` is used), a string such as
    ``"Label.DOG"`` (the ``"Label."`` prefix is stripped) or any other object,
    which is converted with ``str`` before lower-casing. Non-string inputs no
    longer raise ``AttributeError``.
    """
    if isinstance(lbl, Enum):
        return str(lbl.value).lower()
    text = str(lbl)
    if text.startswith("Label."):
        # Keep everything after the first dot, matching the later definitions.
        text = text.split(".", 1)[1]
    return text.lower()

def encode_labels(labels, class_to_idx):
    """Convert raw labels into an ``int64`` array of class indices.

    Every label is first passed through ``normalize_label``; the normalised
    names are then looked up in ``class_to_idx``. A name missing from the
    mapping raises ``KeyError``.
    """
    normalized = list(map(normalize_label, labels))
    indices = [class_to_idx[name] for name in normalized]
    return np.array(indices, dtype=np.int64)

# Class names in Label declaration order, and their position-based integer ids.
classes = [member.value for member in Label]
class_to_idx = dict(zip(classes, range(len(classes))))
print(class_to_idx)

In [None]:
def count_oov_pct(sequences, oov_id):
    """Return the percentage of non-padding tokens that equal ``oov_id``.

    Args:
        sequences: Iterable of token-id sequences; id 0 is treated as padding
            and excluded from the denominator.
        oov_id: Token id that marks an out-of-vocabulary word.

    Returns:
        ``oov / total * 100`` as a float, or ``0.0`` when there is no
        non-padding token.

    The counts are gathered in a single pass, so one-shot iterables
    (generators, iterators) give the same result as lists.
    """
    n_tokens = 0
    n_oov = 0
    for seq in sequences:
        for token_id in seq:
            if token_id != 0:
                n_tokens += 1
            if token_id == oov_id:
                n_oov += 1
    if n_tokens > 0:
        return n_oov / n_tokens * 100
    return 0.0

In [None]:
def normalize_label(lbl):
    """Return the lowercase string form of a label.

    Objects exposing a ``value`` attribute (such as Enum members) contribute
    that value; anything else is converted with ``str`` and a leading
    ``"Label."`` prefix is removed before lower-casing.
    """
    if not hasattr(lbl, "value"):
        prefix = "Label."
        text = str(lbl)
        if text.startswith(prefix):
            text = text[len(prefix):]
        return text.lower()
    return str(lbl.value).lower()


In [None]:
def encode_labels(labels, mapping):
    """Turn raw labels into an ``int64`` NumPy array of class indices.

    Labels are normalised with ``normalize_label`` first, then each normalised
    name is looked up in ``mapping`` (``KeyError`` if it is absent).
    """
    normalized = [normalize_label(item) for item in labels]
    codes = []
    for key in normalized:
        codes.append(mapping[key])
    return np.asarray(codes, dtype=np.int64)

### PREPROCESSING METHODS

In [None]:
def texts_to_padded_ids(texts, tok, max_vocab, seq_len, pad_id=0, oov_id=1):
    """Tokenise ``texts`` and return fixed-length ``int64`` id sequences.

    Args:
        texts: Texts handed to ``tok.texts_to_sequences``.
        tok: Tokenizer object providing ``texts_to_sequences``.
        max_vocab: Ids outside ``[0, max_vocab - 1]`` are replaced by ``oov_id``.
        seq_len: Output length; sequences are padded and truncated at the end.
        pad_id: Value used for padding.
        oov_id: Replacement for out-of-range ids.

    Returns:
        The array produced by ``pad_sequences``, cast to ``np.int64``.
    """
    def clamp(token_id):
        # Keep ids inside the allowed range, map everything else to OOV.
        return token_id if 0 <= token_id < max_vocab else oov_id

    clamped = [
        [clamp(token_id) for token_id in seq]
        for seq in tok.texts_to_sequences(texts)
    ]
    padded = pad_sequences(clamped, maxlen=seq_len, padding="post", truncating="post", value=pad_id)
    return padded.astype(np.int64)


### SMALLBERT MODEL

In [None]:
class PositionalEmbedding(nn.Module):
    """Sum of a learned token embedding and a learned position embedding."""

    def __init__(self, sequence_length: int, vocab_size: int, embed_dim: int):
        super().__init__()
        # One row per token id, and one row per position index below sequence_length.
        self.token_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.position_embeddings = nn.Embedding(sequence_length, embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Embed a 2-D tensor of token ids ``(batch, seq_len)``; the result adds a trailing embed_dim axis."""
        n_rows, n_steps = x.shape
        step_ids = torch.arange(n_steps, device=x.device)
        # Same position indices 0..n_steps-1 for every row of the batch.
        position_grid = step_ids.repeat(n_rows, 1)
        token_part = self.token_embeddings(x)
        position_part = self.position_embeddings(position_grid)
        return token_part + position_part
    
class TransformerBlock(nn.Module):
    """Self-attention followed by a feed-forward network.

    Each sub-layer's output goes through dropout, is added to its input and
    then layer-normalised.
    """

    def __init__(self, embed_dim: int, num_heads: int, ff_dim: int, dropout_rate: float = 0.1) -> None:
        super().__init__()
        # Attribute names and creation order are kept as-is: they define the
        # state_dict keys and the order in which parameters are initialised.
        self.att = nn.MultiheadAttention(
            embed_dim,
            num_heads,
            dropout=dropout_rate,
            batch_first=True,
        )
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x: torch.Tensor, padding_mask: torch.Tensor = None) -> torch.Tensor:
        """Apply the block to ``x``; ``padding_mask`` is forwarded as ``key_padding_mask``."""
        # nn.MultiheadAttention returns (output, weights); only the output is used.
        attended = self.att(x, x, x, key_padding_mask=padding_mask)[0]
        hidden = self.layernorm1(x + self.dropout1(attended))
        return self.layernorm2(hidden + self.dropout2(self.ffn(hidden)))


In [None]:
class SmallBERT(nn.Module):
    """Encoder: token+position embedding, a stack of TransformerBlocks, LayerNorm, dropout."""

    def __init__(self, sequence_length: int, vocab_size: int, embed_dim: int,
                 num_heads: int, ff_dim: int, num_layers: int) -> None:
        super().__init__()
        self.pos_embedding = PositionalEmbedding(sequence_length, vocab_size, embed_dim)
        layers = []
        for _ in range(num_layers):
            layers.append(TransformerBlock(embed_dim, num_heads, ff_dim))
        self.blocks = nn.ModuleList(layers)
        self.layernorm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode token ids ``x`` of shape ``(batch, seq_len)``.

        Positions whose id is 0 are treated as padding and masked out as
        attention keys in every block.
        """
        pad_positions = x.eq(0)  # True where the token is PAD (id 0)
        hidden = self.pos_embedding(x)
        for layer in self.blocks:
            hidden = layer(hidden, padding_mask=pad_positions)
        return self.dropout(self.layernorm(hidden))


In [None]:
class SmallBERTPourClassification(nn.Module):
    """SmallBERT encoder with a mean-pooling classification head.

    The pooled sentence vector is the mean of the encoder outputs over
    non-padding positions only, so the logits do not depend on how much
    padding a sequence carries.
    """

    # Token id treated as padding; must match the id masked inside SmallBERT.
    PAD_ID = 0

    def __init__(self, sequence_length: int, vocab_size: int, embed_dim: int,
                 num_heads: int, ff_dim: int, num_layers: int,
                 num_classes: int = 4) -> None:
        super().__init__()

        self.encoder = SmallBERT(sequence_length, vocab_size, embed_dim, num_heads, ff_dim, num_layers)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return unnormalised class logits for token ids ``x`` of shape ``(batch, seq_len)``."""
        enc = self.encoder(x)                                        # (batch, seq_len, embed_dim)
        keep = (x != self.PAD_ID).unsqueeze(-1).to(enc.dtype)         # (batch, seq_len, 1)
        # Masked mean: sum real tokens only, divide by their count.
        # clamp avoids a 0/0 division for rows made only of padding.
        token_counts = keep.sum(dim=1).clamp(min=1.0)
        pooled = (enc * keep).sum(dim=1) / token_counts
        pooled = self.dropout(pooled)
        # Raw logits: no softmax here, CrossEntropyLoss applies it internally.
        return self.classifier(pooled)


### TRAINING METHODS

In [None]:
class EarlyStopping:
    """Tracks a metric where higher is better and flags when it stops improving.

    ``should_stop`` becomes True once ``patience`` consecutive calls to
    ``step`` have reported a metric below ``best_score + min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best_score = None
        self.counter = 0
        self.should_stop = False

    def step(self, metric):
        """Record one new metric value and update the stopping state."""
        if self.best_score is None:
            # First observation only initialises the reference score.
            self.best_score = metric
            return

        threshold = self.best_score + self.min_delta
        if not (metric < threshold):
            # Improvement (or tie when min_delta is 0): new reference, reset the count.
            self.best_score = metric
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.should_stop = True


In [None]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate(model, loader, device):
    """Return ``(accuracy, macro_f1)`` of ``model`` over every batch of ``loader``.

    The model is switched to eval mode and run without gradient tracking;
    ``loader`` must yield ``(inputs, targets)`` pairs.
    """
    model.eval()
    y_pred, y_true = [], []
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_pred = model(inputs).argmax(dim=1)
            y_pred.extend(batch_pred.cpu().numpy())
            y_true.extend(targets.cpu().numpy())
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return accuracy, macro_f1

### VISUALISATION

In [None]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc
)
from sklearn.preprocessing import label_binarize
def plot_losses(train_losses, test_losses):
    """Plot the per-epoch train and test loss curves on one figure and show it.

    Args:
        train_losses: Sequence of training losses, one value per epoch.
        test_losses: Sequence of test losses, one value per epoch.
    """
    plt.figure(figsize=(8, 5))
    plt.plot(train_losses, label="Train loss")
    plt.plot(test_losses, label="Test loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training vs Test Loss")
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.show()


def plot_confusion_matrix(y_true, y_pred, class_names=None, title="Confusion Matrix"):
    """Compute the confusion matrix of ``y_pred`` against ``y_true`` and display it.

    ``class_names`` (optional) supplies the tick labels; ``title`` is the figure title.
    """
    display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_true, y_pred),
        display_labels=class_names,
    )
    display.plot(cmap="Blues", values_format="d", xticks_rotation=45)
    plt.title(title)
    plt.tight_layout()
    plt.show()


def plot_multiclass_roc(y_true, y_prob, num_classes, class_names=None, title="ROC (One-vs-Rest)"):
    """Plot one one-vs-rest ROC curve per class, with its AUC in the legend.

    Args:
        y_true: [N] integer class labels in ``0..num_classes-1``.
        y_prob: [N, C] class probabilities (array or nested sequence).
        num_classes: Number of classes C.
        class_names: Optional legend names, indexed by class id.
        title: Figure title.

    The one-hot targets are built by direct comparison rather than with
    ``label_binarize``, which returns a single column for two-class problems
    and would make the per-class indexing fail.
    """
    y_true = np.asarray(y_true).reshape(-1)
    y_prob = np.asarray(y_prob)
    # [N, C] indicator matrix: column c is 1 where the true label is c.
    y_true_bin = (y_true[:, None] == np.arange(num_classes)[None, :]).astype(int)

    plt.figure(figsize=(8, 6))
    for c in range(num_classes):
        fpr, tpr, _ = roc_curve(y_true_bin[:, c], y_prob[:, c])
        roc_auc = auc(fpr, tpr)
        name = class_names[c] if class_names is not None else f"Class {c}"
        plt.plot(fpr, tpr, label=f"{name} (AUC={roc_auc:.3f})")

    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], linestyle="--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.show()

### DISTILBERT MODEL CLASS

In [None]:
class TextClsDataset(Dataset):
    """Text-classification dataset that tokenises one sample per access.

    Each item is a dict with a fixed layout:
    - idx: index of the sample in the dataset
    - input_ids: (L,)
    - attention_mask: (L,)
    - labels: scalar
    - text: str (only when ``return_text`` is True; handy for debugging)
    """

    def __init__(self, texts, labels, tokenizer, max_len=128, return_text=True):
        self.texts = list(texts)
        self.labels = np.asarray(labels, dtype=np.int64)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.return_text = return_text

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        # The tokenizer is called with padding/truncation to max_len and "pt" tensors.
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        sample = {"idx": torch.tensor(idx, dtype=torch.long)}
        for field in ("input_ids", "attention_mask"):
            # Drop the leading batch axis of size 1 added by the tokenizer.
            sample[field] = encoded[field].squeeze(0).to(torch.long)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        if self.return_text:
            sample["text"] = raw_text
        return sample

def make_text_loaders(X_train, y_train, X_val, y_val, X_test, y_test,
                      model_name="distilbert-base-uncased", max_len=128,
                      batch_size=32):
    """Build the tokenizer and the train/val/test DataLoaders for a Hugging Face model.

    Args:
        X_train, X_val, X_test: Iterables of raw texts.
        y_train, y_val, y_test: Integer class ids aligned with the texts.
        model_name: Checkpoint whose tokenizer is loaded, so the produced ids
            match the vocabulary of the model that will consume them.
        max_len: Padded/truncated sequence length.
        batch_size: Batch size of the three loaders.

    Returns:
        ``(tokenizer, train_loader, val_loader, test_loader)``; only the train
        loader shuffles.
    """
    # TextClsDataset calls the tokenizer directly with truncation/padding/
    # return_tensors arguments, which requires a Hugging Face tokenizer; an
    # unfitted Keras Tokenizer is not callable and has the wrong vocabulary.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_ds = TextClsDataset(X_train, y_train, tokenizer, max_len=max_len)
    val_ds = TextClsDataset(X_val, y_val, tokenizer, max_len=max_len)
    test_ds = TextClsDataset(X_test, y_test, tokenizer, max_len=max_len)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    return tokenizer, train_loader, val_loader, test_loader


### DISTILBERT TRAINING METHODS


In [None]:
def to_device_batch(batch, device):
    """Return a new dict where every tensor value of ``batch`` is moved to ``device``.

    Non-tensor values (for example lists of strings) are passed through untouched.
    """
    return {
        key: (value.to(device, non_blocking=True) if torch.is_tensor(value) else value)
        for key, value in batch.items()
    }

def forward_text(model, batch, arch="hf"):
    """Run ``model`` on ``batch`` and return its logits, whatever the architecture.

    Supported values of ``arch``:
      - "hf": the model is called with ``input_ids`` and ``attention_mask``
        keyword arguments and the ``.logits`` attribute of its output is returned.
      - "smallbert": the model is called with ``batch["input_ids"]`` only and
        its return value is used as the logits.

    Raises:
        ValueError: for any other ``arch``.
    """
    if arch == "smallbert":
        # SmallBERT-style models take the id tensor positionally.
        return model(batch["input_ids"])
    if arch != "hf":
        raise ValueError(f"arch inconnu: {arch}")
    outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
    return outputs.logits

def train_one_epoch_text(model, loader, optimizer, criterion, device, epoch, epochs, arch="hf"):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Each batch is moved to ``device``, run through ``forward_text`` and scored
    with ``criterion`` against ``batch["labels"]``. Gradients are clipped to a
    norm of 1.0 before the optimizer step. ``epoch``/``epochs`` are only used
    for the progress-bar caption.
    """
    model.train()
    loss_sum = 0.0

    progress = tqdm(loader, total=len(loader), desc=f"Epoch {epoch+1}/{epochs} [TRAIN]")
    for step, raw_batch in enumerate(progress, start=1):
        batch = to_device_batch(raw_batch, device)
        optimizer.zero_grad(set_to_none=True)

        loss = criterion(forward_text(model, batch, arch=arch), batch["labels"])
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        loss_sum += loss.item()
        # Running mean of the loss over the batches seen so far.
        progress.set_postfix(loss=loss_sum / step)

    return loss_sum / len(loader)

@torch.no_grad()
def evaluate_text(model, loader, criterion, device, epoch, epochs, num_classes, arch="hf",
                  class_names=None, n_mistakes_to_print=5):
    """Evaluate ``model`` on ``loader`` and collect losses, labels and predictions.

    Args:
        model, loader, criterion, device, arch: as in ``train_one_epoch_text``.
        epoch, epochs: only used for the progress-bar caption.
        num_classes: accepted for interface compatibility; not used in the body.
        class_names: optional names used when printing misclassified samples.
        n_mistakes_to_print: maximum number of misclassified samples printed
            (only possible when batches carry a ``"text"`` entry).

    Returns:
        ``(avg_loss, y_true, y_pred, y_prob, acc)`` where the three arrays are
        NumPy arrays concatenated over all batches and ``acc`` is a float.
    """
    def display_name(class_id):
        # Class name when class_names is provided, else the index as text.
        return class_names[class_id] if class_names else str(class_id)

    model.eval()
    loss_sum = 0.0
    shown = 0
    true_chunks, pred_chunks, prob_chunks = [], [], []

    progress = tqdm(loader, total=len(loader), desc=f"Epoch {epoch+1}/{epochs} [EVAL]")
    for raw_batch in progress:
        batch = to_device_batch(raw_batch, device)
        targets = batch["labels"]

        logits = forward_text(model, batch, arch=arch)
        loss_sum += criterion(logits, targets).item()

        probs = torch.softmax(logits, dim=1)
        preds = probs.argmax(dim=1)

        true_chunks.append(targets.detach().cpu())
        pred_chunks.append(preds.detach().cpu())
        prob_chunks.append(probs.detach().cpu())

        # Optional debugging aid: print a few misclassified texts.
        if "text" not in batch or shown >= n_mistakes_to_print:
            continue
        wrong = (preds != targets).detach().cpu()
        for pos in torch.where(wrong)[0].tolist():
            if shown >= n_mistakes_to_print:
                break
            true_i = int(targets[pos].item())
            pred_i = int(preds[pos].item())
            text = batch["text"][pos]
            true_name = display_name(true_i)
            pred_name = display_name(pred_i)
            print("\n--- Mauvaise prédiction (texte) ---")
            print(f"true: {true_name} ({true_i}) | pred: {pred_name} ({pred_i})")
            print(f"text: {text[:300]}{'...' if len(text) > 300 else ''}")
            shown += 1

    avg_loss = loss_sum / len(loader)
    true_all = torch.cat(true_chunks)
    pred_all = torch.cat(pred_chunks)
    prob_all = torch.cat(prob_chunks)
    acc = float((pred_all == true_all).float().mean().item())

    return avg_loss, true_all.numpy(), pred_all.numpy(), prob_all.numpy(), acc


In [None]:
def ensure_dir(path: "Path | str") -> None:
    """Create directory ``path`` together with any missing parents.

    Accepts a ``Path`` or a path string. Nothing happens (and no error is
    raised) when the directory already exists.
    """
    Path(path).mkdir(parents=True, exist_ok=True)

def fit_text(
    model,
    train_loader,
    test_loader,
    num_classes=4,
    epochs=10,
    lr=2e-5,
    patience=3,
    models_dir=Path("../models"),
    best_name="best_text_model.pth",
    criterion=None,
    class_names=None,
    arch="hf",                 # "hf"  "smallbert"
    optimizer_name="adamw"     # "adamw"  "sgd"
):
    """Train a text classifier with early stopping, then evaluate and plot the best state.

    Args:
        model: Model compatible with ``forward_text`` for the given ``arch``.
        train_loader: Loader of training batches (dicts with ``"labels"``).
        test_loader: Loader evaluated after every epoch. Its loss drives both
            checkpoint selection and early stopping, so metrics reported on it
            are not an unbiased estimate if it is the final test split.
        num_classes: Number of classes, used for the report and ROC plot.
        epochs: Maximum number of epochs.
        lr: Learning rate.
        patience: Epochs without loss improvement tolerated before stopping.
        models_dir: Directory (``Path`` or string) receiving the checkpoint.
        best_name: File name of the checkpoint.
        criterion: Loss function; defaults to ``nn.CrossEntropyLoss()``.
        class_names: Optional class names for reports and plots.
        arch: ``"hf"`` or ``"smallbert"`` (see ``forward_text``).
        optimizer_name: ``"sgd"`` selects SGD with momentum 0.9, anything else AdamW.

    Returns:
        ``(model, logs)`` where ``logs`` holds ``train_losses``,
        ``test_losses`` and ``best_test_loss``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    if criterion is None:
        criterion = nn.CrossEntropyLoss()

    if optimizer_name.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    else:
        optimizer = optim.AdamW(model.parameters(), lr=lr)

    # Accept plain strings as well as Path objects.
    models_dir = Path(models_dir)
    ensure_dir(models_dir)
    best_model_path = models_dir / best_name

    best_test_loss = float("inf")
    patience_counter = 0
    # Tracks whether THIS run wrote the checkpoint, so a stale file left by a
    # previous run is never loaded by mistake.
    checkpoint_saved = False
    # Explicit label list keeps the report aligned with class_names even when
    # a class is absent from the evaluated split.
    report_labels = list(range(num_classes))

    train_losses, test_losses = [], []

    for epoch in range(epochs):
        train_loss = train_one_epoch_text(
            model, train_loader, optimizer, criterion, device, epoch, epochs, arch=arch
        )
        train_losses.append(train_loss)
        print(f"\nEpoch {epoch+1} - Average TRAIN loss: {train_loss:.4f}")

        test_loss, y_true, y_pred, y_prob, acc = evaluate_text(
            model, test_loader, criterion, device, epoch, epochs, num_classes=num_classes,
            arch=arch, class_names=class_names, n_mistakes_to_print=0
        )
        test_losses.append(test_loss)

        print(f"Epoch {epoch+1} - Average TEST loss: {test_loss:.4f} | acc={acc:.4f}")
        print("\nClassification Report:")
        print(classification_report(
            y_true, y_pred, labels=report_labels, digits=3,
            target_names=class_names, zero_division=0
        ))

        if test_loss < best_test_loss:
            best_test_loss = test_loss
            patience_counter = 0
            torch.save(model.state_dict(), best_model_path)
            checkpoint_saved = True
            print(f"Nouveau meilleur modèle sauvegardé (test loss: {best_test_loss:.4f})")
        else:
            patience_counter += 1
            print(f"Pas d'amélioration ({patience_counter}/{patience})")
            if patience_counter >= patience:
                print(f"\nEarly stopping déclenché après {epoch+1} époques")
                print(f"Meilleur test loss: {best_test_loss:.4f}")
                break

    if checkpoint_saved:
        print(f"\nChargement du meilleur modèle (test loss: {best_test_loss:.4f})")
        model.load_state_dict(torch.load(best_model_path, map_location=device))
    else:
        # No epoch ran or the loss never improved (e.g. NaN): keep current weights.
        print("\nAucun checkpoint sauvegardé pendant cet entraînement ; évaluation du modèle courant.")

    final_test_loss, y_true, y_pred, y_prob, acc = evaluate_text(
        model, test_loader, criterion, device, epoch=0, epochs=1,
        arch=arch, class_names=class_names, n_mistakes_to_print=5, num_classes=num_classes
    )
    print(f"Best model - TEST loss: {final_test_loss:.4f} | acc={acc:.4f}")

    # Diagnostic plots for the selected model.
    plot_losses(train_losses, test_losses)
    plot_confusion_matrix(y_true, y_pred, class_names=class_names, title="Confusion Matrix (Best Model)")
    plot_multiclass_roc(y_true, y_prob, num_classes=num_classes, class_names=class_names, title="ROC (Best Model, OvR)")

    return model, {"train_losses": train_losses, "test_losses": test_losses, "best_test_loss": best_test_loss}


### DATASET LOADING

In [None]:
# Load the dataset metadata table and take a quick look at its layout.
metadata_path = Path("../data/final_dataset/metadata.csv")
df = pd.read_csv(metadata_path)
print(df.columns)
print(df.iloc[1])  # row at position 1, shown as a sample record

In [None]:
# Keep one row per distinct caption.
df_clean = df.drop_duplicates(subset="caption").reset_index(drop=True)

# 70% train, then the remaining 30% split in half: 15% test / 15% validation.
# The split is seeded but not stratified on the label.
df_train, df_temp = train_test_split(df_clean, test_size=0.3, random_state=11)
df_test, df_val = train_test_split(df_temp, test_size=0.5, random_state=11)

# Class proportions (in %) of each split, to check their balance by eye.
print(df_train["label"].value_counts(normalize=True) * 100)
print(df_val["label"].value_counts(normalize=True) * 100)
print(df_test["label"].value_counts(normalize=True) * 100)

# X_* is first bound to the image paths here ...
X_train, y_train, caption_train = df_train["image_path"], df_train["label"], df_train["caption"]
X_val, y_val, caption_val  = df_val["image_path"], df_val["label"], df_val["caption"]
X_test, y_test, caption_test   = df_test["image_path"], df_test["label"], df_test["caption"]

# ... and immediately rebound to the captions: the text is the model input.
# Indices are reset so positional access such as X_train[0] works.
X_train = caption_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_val = caption_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
X_test = caption_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Row counts before/after de-duplication.
print("Avant :", len(df))
print("Après  :", len(df_clean))
print("Doublons supprimés :", len(df) - len(df_clean))



In [None]:
# First tokenizer: capped at 5000 words, with an explicit out-of-vocabulary token.
vocab_size = 5000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
# Fitted on the caption column of the full `df` (before de-duplication and splitting).
tokenizer.fit_on_texts(df["caption"])
# +1 on top of the number of entries in word_index.
# presumably to account for index 0 (padding) — TODO confirm
actual_vocab_size = len(tokenizer.word_index) + 1
print(f"   taille de vocabulaire: {len(tokenizer.word_index)}")
print(f"   vocab_size: {actual_vocab_size}")

In [None]:
# Convert every caption of each split into a list of token ids with the tokenizer fitted above.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_val_seq = tokenizer.texts_to_sequences(X_val)

In [None]:
# Sanity check: first caption of the train and test splits next to its token-id sequence.
print(X_train[0])
print(X_train_seq[0])
print(X_test[0])
print(X_test_seq[0])

In [None]:
# Total number of out-of-vocabulary tokens across the validation sequences.
oov_id = tokenizer.word_index["<OOV>"]
count = sum(seq.count(oov_id) for seq in X_val_seq)

print(count)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# `max_seq_len` was used here without ever being defined (NameError on a
# fresh kernel). Pad/truncate every split to the longest tokenised training caption.
max_seq_len = max((len(seq) for seq in X_train_seq), default=1)

X_train_padded = pad_sequences(X_train_seq, maxlen=max_seq_len, padding='post', truncating='post')
X_val_padded = pad_sequences(X_val_seq, maxlen=max_seq_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_seq_len, padding='post', truncating='post')


In [None]:
# Encoding settings for the small model.
MAX_VOCAB = 1000     # allowed token ids: 0..999
PAD_ID = 0
OOV_TOKEN = "<OOV>"
SEQ_LEN = 104 

# Rebinds `tokenizer` to a new, smaller-vocabulary tokenizer fitted on the
# de-duplicated captions (all splits).
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(df_clean["caption"])

oov_id = tokenizer.word_index[OOV_TOKEN]
print("oov_id:", oov_id)

# Overwrites the *_padded arrays with fixed-length, range-clamped id sequences.
X_train_padded = texts_to_padded_ids(X_train, tokenizer, MAX_VOCAB, SEQ_LEN, PAD_ID, oov_id)
X_val_padded   = texts_to_padded_ids(X_val,   tokenizer, MAX_VOCAB, SEQ_LEN, PAD_ID, oov_id)
X_test_padded  = texts_to_padded_ids(X_test,  tokenizer, MAX_VOCAB, SEQ_LEN, PAD_ID, oov_id)

# Per-word occurrence counts gathered during fitting, and their grand total;
# both are read by coverage() below.
word_counts = tokenizer.word_counts
total = sum(word_counts.values())

def coverage(k):
    """Return the share of all counted word occurrences covered by the ``k`` most frequent words.

    Reads the module-level ``word_counts`` and ``total``.
    """
    ranked_counts = sorted(word_counts.values(), reverse=True)
    return sum(ranked_counts[:k]) / total

# Print the occurrence coverage (in %) reached by several vocabulary sizes.
for k in [1000, 2000, 3000, 5000]:
    print(k, f"{coverage(k)*100:.1f}%")

# Rebuild the class list (lower-cased) and the name -> index mapping.
classes = [lbl.value for lbl in Label]
classes = [c.lower() for c in classes]
class_to_idx = {c: i for i, c in enumerate(classes)}
print("class_to_idx:", class_to_idx)

def normalize_text(t):
    """Lower-case a caption and strip digits, punctuation and extra whitespace.

    Digit runs and every character that is neither a word character nor
    whitespace are replaced by a space; whitespace runs are then collapsed to
    a single space and the result is trimmed. Underscores count as word
    characters and are kept.

    The lemmatisation lines that used to follow the ``return`` were
    unreachable (and referenced an undefined ``lemmatizer``); they are removed.
    """
    t = t.lower()
    t = re.sub(r"\d+", " ", t)      # drop numbers
    t = re.sub(r"[^\w\s]", " ", t)  # punctuation -> space
    return re.sub(r"\s+", " ", t).strip()


# Normalise the captions of df_clean in place.
# NOTE(review): the splits (X_train, ...) and the tokenizer fitted earlier in
# this cell were derived from df_clean before this line, so they presumably
# still hold the raw captions on a top-to-bottom run — confirm the intended order.
df_clean["caption"] = df_clean["caption"].apply(normalize_text)
# Bare expression: it is not the last statement of the cell, so nothing is displayed.
df_clean["caption"].iloc[1]

# Caption length in whitespace-separated words.
lengths = df_clean["caption"].apply(lambda x: len(x.split()))

print(lengths.describe())

def normalize_label(lbl):
    """Return ``str(lbl)`` in lowercase, without a leading ``"Label."`` prefix."""
    prefix = "Label."
    text = str(lbl)
    stripped = text[len(prefix):] if text.startswith(prefix) else text
    return stripped.lower()

# Normalise the label column of df_clean in place (lowercase, no "Label." prefix).
df_clean["label"] = df_clean["label"].apply(normalize_label)

# Caption-length statistics (in words): 95th/98th percentiles and maximum.
p95 = int(lengths.quantile(0.95))
p98 = int(lengths.quantile(0.98))
max_len = int(lengths.max())

print(f"p95={p95}, p98={p98}, max={max_len}")


In [None]:

# Integer-encode the labels of each split with the class_to_idx mapping.
y_train_np = encode_labels(y_train, class_to_idx)
y_val_np   = encode_labels(y_val, class_to_idx)
y_test_np  = encode_labels(y_test, class_to_idx)
num_classes = len(classes)


In [None]:
# Share of out-of-vocabulary tokens per split.
# NOTE(review): X_*_seq were produced by the earlier 5000-word tokenizer while
# `tokenizer` now refers to the MAX_VOCAB one — presumably these percentages
# describe the earlier encoding rather than the padded model inputs; confirm.
oov_id = tokenizer.word_index.get("<OOV>", 1)

print(f"\n<OOV>pourcentage:")
print(f"  Train: {count_oov_pct(X_train_seq, oov_id):.1f}%")
print(f"  Val:   {count_oov_pct(X_val_seq, oov_id):.1f}%")
print(f"  Test:  {count_oov_pct(X_test_seq, oov_id):.1f}%")

In [None]:
# Padded id arrays -> long tensors.
X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
X_val_t   = torch.tensor(X_val_padded,   dtype=torch.long)
X_test_t  = torch.tensor(X_test_padded,  dtype=torch.long)

# Encoded labels -> long tensors.
y_train_t = torch.tensor(y_train_np, dtype=torch.long)
y_val_t   = torch.tensor(y_val_np,   dtype=torch.long)
y_test_t  = torch.tensor(y_test_np,  dtype=torch.long)

# Loaders yielding (ids, label) pairs in batches of 32; only the training loader shuffles.
train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=32, shuffle=True)
val_loader   = DataLoader(TensorDataset(X_val_t,   y_val_t),   batch_size=32, shuffle=False)
test_loader  = DataLoader(TensorDataset(X_test_t,  y_test_t),  batch_size=32, shuffle=False)


In [None]:
# train_ds = TensorDataset(torch.tensor(X_train_padded), torch.tensor(y_train_np))
# val_ds   = TensorDataset(torch.tensor(X_val_padded), torch.tensor(y_val_np))
# test_ds  = TensorDataset(torch.tensor(X_test_padded), torch.tensor(y_test_np))

# train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
# val_loader   = DataLoader(val_ds, batch_size=32)
# test_loader  = DataLoader(test_ds, batch_size=32)

### SMALLBERT TRAINING

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model hyper-parameters. SEQ_LEN is rebound here: it sizes the model's
# position table (it was 104 when the inputs were padded earlier).
MAX_VOCAB = 1000      
SEQ_LEN = 128         
NUM_CLASSES = 4

model = SmallBERTPourClassification(
    vocab_size=MAX_VOCAB,      
    sequence_length=SEQ_LEN,   
    embed_dim=128,
    num_heads=4,
    ff_dim=256,
    num_layers=2,
    num_classes=NUM_CLASSES
).to(device)

# The freshly initialised model is only evaluated in this cell.
model.eval()

# Accumulators filled over the validation loader.
all_preds, all_labels = [], []
all_probs = []
embeddings = []
labels = []


with torch.no_grad():
    for x, y in val_loader:
        x = x.to(device)
        enc = model.encoder(x)  # (batch, seq_len, embed_dim)
        emb = enc.mean(dim=1).cpu().numpy()  # (batch, embed_dim): mean over the sequence axis

        embeddings.append(emb)
        labels.extend(y.numpy())

        # Second forward pass, this time through the full classifier.
        logits = model(x)
        probs = F.softmax(logits, dim=-1)  
        pred = probs.argmax(dim=1)
        
        all_preds.extend(pred.cpu().numpy())
        all_probs.extend(probs.cpu().numpy())
        all_labels.extend(y.numpy())

embeddings = np.vstack(embeddings)
labels = np.array(labels)
all_probs = np.array(all_probs)


# 2-D t-SNE projection of the pooled embeddings, coloured by class id.
# No random_state is passed, so the layout may differ between runs.
tsne = TSNE(n_components=2, learning_rate="auto", init="pca")
emb2d = tsne.fit_transform(embeddings)

plt.figure(figsize=(7,6))
plt.scatter(emb2d[:,0], emb2d[:,1], c=labels, cmap="tab10", s=12)
plt.title("t-SNE sur embeddings (avant entraînement)")
plt.colorbar()
plt.show()


In [None]:
# Training setup: cross-entropy loss, Adam optimizer, and early stopping
# driven by the validation macro-F1 (see early.step below).
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

early = EarlyStopping(patience=10)

train_losses = []
val_losses = []

# The checkpoint is written whenever the validation LOSS improves.
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)
best_model_path = models_dir / "best-model-smallbert.pth"
best_val_loss = float("inf")

for epoch in range(100):
    # ---- training pass ----
    model.train()
    total_loss = 0

    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # ---- validation pass ----
    model.eval()
    all_preds, all_true = [], []
    val_loss = 0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = criterion(logits, y)
            val_loss += loss.item()
            pred = logits.argmax(dim=1)
            all_preds.extend(pred.cpu().numpy())
            all_true.extend(y.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    val_acc = accuracy_score(all_true, all_preds)
    val_f1 = f1_score(all_true, all_preds, average="macro")

    print(f"Epoch {epoch+1} | train_loss={avg_train_loss:.4f} | val_loss={avg_val_loss:.4f} | acc={val_acc:.4f} | f1={val_f1:.4f}")

    # Keep the weights with the lowest validation loss seen so far.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), best_model_path)
        print(f" Meilleur modèle sauvegardé (val_loss: {best_val_loss:.4f})")

    # Stopping criterion uses macro-F1, a different metric than the one used for checkpointing.
    early.step(val_f1)
    if early.should_stop:
        print("Early stopping triggered.")
        break



In [None]:
# Restore the checkpoint with the lowest validation loss from the loop above.
print(f"\nChargement du meilleur modèle (val_loss: {best_val_loss:.4f})")
model.load_state_dict(torch.load(best_model_path, map_location=device))

# Per-epoch training and validation loss curves.
plt.figure(figsize=(8, 5))
plt.plot(train_losses, label="Train loss")
plt.plot(val_losses, label="Validation loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training vs Validation Loss - SmallBERT")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Continue training the already-trained SmallBERT through the generic fit_text loop.
# NOTE(review): fit_text uses its second loader for checkpoint selection and
# early stopping, so passing test_loader lets the test split steer training —
# confirm this is intended.
# The returned (model, logs) tuple is not captured here.
fit_text(
    model,
    train_loader,
    test_loader,
    arch="smallbert",
    class_names=["dog", "bike", "ball", "water"],
    num_classes=4,
    lr=1e-4,
    epochs=20
)


### SMALLBERT EVAL

In [None]:
model.eval()

# Accumulators filled over the evaluated split.
all_preds, all_labels = [], []
all_probs = []
embeddings = []
labels = []


# The metrics below are reported as *test* metrics, so they are computed on
# test_loader (this loop previously iterated val_loader).
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        enc = model.encoder(x)  # (batch, seq_len, embed_dim)
        emb = enc.mean(dim=1).cpu().numpy()  # (batch, embed_dim)

        embeddings.append(emb)
        labels.extend(y.numpy())

        logits = model(x)
        probs = F.softmax(logits, dim=-1)
        pred = probs.argmax(dim=1)

        all_preds.extend(pred.cpu().numpy())
        all_probs.extend(probs.cpu().numpy())
        all_labels.extend(y.numpy())

embeddings = np.vstack(embeddings)
labels = np.array(labels)
all_probs = np.array(all_probs)

test_acc = accuracy_score(all_labels, all_preds)
test_f1 = f1_score(all_labels, all_preds, average="macro")
cm = confusion_matrix(all_labels, all_preds)
# Class names ordered by their integer id.
idx_to_class = {v: k for k, v in class_to_idx.items()}
class_names = [idx_to_class[i] for i in range(len(idx_to_class))]

print("Test accuracy:", test_acc)
print("Test F1:", test_f1)

# Confusion matrix
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap="Blues", values_format="d", ax=ax, xticks_rotation=45)
plt.title("Confusion Matrix - SmallBERT")
plt.tight_layout()
plt.show()

# One-vs-rest ROC curves
num_classes = len(class_names)
y_true_bin = label_binarize(all_labels, classes=list(range(num_classes)))

plt.figure(figsize=(8, 6))
for c in range(num_classes):
    fpr, tpr, _ = roc_curve(y_true_bin[:, c], all_probs[:, c])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"{class_names[c]} (AUC={roc_auc:.3f})")

plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve (One-vs-Rest) - SmallBERT")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

### DISTILBERT TRAINING

In [None]:

MODEL_NAME = "distilbert-base-uncased"  # or a multilingual checkpoint if the captions are French
# Rebinds tokenizer/train_loader/val_loader/test_loader: from here on the
# loaders yield dict batches built from the raw captions, not the SmallBERT tensors.
tokenizer, train_loader, val_loader, test_loader = make_text_loaders(
    X_train, y_train_np, X_val, y_val_np, X_test, y_test_np,
    model_name=MODEL_NAME, max_len=128, batch_size=32
)

# Pretrained DistilBERT with a 4-way classification head; rebinds `model`.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4)

class_names = ["dog", "bike", "ball", "water"]
# NOTE(review): test_loader is the split fit_text uses for checkpoint
# selection and early stopping; val_loader is unused here — confirm intent.
model, logs = fit_text(
    model,
    train_loader,
    test_loader,
    num_classes=4,
    epochs=5,
    lr=2e-5,
    patience=2,
    best_name="best_distilbert.pth",
    class_names=class_names,
    arch="hf",
    optimizer_name="adamw"
)


In [None]:

# NOTE(review): this cell repeats the previous DistilBERT training cell
# statement for statement. Running both reloads the pretrained weights, trains
# a second time and overwrites best_distilbert.pth — presumably a copy-paste
# leftover; confirm before removing.
MODEL_NAME = "distilbert-base-uncased" 
tokenizer, train_loader, val_loader, test_loader = make_text_loaders(
    X_train, y_train_np, X_val, y_val_np, X_test, y_test_np,
    model_name=MODEL_NAME, max_len=128, batch_size=32
)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4)

class_names = ["dog", "bike", "ball", "water"]
model, logs = fit_text(
    model,
    train_loader,
    test_loader,
    num_classes=4,
    epochs=5,
    lr=2e-5,
    patience=2,
    best_name="best_distilbert.pth",
    class_names=class_names,
    arch="hf",
    optimizer_name="adamw"
)
