2_2_encoder.ipynb
Fine-tuning CamemBERTav2 (`almanach/camembertav2-base`) on multi-card user data.

**Pipeline:**
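1. Train a card-level classifier on user 'cards' (aggregated tweets)
2. Use a stratified k-fold split at the user level, so all cards of a user stay in the same fold (avoids leakage)
3. Aggregate card-level predictions (mean probability per user) to get the final user score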



In [4]:
#!/usr/bin/env python3

import os
import gc
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# configuration & reproducibility
# seed the python, numpy and torch RNGs once at import time
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# directory paths
ROOT_DIR = "."
# ROOT_DIR = "/content/drive/MyDrive/Colab Notebooks/code"  # for google colab
OUT_DIR = os.path.join(ROOT_DIR, "intermediate")

# create output dir if needed
# (OUT_DIR holds both the input csvs below and the result csvs)
os.makedirs(OUT_DIR, exist_ok=True)

# inputs
TRAIN_CSV = os.path.join(OUT_DIR, "user_cards_train.csv")
TEST_CSV = os.path.join(OUT_DIR, "user_cards_test.csv")

# outputs
# oof = out-of-fold predictions (for training set evaluation)
OUT_OOF = os.path.join(OUT_DIR, "oof_camembert.csv")
OUT_TEST = os.path.join(OUT_DIR, "test_camembert.csv")

# column mapping
TEXT_COL = "prompt"       # input text
LABEL_COL = "user_label"  # target (0 or 1)
ID_COL = "user_key"       # user id for grouping

# model config
# camembertav2 is better for french text
MODEL_NAME = "almanach/camembertav2-base"

# truncation length in tokens applied to every card
# NOTE(review): the checkpoint loads as a DeBERTa-v2 model, not BERT; its real
# position limit is not visible here - confirm 512 against the model config
MAX_LEN = 512

# standard 5-fold cv (80% train / 20% val)
N_FOLDS = 5

# number of full passes over data
EPOCHS = 2

# optimized hyperparameters
# learning rate: tuned low to avoid destabilizing pre-trained weights
BEST_LR = 2.32e-05

# batch sizes
BEST_BS_T = 48  # fit as much as vram allows
BEST_BS_E = 64  # larger for eval since no gradients stored

# weight decay for regularization (prevents overfitting)
BEST_WD = 0.019

# label smoothing: prevents overconfidence
# with 2 labels the trainer targets become factor/2 = 0.0025 and
# 1 - factor + factor/2 = 0.9975 instead of 0/1
LABEL_SMOOTHING = 0.005

# linear decay is standard for transformer fine-tuning
SCHEDULER_TYPE = "linear"


# dataset class

class CardDataset(Dataset):
    """
    pytorch dataset wrapper around a list of card texts

    texts are tokenized lazily in __getitem__ and returned unpadded, so
    padding is left to whatever collator builds the batch

    args:
        texts: iterable of raw strings, one per card
        labels: optional sequence of integer class ids aligned with texts
            by position (None for unlabeled / inference data)
        tokenizer: callable taking (text, truncation=, max_length=,
            return_tensors=) and returning a mapping of (1, seq) tensors
        max_len: truncation length in tokens

    raises:
        ValueError: if labels is given but its length differs from texts
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=512):
        self.texts = list(texts)

        # store labels as a plain array so lookups are always positional:
        # a pandas Series with a filtered (non 0..n-1) index would otherwise
        # be indexed by label and raise KeyError or return the wrong row
        self.labels = None if labels is None else np.asarray(labels)
        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} texts and {len(self.labels)} labels)"
            )

        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """number of cards in the dataset"""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        fetches i-th sample
        tokenization done on-the-fly to save ram

        returns a dict of 1-d tensors (one entry per tokenizer output key),
        plus a scalar long "labels" tensor when labels were provided
        """
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,        # hard cut if text > max_len tokens
            max_length=self.max_len,
            return_tensors="pt"
        )

        # squeeze removes batch dim (1, seq) -> (seq)
        # dataloader expects single sample, batch dim added later
        enc = {k: v.squeeze(0) for k, v in enc.items()}

        # add label if available (training mode)
        if self.labels is not None:
            enc["labels"] = torch.tensor(int(self.labels[idx]), dtype=torch.long)

        return enc


# utility functions

def compute_metrics(eval_pred):
    """
    accuracy metric handed to the trainer

    eval_pred unpacks into (logits, labels); the predicted class of each
    row is the column holding the largest logit
    """
    scores, gold = eval_pred
    # highest-scoring column -> predicted class index
    predicted = np.argmax(scores, axis=1)
    acc = accuracy_score(gold, predicted)
    return {"accuracy": acc}

def softmax(logits):
    """
    row-wise softmax: turns a 2-d array of logits into probabilities

    each row is shifted by its own maximum before exponentiation, which
    leaves the result unchanged but keeps np.exp from overflowing
    """
    row_max = logits.max(axis=1, keepdims=True)
    weights = np.exp(logits - row_max)
    totals = weights.sum(axis=1, keepdims=True)
    return weights / totals

def predict_texts(model, tokenizer, texts, batch_size=32, max_length=512):
    """
    batched inference loop
    runs in eval mode without gradients to save memory

    args:
        model: torch module whose forward accepts the tokenizer output as
            keyword arguments and returns an object with a .logits tensor
        tokenizer: callable taking (batch, padding=, truncation=,
            max_length=, return_tensors=) and returning a mapping of tensors
        texts: iterable of raw strings
        batch_size: number of texts per forward pass
        max_length: truncation length in tokens

    returns:
        float32 numpy array of logits with one row per text, in input order;
        for empty input a (0, num_labels) array, where num_labels is read
        from model.config (0 columns if the model exposes no such config)

    note: the model is left in eval mode on return
    """
    texts = list(texts)  # accept any iterable; slicing below needs a sequence
    device = next(model.parameters()).device
    model.eval()  # disable dropout

    if not texts:
        # np.vstack([]) raises ValueError, so hand back a well-shaped empty
        # result that downstream code can still slice by column
        n_labels = getattr(getattr(model, "config", None), "num_labels", 0)
        return np.empty((0, n_labels), dtype=np.float32)

    all_logits = []
    with torch.no_grad():  # disable gradient graph construction
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]

            # prep inputs
            enc = tokenizer(
                batch,
                padding=True,       # dynamic padding to longest in batch
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )
            # move to device
            enc = {k: v.to(device) for k, v in enc.items()}

            # forward pass
            outputs = model(**enc)
            # cast to float32 first: reduced-precision dtypes such as
            # bfloat16 cannot be converted to numpy directly
            all_logits.append(outputs.logits.float().cpu().numpy())

    return np.vstack(all_logits)



# main training pipeline

def main():
    """
    run the full fine-tuning pipeline end to end

    steps:
        1. load the train/test card csvs and drop unusable training rows
        2. build one (user, label) row per user for stratification
        3. for each of N_FOLDS user-level folds: fine-tune a fresh model,
           store out-of-fold card probabilities, predict the whole test set
        4. average card probabilities per user (and across folds for test)
        5. write user-level oof and test probabilities to OUT_OOF / OUT_TEST

    returns None; returns early after printing an error if an input csv
    is missing
    """
    print("Fine-tuning CamemBERTav2")
    print(f"Hyperparameters -> LR: {BEST_LR}, Batch: {BEST_BS_T}, WD: {BEST_WD}")
    print(f"Output Dir: {OUT_DIR}")

    # 1. load and preprocess data
    print("\n1. Loading and cleaning data...")
    try:
        train_cards = pd.read_csv(TRAIN_CSV)
        test_cards = pd.read_csv(TEST_CSV)
    except FileNotFoundError:
        print(f"[error] Files not found. Check {OUT_DIR}")
        return

    # basic cleanup: handle nans
    train_cards[TEXT_COL] = train_cards[TEXT_COL].fillna("").astype(str)

    # remove garbage rows (empty text or no label)
    # reset_index makes row labels equal to row positions, which the oof
    # bookkeeping in the fold loop relies on
    mask_valid = (train_cards[TEXT_COL].str.len() > 0) & train_cards[LABEL_COL].notna()
    train_cards = train_cards.loc[mask_valid].reset_index(drop=True)
    train_cards[LABEL_COL] = train_cards[LABEL_COL].astype(int)

    # test rows are never dropped: missing text only becomes an empty string
    test_cards[TEXT_COL] = test_cards[TEXT_COL].fillna("").astype(str)

    print(f"  > Train Set: {len(train_cards)} cards")
    print(f"  > Test Set:  {len(test_cards)} cards")

    # 2. stratification
    print("\n2. Preparing User-Level Cross-Validation...")

    # must split by user, not card
    # otherwise specific user syntax leaks into validation
    # one row per user: the label of the user's first card is used
    # (assumes the label is constant within a user)
    user_labels = train_cards.drop_duplicates(ID_COL)[[ID_COL, LABEL_COL]].reset_index(drop=True)
    user_ids = user_labels[ID_COL].values
    labels_for_split = user_labels[LABEL_COL].values

    # setup tokenizer
    # tokenizer and collator are shared by all folds; only the model is rebuilt
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    collator = DataCollatorWithPadding(tokenizer=tokenizer)
    fp16_available = torch.cuda.is_available() # use mixed precision if gpu

    # 3. k-fold cross-validation loop
    print(f"\n3. Starting {N_FOLDS}-Fold Cross-Validation...")

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

    # storage for out-of-fold predictions
    # one slot per training card, filled by the fold that holds it out
    oof_proba_cards = np.zeros(len(train_cards), dtype=np.float32)

    # list to accumulate test predictions from each fold model
    test_proba_folds = []

    # cache test texts
    test_texts = test_cards[TEXT_COL].tolist()

    # the split runs over users (one entry per user); the first argument is
    # only a placeholder of the right length, stratification uses the labels
    for fold, (tr_user_idx, val_user_idx) in enumerate(
        skf.split(np.zeros_like(labels_for_split), labels_for_split), 1
    ):
        print(f"\nFold {fold}/{N_FOLDS}")

        # map fold indices back to user ids
        tr_users = set(user_ids[tr_user_idx])
        val_users = set(user_ids[val_user_idx])

        # split cards based on user mapping
        tr_mask = train_cards[ID_COL].isin(tr_users)
        val_mask = train_cards[ID_COL].isin(val_users)

        tr_cards = train_cards.loc[tr_mask].reset_index(drop=True)
        val_cards = train_cards.loc[val_mask].reset_index(drop=True)

        # track original indices to fill oof array correctly later
        # (same row order as val_cards, taken before its index was reset)
        val_original_indices = train_cards.loc[val_mask].index.values

        print(f"  Data: Train {len(tr_cards)} cards | Val {len(val_cards)} cards")

        # dataset objects
        ds_tr = CardDataset(tr_cards[TEXT_COL].tolist(), tr_cards[LABEL_COL].values, tokenizer, MAX_LEN)
        ds_val = CardDataset(val_cards[TEXT_COL].tolist(), val_cards[LABEL_COL].values, tokenizer, MAX_LEN)

        # fresh model init for this fold
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

        # define training args
        # note: no checkpoints saved to preserve disk space for final version
        training_args = TrainingArguments(
            output_dir= OUT_DIR,
            # no checkpoint strategy
            save_strategy="no",           # dont save intermediate models
            load_best_model_at_end=False, # take model at end of last epoch
            learning_rate=BEST_LR,
            per_device_train_batch_size=BEST_BS_T,
            per_device_eval_batch_size=BEST_BS_E,
            num_train_epochs=EPOCHS,
            weight_decay=BEST_WD,
            warmup_ratio=0.10,                  # warm up lr for first 10%
            lr_scheduler_type=SCHEDULER_TYPE,
            max_grad_norm=1.0,                  # clip gradients to prevent explosions
            label_smoothing_factor=LABEL_SMOOTHING,
            logging_steps=100,
            eval_strategy="epoch",              # check metrics every epoch
            fp16=fp16_available,                # speed up on gpu
            report_to=[],                       # no wandb/tensorboard logging
            seed=SEED,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=ds_tr,
            eval_dataset=ds_val,
            data_collator=collator,
            compute_metrics=compute_metrics,
        )

        # train
        trainer.train()

        # manual move to device for inference loop
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        # validation prediction (oof)
        # predict on validation chunk
        # NOTE(review): batch_size is hard-coded to 32 here and for the test
        # set below, while BEST_BS_E is only used by the trainer - confirm
        # whether these should share one setting
        val_logits = predict_texts(
            model, tokenizer,
            val_cards[TEXT_COL].tolist(),
            batch_size=32,
            max_length=MAX_LEN
        )
        # extract proba for class 1
        val_proba = softmax(val_logits)[:, 1]

        # store in global array
        oof_proba_cards[val_original_indices] = val_proba

        # simple accuracy check
        # card-level accuracy at a fixed 0.5 threshold
        card_acc = accuracy_score(val_cards[LABEL_COL].values, (val_proba >= 0.5).astype(int))
        print(f"  > Fold Finished. Validation Card Accuracy: {card_acc:.4f}")

        # test prediction
        # predict on full test set with this fold's model
        test_logits = predict_texts(
            model, tokenizer,
            test_texts,
            batch_size=32,
            max_length=MAX_LEN
        )
        test_proba_folds.append(softmax(test_logits)[:, 1])

        # cleanup to prevent oom on colab
        del trainer, model, ds_tr, ds_val
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # 4. aggregation (card -> user)
    print("\n4. Aggregating results (Card Level -> User Level)...")

    # a. test set: ensemble averaging across folds
    # stack gives (n_folds, n_test_cards); the mean over axis 0 is per card
    test_proba_mean = np.mean(np.stack(test_proba_folds), axis=0)

    # b. oof (train set): group cards by user
    train_cards_oof = train_cards.copy()
    train_cards_oof["oof_proba"] = oof_proba_cards

    # mean of probabilities per user
    oof_user = train_cards_oof.groupby(ID_COL).agg({
        "oof_proba": "mean",
        LABEL_COL: "first" # label is constant for user
    }).reset_index()

    # calc final user-level accuracy
    oof_preds = (oof_user["oof_proba"] >= 0.5).astype(int)
    oof_acc = accuracy_score(oof_user[LABEL_COL], oof_preds)
    print(f"\n> Final OOF accuracy (user-level): {oof_acc:.4f} <<<")

    # c. format test submission
    # same per-user mean as for oof, applied to the fold-averaged test probas
    test_cards_proba = test_cards.copy()
    test_cards_proba["proba"] = test_proba_mean
    test_user = test_cards_proba.groupby(ID_COL)["proba"].mean().reset_index()
    test_user.columns = [ID_COL, "camembert_proba"]

    # 5. saving
    # both files hold one row per user
    print("\n5. Saving CSV results...")
    oof_user.to_csv(OUT_OOF, index=False)
    test_user.to_csv(OUT_TEST, index=False)
    print(f"  [saved] Training OOF predictions: {OUT_OOF}")
    print(f"  [saved] Test predictions: {OUT_TEST}")

    print("\nScript completed.")

# run the pipeline only when this file is executed directly, not on import
if __name__ == "__main__":
    main()

Fine-tuning CamemBERT
Hyperparameters -> LR: 2.32e-05, Batch: 48, WD: 0.019
Output Dir: /content/drive/MyDrive/Colab Notebooks/code/intermediate

1. Loading and cleaning data...
  > Train Set: 61337 cards
  > Test Set:  40894 cards

2. Preparing User-Level Cross-Validation...

3. Starting 5-Fold Cross-Validation...

--- Fold 1/5 ---
  Data: Train 49064 cards | Val 12273 cards


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at almanach/camembertav2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3206,0.349619,0.856596
2,0.2652,0.343284,0.868247


  > Fold Finished. Validation Card Accuracy: 0.8682

--- Fold 2/5 ---
  Data: Train 49073 cards | Val 12264 cards


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at almanach/camembertav2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3115,0.333523,0.864726
2,0.2719,0.337372,0.871575


  > Fold Finished. Validation Card Accuracy: 0.8716

--- Fold 3/5 ---
  Data: Train 49071 cards | Val 12266 cards


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at almanach/camembertav2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3216,0.356376,0.861406
2,0.2716,0.346454,0.868661


  > Fold Finished. Validation Card Accuracy: 0.8686

--- Fold 4/5 ---
  Data: Train 49070 cards | Val 12267 cards


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at almanach/camembertav2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.314,0.346675,0.863536
2,0.2755,0.312141,0.876416


  > Fold Finished. Validation Card Accuracy: 0.8764

--- Fold 5/5 ---
  Data: Train 49070 cards | Val 12267 cards


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at almanach/camembertav2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3219,0.317816,0.872177
2,0.2692,0.328393,0.878291


  > Fold Finished. Validation Card Accuracy: 0.8782

4. Aggregating results (Card Level -> User Level)...

>>> FINAL OOF ACCURACY (User-Level): 0.8749 <<<

5. Saving CSV results...
  [Saved] Training OOF predictions: /content/drive/MyDrive/Colab Notebooks/code/intermediate/oof_camembert.csv
  [Saved] Test predictions: /content/drive/MyDrive/Colab Notebooks/code/intermediate/test_camembert.csv

Script completed successfully.


```text
Fine-tuning CamemBERTa
Hyperparameters -> LR: 2.32e-05, Batch: 48, WD: 0.019
Output Dir: /content/drive/MyDrive/Colab Notebooks/code/intermediate

1. Loading and cleaning data...
  > Train Set: 61337 cards
  > Test Set:  40894 cards

2. Preparing User-Level Cross-Validation...

3. Starting 5-Fold Cross-Validation...

--- Fold 1/5 ---
  Data: Train 49064 cards | Val 12273 cards
  > Fold Finished. Validation Card Accuracy: 0.8682

--- Fold 2/5 ---
  Data: Train 49073 cards | Val 12264 cards
  > Fold Finished. Validation Card Accuracy: 0.8716

--- Fold 3/5 ---
  Data: Train 49071 cards | Val 12266 cards
  > Fold Finished. Validation Card Accuracy: 0.8686

--- Fold 4/5 ---
  Data: Train 49070 cards | Val 12267 cards
  > Fold Finished. Validation Card Accuracy: 0.8764

--- Fold 5/5 ---
  Data: Train 49070 cards | Val 12267 cards
  > Fold Finished. Validation Card Accuracy: 0.8782

4. Aggregating results (Card Level -> User Level)...

>>> FINAL OOF ACCURACY (User-Level): 0.8749 <<<

5. Saving CSV results...
  [Saved] Training OOF predictions: /content/drive/MyDrive/Colab Notebooks/code/intermediate/oof_camembert.csv
  [Saved] Test predictions: /content/drive/MyDrive/Colab Notebooks/code/intermediate/test_camembert.csv

Script completed successfully.
```
