# Import

In [17]:
from __future__ import annotations

import hashlib
from pathlib import Path
import os, json
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from copy import deepcopy


from datasets import Dataset
from transformers import AutoTokenizer
import evaluate
import torch
from transformers import TrainingArguments, Trainer


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
)


# Global Settings

In [18]:
# -----------------------------
# Project paths
# -----------------------------
DATA_DIR = Path("data")
DATA_PATH_train = DATA_DIR / "train_fixed.csv"
DATA_PATH_val = DATA_DIR / "test_fixed.csv"
EXPERIMENTS_DIR = Path("experiments")
EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)


In [19]:
# -----------------------------
# Dataset column config
# -----------------------------

# TEXT_VARIANT_COL  = "text_english"  
# TEXT_VARIANT_COL  = "text_stripped_lowercase" 
TEXT_VARIANT_COL  = "text_stripped" 
LABEL_COL = "prdtypecode"
RANDOM_STATE = 42

In [20]:
# -----------------------------
# Configuration of tokenizer, model, training
# -----------------------------

CFG = {
    "experiment_id": datetime.now().strftime("%Y%m%d_%H%M%S"),  # unique run id
    "text_col": TEXT_VARIANT_COL,          # text column to use for this run
    "label_col": LABEL_COL,          # label column to use for this run
    
    "seed": RANDOM_STATE,                         # reproducibility
    "model_ckpt": "jhu-clsp/mmBERT-base",  # model
    
    # Tokenizer parameters
    "max_length": 256,                  # token length for padding/truncation later
    "padding": False,         
    "truncation": True,              # truncate sequences longer than max_length
    
    # imbalance handling
    "use_class_weights": True,         # whether to use class weights in the loss function
    "class_weight_method": "inv_freq",  # "inverse" or "sqrt_inv" for gentler weights
    "class_weight_eps": 1e-6,        # to avoid division by zero
    
    # model
    "eval_strategy": "epoch",   
    "save_strategy": "epoch",
    "save_total_limit": 2,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",
    "greater_is_better": True,
    "logging_steps": 100,
    "report_to": "none",
    "lr_scheduler_type": "linear",
    
    "fp16": False,
    "bf16": True,                    
    "label_smoothing_factor": 0.0,    # label smoothing factor    
    
    "warmup_ratio": 0.06,              # fraction of total steps used for warmup
    "batch_size": 32,
    "lr": 2e-5, # learning rate
    "epochs": 4,
    "weight_decay": 0.01,
    "greater_is_better": True,
    "gradient_accumulation_steps": 2,          
}


BASE_CFG = deepcopy(CFG)

# EXPERIMENTS = [
#     # -----------------------------
#     # Reference: mmBERT
#     # -----------------------------
#     {
#         "run_name": "mmBERT_lr2e-5_bs32",
#         "model_ckpt": "jhu-clsp/mmBERT-base",
#         "lr": 2e-5,
#         "batch_size": 32,
#     },

#     # -----------------------------
#     # XLM-RoBERTa
#     # -----------------------------
#     {
#         "run_name": "XLMR-base_lr2e-5_bs32",
#         "model_ckpt": "FacebookAI/xlm-roberta-base",
#         "lr": 2e-5,
#         "batch_size": 32,
#     },

#     # -----------------------------
#     # Multilingual DeBERTa v3
#     # -----------------------------
#     {
#         "run_name": "mDeBERTa-v3_lr2e-5_bs32",
#         "model_ckpt": "microsoft/mdeberta-v3-base",
#         "lr": 2e-5,
#         "batch_size": 32,
#     },

#     # -----------------------------
#     # Multilingual BERT (mBERT)
#     # -----------------------------
#     {
#         "run_name": "mBERT_lr2e-5_bs32",
#         "model_ckpt": "google-bert/bert-base-multilingual-cased",
#         "lr": 2e-5,
#         "batch_size": 32,
#     },

#     # -----------------------------
#     # Distilled multilingual BERT
#     # -----------------------------
#     {
#         "run_name": "Distil-mBERT_lr2e-5_bs32",
#         "model_ckpt": "distilbert/distilbert-base-multilingual-cased",
#         "lr": 2e-5,          
#         "batch_size": 32,    
#     },
# ]

LRS = [2e-5]
BATCH_SIZES = [16]
MAX_LENGTHS = [16,32, 64]

EXPERIMENTS = [
    {
        "run_name": f"mmBERT_lr{lr}_bs{bs}_len{ml}",
        "model_ckpt": "jhu-clsp/mmBERT-base",
        "lr": lr,
        "batch_size": bs,
        "max_length": ml,
    }
    for lr in LRS
    for bs in BATCH_SIZES
    for ml in MAX_LENGTHS
]


# Facebook AI for monolingual BERT models

# EXPERIMENTS = [
#     # -----------------------------
#     # Reference: mmBERT
#     {
#     "run_name": " mmBERT-base",
#     "model_ckpt": "jhu-clsp/mmBERT-base",
#     "lr": 2e-5,
#     "batch_size": 16,
#     "max_length": 384,
#     }
# ]

def make_cfg(base, overrides):
    cfg = deepcopy(base)
    cfg.update(overrides)
    cfg["experiment_id"] = datetime.now().strftime("%Y%m%d_%H%M%S")
    return cfg



# Loading, tokenizing, training

In [21]:
def run_experiment(CFG):

    # output directory per run
    OUT_DIR = f"experiments/{CFG['model_ckpt'].split('/')[-1]}_{CFG['experiment_id']}"
    os.makedirs(OUT_DIR, exist_ok=True)

    # Save config to json
    with open(os.path.join(OUT_DIR, "cfg.json"), "w") as f:
        json.dump(CFG, f, indent=2)

    print("OUT_DIR:", OUT_DIR)

    ##### read data frame ###
    ##############################################

    # splitting is already done
    # train df
    train_df = pd.read_csv(DATA_PATH_train)
    # val df
    val_df = pd.read_csv(DATA_PATH_val)

    #### LabelEncoder ###
    ##############################################

    # print shapes

    train_df = train_df[[CFG["text_col"], CFG["label_col"]]].astype(str)
    val_df = val_df[[CFG["text_col"], CFG["label_col"]]].astype(str)

    print("Train shape:", train_df.shape)
    print("Validation shape:", val_df.shape)

    # encoding labels
    le = LabelEncoder()
    train_df["label"] = le.fit_transform(train_df[CFG["label_col"]].astype(str))
    val_df["label"] = le.transform(val_df[CFG["label_col"]].astype(str))

    # print number of classes
    num_classes = len(le.classes_)
    print("Number of classes:", num_classes)

    # Save mappings for later interpretation / inference
    label_names = list(le.classes_)  # index = label id
    id2label = {i: label_names[i] for i in range(len(label_names))}
    label2id = {v: k for k, v in id2label.items()}


    # save
    with open(os.path.join(OUT_DIR, "label_map.json"), "w") as f:
        json.dump({"id2label": id2label, "label2id": label2id}, f, indent=2)
        
        
    ### Tokenization ###
    ##########################################


    from transformers import DataCollatorWithPadding

    train_hf = Dataset.from_pandas(
        train_df[[CFG["text_col"], "label"]].reset_index(drop=True)
    )
    val_hf = Dataset.from_pandas(
        val_df[[CFG["text_col"], "label"]].reset_index(drop=True)
    )

    print(train_hf)
    print(val_hf)


    # 2) Load a tokenizer that matches your chosen checkpoint
    #    Tokenizer turns text into token IDs the model understands.
    tokenizer = AutoTokenizer.from_pretrained(CFG["model_ckpt"], use_fast=False)

    # 3) Define how to tokenize a batch of examples
    def tokenize_batch(batch):
        out = tokenizer(
            batch[CFG["text_col"]],   # list of texts
            truncation=CFG["truncation"],          # cut off texts longer than max_length
            padding=CFG["padding"],     # pad shorter texts to max_length
            max_length=CFG["max_length"],
        )
        out.pop("token_type_ids", None)  # Deberta does not use token_type_ids
        return out

    # 4) Apply tokenization to the whole dataset
    #    remove_columns removes the raw text column after tokenization to avoid duplication
    train_tok = train_hf.map(tokenize_batch, batched=True, remove_columns=[CFG["text_col"]])
    val_tok   = val_hf.map(tokenize_batch, batched=True, remove_columns=[CFG["text_col"]])
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print(train_tok)
    print(train_tok[0].keys())  # inspect produced fields
    print("Example label:", train_tok[1]["label"])
    print("Example input_ids length:", len(train_tok[0]["input_ids"]))


    ### load label id mapping ###
    ###########################################

    with open(os.path.join(OUT_DIR, "label_map.json"), "r") as f:
        maps = json.load(f)

    id2label = {int(k): v for k, v in maps["id2label"].items()}
    label2id = {v: int(k) for v, k in maps["label2id"].items()}  # invert back to int ids

    num_labels = len(id2label)
    print("num_labels:", num_labels)
    print("example:", list(id2label.items())[:5])


    ### Class weights ###
    ##########################################


    def compute_class_weights(train_df, label_col, mode="inv_freq", eps=1e-6):
        counts = (
            train_df[label_col]
            .value_counts()
            .sort_index()
            .values.astype(np.float32)
        )

        if mode == "inv_freq":
            w = counts.sum() / (len(counts) * (counts + eps))
        elif mode == "sqrt_inv":
            w = np.sqrt(counts.sum() / (len(counts) * (counts + eps)))
        else:
            raise ValueError(f"Unknown class_weight_mode: {mode}")

        w = w / w.mean()  # normalize → mean weight = 1
        return torch.tensor(w, dtype=torch.float)

    class_weights = None
    if CFG["use_class_weights"]:
        class_weights = compute_class_weights(
            train_df,
            label_col=CFG["label_col"],
            mode=CFG["class_weight_method"],
            eps=CFG["class_weight_eps"],
        )
        print("Class weights:", class_weights)
        
        
    ### Model ###
    ###########################################

    from transformers import AutoModelForSequenceClassification

    model = AutoModelForSequenceClassification.from_pretrained(
        CFG["model_ckpt"],     # e.g. "distilbert-base-uncased"
        num_labels=num_labels,
        id2label=id2label,
        label2id={v: k for k, v in id2label.items()}  # label string -> id
    )

    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
            "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        }

    import torch.nn as nn

    class OptionalWeightedTrainer(Trainer):
        def __init__(self, class_weights=None, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.class_weights = class_weights

        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs.logits

            if self.class_weights is not None:
                loss_fct = nn.CrossEntropyLoss(
                    weight=self.class_weights.to(logits.device)
                )
            else:
                loss_fct = nn.CrossEntropyLoss()

            loss = loss_fct(logits, labels)
            return (loss, outputs) if return_outputs else loss
        




    training_args = TrainingArguments(
        output_dir=OUT_DIR,
        seed=CFG["seed"],
        learning_rate=CFG["lr"],
        per_device_train_batch_size=CFG["batch_size"],
        per_device_eval_batch_size=CFG["batch_size"],
        num_train_epochs=CFG["epochs"],
        weight_decay=CFG["weight_decay"],

        eval_strategy=CFG["eval_strategy"],   # <-- changed name (was evaluation_strategy)
        save_strategy=CFG["save_strategy"],

        load_best_model_at_end=CFG["load_best_model_at_end"],
        metric_for_best_model=CFG["metric_for_best_model"],
        greater_is_better=CFG["greater_is_better"],

        logging_steps=CFG["logging_steps"],
        report_to=CFG["report_to"],
        
        lr_scheduler_type=CFG["lr_scheduler_type"],
        fp16=CFG["fp16"],
        bf16=CFG["bf16"],
        
        label_smoothing_factor=CFG["label_smoothing_factor"],
        gradient_accumulation_steps=CFG["gradient_accumulation_steps"],
        save_total_limit=CFG["save_total_limit"],
        warmup_ratio=CFG["warmup_ratio"],
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        tokenizer=tokenizer,          # from your tokenization step
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    TrainerCls = OptionalWeightedTrainer if CFG["use_class_weights"] else Trainer

    trainer = TrainerCls(
        model=model,
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        class_weights=class_weights if CFG["use_class_weights"] else None,
    )

    print("Starting training using model:", CFG["model_ckpt"])
    # print which Trainer is used
    print("Using Trainer class:", trainer.__class__.__name__)


    print("CUDA available:", torch.cuda.is_available())
    train_out = trainer.train()
    eval_out = trainer.evaluate()
    print(eval_out)

    # save eval metrics
    with open(os.path.join(OUT_DIR, "eval_metrics.json"), "w") as f:
        json.dump({k: float(v) for k, v in eval_out.items()}, f, indent=2)

    # also save train metrics (loss curves etc.) if available
    train_metrics = train_out.metrics if hasattr(train_out, "metrics") else {}
    with open(os.path.join(OUT_DIR, "train_metrics.json"), "w") as f:
        json.dump({k: float(v) for k, v in train_metrics.items()} if train_metrics else {}, f, indent=2)

    print("Saved metrics to:", OUT_DIR)

    # This saves the "best" checkpoint if load_best_model_at_end=True
    BEST_DIR = os.path.join(OUT_DIR, "best_model")

    # also save train_tok, val_tok datasets
    train_tok.save_to_disk(os.path.join(BEST_DIR, "train_dataset"))
    val_tok.save_to_disk(os.path.join(BEST_DIR, "val_dataset"))

    trainer.save_model(BEST_DIR)          # model + config
    tokenizer.save_pretrained(BEST_DIR)   # tokenizer files

    print("Saved best model to:", BEST_DIR)
    
    return {
        "eval_f1_macro": eval_out.get("eval_f1_macro"),
        "eval_loss": eval_out.get("eval_loss"),
        "eval_accuracy": eval_out.get("eval_accuracy"),
        "best_model_checkpoint": getattr(trainer.state, "best_model_checkpoint", None),
        "output_dir": CFG.get("output_dir", None),
    }


In [22]:
results = []

for exp in EXPERIMENTS:
    CFG = make_cfg(BASE_CFG, exp)
    out = run_experiment(CFG)   # you define this from your existing cells
    results.append({"run_name": CFG["run_name"], **out})

results

OUT_DIR: experiments/mmBERT-base_20260122_131611
Train shape: (66800, 2)
Validation shape: (16701, 2)
Number of classes: 27
Dataset({
    features: ['text_stripped', 'label'],
    num_rows: 66800
})
Dataset({
    features: ['text_stripped', 'label'],
    num_rows: 16701
})


Map: 100%|██████████| 66800/66800 [00:06<00:00, 10013.50 examples/s]
Map: 100%|██████████| 16701/16701 [00:01<00:00, 9435.38 examples/s]


Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 66800
})
dict_keys(['label', 'input_ids', 'attention_mask'])
Example label: 6
Example input_ids length: 16
num_labels: 27
example: [(0, '10'), (1, '1140'), (2, '1160'), (3, '1180'), (4, '1280')]
Class weights: tensor([0.6013, 0.7012, 0.4737, 2.4514, 0.3862, 0.9178, 0.3740, 2.6988, 0.7883,
        0.5810, 0.3739, 0.4583, 2.3330, 0.3853, 2.2763, 0.3933, 0.3931, 1.3302,
        0.3783, 0.7467, 0.1908, 0.7603, 0.6787, 2.1459, 0.7538, 1.1382, 2.2902])


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  super().__init__(*args, **kwargs)


Starting training using model: jhu-clsp/mmBERT-base
Using Trainer class: OptionalWeightedTrainer
CUDA available: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.4768,0.769737,0.775103,0.757182
2,0.9631,0.691882,0.802168,0.788206
3,0.3562,0.846551,0.812167,0.799938
4,0.1445,1.025134,0.816897,0.807269


{'eval_loss': 1.0251343250274658, 'eval_accuracy': 0.816897191784923, 'eval_f1_macro': 0.8072690333763385, 'eval_runtime': 20.0602, 'eval_samples_per_second': 832.542, 'eval_steps_per_second': 52.043, 'epoch': 4.0}
Saved metrics to: experiments/mmBERT-base_20260122_131611


Saving the dataset (1/1 shards): 100%|██████████| 66800/66800 [00:00<00:00, 577288.58 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 16701/16701 [00:00<00:00, 380686.99 examples/s]


Saved best model to: experiments/mmBERT-base_20260122_131611/best_model
OUT_DIR: experiments/mmBERT-base_20260122_133831
Train shape: (66800, 2)
Validation shape: (16701, 2)
Number of classes: 27
Dataset({
    features: ['text_stripped', 'label'],
    num_rows: 66800
})
Dataset({
    features: ['text_stripped', 'label'],
    num_rows: 16701
})


Map: 100%|██████████| 66800/66800 [00:05<00:00, 12805.64 examples/s]
Map: 100%|██████████| 16701/16701 [00:01<00:00, 13529.28 examples/s]


Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 66800
})
dict_keys(['label', 'input_ids', 'attention_mask'])
Example label: 6
Example input_ids length: 20
num_labels: 27
example: [(0, '10'), (1, '1140'), (2, '1160'), (3, '1180'), (4, '1280')]
Class weights: tensor([0.6013, 0.7012, 0.4737, 2.4514, 0.3862, 0.9178, 0.3740, 2.6988, 0.7883,
        0.5810, 0.3739, 0.4583, 2.3330, 0.3853, 2.2763, 0.3933, 0.3931, 1.3302,
        0.3783, 0.7467, 0.1908, 0.7603, 0.6787, 2.1459, 0.7538, 1.1382, 2.2902])


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  super().__init__(*args, **kwargs)


Starting training using model: jhu-clsp/mmBERT-base
Using Trainer class: OptionalWeightedTrainer
CUDA available: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.1753,0.560622,0.829411,0.816945
2,0.671,0.505088,0.856775,0.844843
3,0.1967,0.676197,0.868032,0.857457
4,0.053,0.796513,0.872103,0.863108


{'eval_loss': 0.7965131998062134, 'eval_accuracy': 0.872103466858272, 'eval_f1_macro': 0.8631084257571701, 'eval_runtime': 19.4692, 'eval_samples_per_second': 857.815, 'eval_steps_per_second': 53.623, 'epoch': 4.0}
Saved metrics to: experiments/mmBERT-base_20260122_133831


Saving the dataset (1/1 shards): 100%|██████████| 66800/66800 [00:00<00:00, 439548.98 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 16701/16701 [00:00<00:00, 402137.13 examples/s]


Saved best model to: experiments/mmBERT-base_20260122_133831/best_model
OUT_DIR: experiments/mmBERT-base_20260122_135823
Train shape: (66800, 2)
Validation shape: (16701, 2)
Number of classes: 27
Dataset({
    features: ['text_stripped', 'label'],
    num_rows: 66800
})
Dataset({
    features: ['text_stripped', 'label'],
    num_rows: 16701
})


Map: 100%|██████████| 66800/66800 [00:05<00:00, 13132.97 examples/s]
Map: 100%|██████████| 16701/16701 [00:01<00:00, 13174.45 examples/s]


Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 66800
})
dict_keys(['label', 'input_ids', 'attention_mask'])
Example label: 6
Example input_ids length: 20
num_labels: 27
example: [(0, '10'), (1, '1140'), (2, '1160'), (3, '1180'), (4, '1280')]
Class weights: tensor([0.6013, 0.7012, 0.4737, 2.4514, 0.3862, 0.9178, 0.3740, 2.6988, 0.7883,
        0.5810, 0.3739, 0.4583, 2.3330, 0.3853, 2.2763, 0.3933, 0.3931, 1.3302,
        0.3783, 0.7467, 0.1908, 0.7603, 0.6787, 2.1459, 0.7538, 1.1382, 2.2902])


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  super().__init__(*args, **kwargs)


Starting training using model: jhu-clsp/mmBERT-base
Using Trainer class: OptionalWeightedTrainer
CUDA available: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.0574,0.490745,0.851745,0.840072
2,0.6213,0.445255,0.877073,0.866489
3,0.1784,0.636092,0.881624,0.871982
4,0.0223,0.739875,0.887552,0.87724


{'eval_loss': 0.7398750185966492, 'eval_accuracy': 0.8875516436141548, 'eval_f1_macro': 0.8772400411732588, 'eval_runtime': 19.9769, 'eval_samples_per_second': 836.016, 'eval_steps_per_second': 52.26, 'epoch': 4.0}
Saved metrics to: experiments/mmBERT-base_20260122_135823


Saving the dataset (1/1 shards): 100%|██████████| 66800/66800 [00:00<00:00, 307706.97 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 16701/16701 [00:00<00:00, 530928.18 examples/s]


Saved best model to: experiments/mmBERT-base_20260122_135823/best_model


[{'run_name': 'mmBERT_lr2e-05_bs16_len16',
  'eval_f1_macro': 0.8072690333763385,
  'eval_loss': 1.0251343250274658,
  'eval_accuracy': 0.816897191784923,
  'best_model_checkpoint': 'experiments/mmBERT-base_20260122_131611/checkpoint-8352',
  'output_dir': None},
 {'run_name': 'mmBERT_lr2e-05_bs16_len32',
  'eval_f1_macro': 0.8631084257571701,
  'eval_loss': 0.7965131998062134,
  'eval_accuracy': 0.872103466858272,
  'best_model_checkpoint': 'experiments/mmBERT-base_20260122_133831/checkpoint-8352',
  'output_dir': None},
 {'run_name': 'mmBERT_lr2e-05_bs16_len64',
  'eval_f1_macro': 0.8772400411732588,
  'eval_loss': 0.7398750185966492,
  'eval_accuracy': 0.8875516436141548,
  'best_model_checkpoint': 'experiments/mmBERT-base_20260122_135823/checkpoint-8352',
  'output_dir': None}]