# 📦 Importing Packages

In [None]:
import torch

import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import numpy as np
import pandas as pd
from collections import defaultdict
import scipy.stats as stats

from datetime import datetime
from tqdm.auto import tqdm
import shutil
import emoji
import re
import os

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    BertForSequenceClassification,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)

# ⚙️ Global Settings

In [None]:
# When a /kaggle path exists, start every run from an empty working directory.
# NOTE: this removes everything currently stored under /kaggle/working.
if os.path.exists("/kaggle"):
    # Clean up the entire /kaggle/working directory
    # (ignore_errors=True: a missing directory or a failed removal is not fatal).
    shutil.rmtree("/kaggle/working", ignore_errors=True)
    # Recreate the directory that was just removed.
    os.makedirs("/kaggle/working", exist_ok=True)

## Hyper-Parameters

In [None]:
# Dataset variant; interpolated into the dataset file name and the output paths.
DATASET_TYPE = "toxicity"

# Checkpoint used for both the tokenizer and the model weights.
MODEL_CHECKPOINT = "dbmdz/bert-base-italian-cased"
# When True, per-token user ids are kept and the user-type-aware model is used.
WITH_USER_TYPES = True

# Class names, indexed by the class ids returned by multiclass_clf / binary_clf.
MULTICLASS_TARGET_NAMES = ['Toxic', 'Neutral', 'Healthy']
BINARY_TARGET_NAMES = ['Toxic', 'Healthy']

# Number of dataloader worker processes.
NUM_WORKERS = 2

SAVE_TOTAL_LIMIT = 1
# Number of outer cross-validation folds.
N_FOLDS = 5
# Fraction of each fold's training groups held out as the validation split.
TEST_SIZE = 0.2
BATCH_SIZE = 32
NUM_EPOCHS = 20
GRADIENT_ACCUMULATION_STEPS = 4
EARLY_STOPPING_PATIENCE = 4
# EARLY_STOPPING_THRESHOLD = 0.0005

# Keyword arguments forwarded to the "reduce_lr_on_plateau" scheduler.
LR_SCHEDULER_KWARGS = {
    "factor": 0.5,        # Halve the learning rate when the metric stops improving
    "patience": 2,
    # "threshold": EARLY_STOPPING_THRESHOLD,
    # The scheduler is stepped with the metric selected by
    # `metric_for_best_model` ("cost", where lower is better), so it must run
    # in "min" mode. With "max" the learning rate would be cut while the cost
    # is still decreasing.
    "mode": "min"
}

WARMUP_PERCENTAGE = 0.1
WEIGHT_DECAY = 0.01 # in the 0 to 0.1 range
BODY_LR = 3e-5

# COST_MAT[i, j] is the cost of predicting class j when the true class is i
# (class order as in MULTICLASS_TARGET_NAMES).
# NOTE(review): the matrix is asymmetric for Neutral/Healthy (1 vs 4) - confirm
# this is intentional.
COST_MAT = np.array([
    [0, 8, 16],
    [8, 0, 1],
    [16, 4, 0]
])

In [None]:
import random

GLOBAL_SEED = 42


def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available, all CUDA generators are seeded as well and cuDNN
    is switched to deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed applied to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN kernels without auto-tuning: reproducible results on
    # CUDA, possibly at some cost in speed.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(GLOBAL_SEED)

## Paths Settings

In [None]:
# Checkpoint name made safe for use as a path component.
MODEL_NAME = "-".join(MODEL_CHECKPOINT.split('/'))
# Run identifier, so that every execution writes to its own output directory.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

_dataset_file = f"cipv-chats-multiclass-{DATASET_TYPE}.parquet"
_run_name = f"{timestamp}-{MODEL_NAME}"

if not os.path.exists("/kaggle"):
    # ==== LOCAL SETTINGS ====
    PATH = os.path.join(".", "out", "datasets", _dataset_file)
    OUT_DIR = os.path.join(
        ".", 'out', 'models', DATASET_TYPE, 'messages-regression-explanation',
        f'{_run_name}-user_types_{WITH_USER_TYPES}'
    )
else:
    # ==== KAGGLE SETTINGS ====
    PATH = os.path.join(os.sep, "kaggle", "input", f"cipv-chats-{DATASET_TYPE}", _dataset_file)
    OUT_DIR = os.path.join(os.sep, "kaggle", "working", _run_name)

# Reports and figures go to a "results" sub-directory of the run directory.
RESULTS_PATH = os.path.join(OUT_DIR, "results")
os.makedirs(RESULTS_PATH, exist_ok=True)

# 🛠️ Utility Functions

In [None]:
# Use the same color palette as confusion matrix
# (same cubehelix specification string as the colormap passed to the heatmap in
# plot_confusion_matrices), sampled here as 5 discrete colors.
colors = sns.color_palette("ch:s=-.2,r=.6", n_colors=5)

In [None]:
def _plot_mean_std_band(epochs, values_by_epoch, fmt, color, label):
    """Draw the per-epoch mean as a line with a shaded mean ± std band; return the means."""
    means = np.array([np.mean(values_by_epoch[e]) for e in epochs])
    stds = np.array([np.std(values_by_epoch[e]) for e in epochs])
    plt.plot(epochs, means, fmt, label=label)
    plt.fill_between(epochs, means - stds, means + stds, color=color, alpha=0.2)
    return means


def _finalize_figure(ylabel, title, path):
    """Label the current figure, save it to ``path``, show it and close it."""
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(path)
    plt.show()
    plt.close()


def plot_aggregated_curves(log_histories, out_dir):
    """Plot cross-fold mean ± std curves for the loss and the evaluation cost.

    Two figures are written to ``out_dir``: ``aggregated_learning_curve.png``
    (train and eval loss) and ``aggregated_cost_curve.png`` (eval cost, with a
    vertical marker at the epoch with the lowest mean cost). The cost figure is
    skipped when no ``eval_cost`` entry was logged.

    Args:
        log_histories: Iterable of log histories (one per fold). Each history
            is an iterable of dicts; entries without an ``'epoch'`` value are
            ignored, and the ``'loss'``, ``'eval_loss'`` and ``'eval_cost'``
            values are collected when present.
        out_dir: Output directory; created if it does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)

    train_losses_by_epoch = defaultdict(list)
    eval_losses_by_epoch = defaultdict(list)
    eval_costs_by_epoch = defaultdict(list)
    collectors = (
        ('loss', train_losses_by_epoch),
        ('eval_loss', eval_losses_by_epoch),
        ('eval_cost', eval_costs_by_epoch),
    )

    for history in log_histories:
        for log in history:
            epoch = log.get('epoch')
            if epoch is None:
                continue

            # Epochs are logged as floats (e.g. 1.0, 2.0); bucket them by integer epoch.
            epoch = int(round(epoch))
            for key, bucket in collectors:
                if key in log:
                    bucket[epoch].append(log[key])

    # --- Plotting Aggregated Loss Curve ---
    eval_epochs = sorted(eval_losses_by_epoch)
    # Only plot the train loss for epochs that also have an eval loss, so both
    # curves share the same x range.
    train_epochs = [e for e in eval_epochs if e in train_losses_by_epoch]

    plt.figure(figsize=(10, 6))
    _plot_mean_std_band(eval_epochs, eval_losses_by_epoch, 'c-o', 'c', 'Mean Eval Loss')
    _plot_mean_std_band(train_epochs, train_losses_by_epoch, 'g-o', 'g', 'Mean Train Loss')
    _finalize_figure(
        'Loss',
        'Aggregated Learning Curve (Mean ± Std)',
        os.path.join(out_dir, "aggregated_learning_curve.png"),
    )

    # --- Plotting Aggregated Cost Curve ---
    cost_epochs = sorted(eval_costs_by_epoch)
    if not cost_epochs:
        # Nothing to plot, and np.argmin would raise on an empty sequence.
        return

    plt.figure(figsize=(10, 6))
    mean_eval_cost = _plot_mean_std_band(cost_epochs, eval_costs_by_epoch, 'r-o', 'r', 'Mean Eval Cost')

    min_cost_epoch = cost_epochs[int(np.argmin(mean_eval_cost))]
    plt.axvline(
        x=min_cost_epoch, color='green', linestyle='--', alpha=0.7,
        label=f'Min Mean Cost at Epoch {min_cost_epoch}'
    )
    _finalize_figure(
        'Cost',
        'Aggregated Evaluation Cost (Mean ± Std)',
        os.path.join(out_dir, "aggregated_cost_curve.png"),
    )

# 🪄 Dataset Preprocessing

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# Id of the tokenizer's CLS token; preprocess() places it at the start of every sample.
cls_token_id = tokenizer.cls_token_id

def get_formatted_msg(msg):
    """Render a message as its user name, a colon and a newline, then its content.

    Emojis in the resulting text are replaced by their Italian text aliases.

    Args:
        msg: Mapping with the ``'user'`` and ``'content'`` keys.

    Returns:
        The formatted, demojized string.
    """
    speaker = msg['user']
    body = msg['content']
    raw_text = f"{speaker}:\n{body}"
    return emoji.demojize(raw_text, language='it')

def preprocess(messages):
    """Build one tokenized sample per message of a chat.

    Every sample contains the whole chat: a CLS token followed by all
    messages in order. The sample's target message is wrapped in ``[SEP]``
    markers and gets ``token_type_ids`` of 1 (as does the CLS token); all other
    messages get 0.

    User ids are assigned in order of first appearance in the chat, so they are
    stable across runs (iterating a ``set`` of names is not). Each message is
    tokenized once per role (context / target) and the cached token lists are
    reused for every sample.

    NOTE(review): no truncation is applied here, so a sample is as long as the
    whole chat - confirm this is handled downstream.

    Args:
        messages: Sequence of mappings with the ``'user'``, ``'content'`` and
            ``'value'`` keys.

    Returns:
        A list with one dict per message, holding ``input_ids``,
        ``attention_mask``, ``token_type_ids``, ``user_type_ids``, ``value``
        (the target message's value as float) and ``user_id`` (the target
        message's user id). ``user_type_ids`` only covers the CLS position when
        ``WITH_USER_TYPES`` is False.
    """
    # Deterministic user -> id mapping with O(1) lookups.
    user_ids = {}
    for msg in messages:
        user_ids.setdefault(msg['user'], len(user_ids))

    # Tokenize every message once in each of its two possible roles.
    context_tokens = []
    target_tokens = []
    for msg in messages:
        text = get_formatted_msg(msg)
        as_context = tokenizer(text + "\n", add_special_tokens=False)
        as_target = tokenizer("[SEP]" + text + "[SEP]\n", add_special_tokens=False)
        context_tokens.append((as_context['input_ids'], as_context['attention_mask']))
        target_tokens.append((as_target['input_ids'], as_target['attention_mask']))

    samples = []
    for target_idx, target_msg in enumerate(messages):
        target_user_id = user_ids[target_msg['user']]

        # Position 0 is the CLS token, attributed to the target message's user.
        input_ids = [cls_token_id]
        attention_mask = [1]
        token_type_ids = [1]
        user_type_ids = [target_user_id]

        for idx, msg in enumerate(messages):
            is_target = idx == target_idx
            msg_ids, msg_mask = (target_tokens if is_target else context_tokens)[idx]

            input_ids.extend(msg_ids)
            attention_mask.extend(msg_mask)
            token_type_ids.extend([1 if is_target else 0] * len(msg_ids))
            if WITH_USER_TYPES:
                user_type_ids.extend([user_ids[msg['user']]] * len(msg_ids))

        samples.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'user_type_ids': user_type_ids,
            'value': float(target_msg['value']),
            'user_id': target_user_id,
        })

    return samples


In [None]:
# Load the chats: one row per chat, with its messages in the 'messages' column.
df = pd.read_parquet(PATH)
# DataFrame.info() prints by itself and returns None, so it is not wrapped in print().
df.info()

# Expand every chat into one tokenized sample per message (one row per sample).
df['messages'] = df['messages'].apply(preprocess)
df = df.explode('messages')
# explode() yields NaN for chats without messages; drop them so the field
# extraction below does not fail.
df = df.dropna(subset=['messages'])

# Lift the fields of each sample dict into their own columns
# (target column -> key in the sample dict).
sample_fields = [
    ('input_ids', 'input_ids'),
    ('attention_mask', 'attention_mask'),
    ('token_type_ids', 'token_type_ids'),
]
if WITH_USER_TYPES:
    sample_fields.append(('user_type_ids', 'user_type_ids'))
sample_fields += [('labels', 'value'), ('user_ids', 'user_id')]

for column, key in sample_fields:
    df[column] = df['messages'].apply(lambda sample, key=key: sample[key])

# Drop the original messages column
df = df.drop(columns=['messages'])
# After explode() all samples of a chat share that chat's original index,
# which is kept as the chat identifier.
df = df.reset_index() # drop=True
df = df.rename(columns={'index': 'chat_ids'})

dataset = Dataset.from_pandas(df)
# The training cells refer to the tokenized data as `tokenized_dataset`.
tokenized_dataset = dataset
df.info()
print(df.head())

In [None]:
def print_dataset_info(dataset):
    """Print the dataset summary followed by every field of its first row.

    Only the summary is printed when the dataset is empty.

    Args:
        dataset: Object exposing ``features`` (iterable of field names),
            ``__len__`` and integer row indexing that returns a mapping.
    """
    print(dataset)
    if len(dataset) == 0:
        return

    # Fetch the first row once instead of once per field.
    first_row = dataset[0]
    for field in dataset.features:
        print(f"{field}: {first_row[field]}\n")

# Show the summary and a sample row of the dataset built in the previous cell.
print_dataset_info(dataset)

In [None]:
# limit the dataset to only 2 unique couples
# unique_couples = list(set(tokenized_dataset['couple_ids']))
# selected_couples = unique_couples[:5]  # Take first 2 couples
# print(f"Selected couples: {selected_couples}")

# # Filter dataset to only include samples from selected couples
# filtered_indices = [i for i, couple_id in enumerate(tokenized_dataset['couple_ids']) 
#                    if couple_id in selected_couples]
# tokenized_dataset = tokenized_dataset.select(filtered_indices)

# print(f"Dataset size after filtering to 2 couples: {len(tokenized_dataset)}")
# print_dataset_info(tokenized_dataset)

# 🤖 Fine-Tuning BERT

## 📈 Cross-Validation Training-Evaluation

In [None]:
def confidence_interval(scores, confidence_level=0.95):
    """
    Return the mean of ``scores`` with a two-sided Student-t confidence interval.

    Intended for the per-fold scores of a cross-validation run, to show how
    reliable their mean is.

    Args:
        scores (list or np.ndarray): Scores, one per cross-validation fold.
        confidence_level (float): Desired confidence level (e.g. 0.95 for 95%).

    Returns:
        tuple: ``(mean, lower_bound, upper_bound)``. Both bounds are NaN when
               fewer than two scores are given, since no interval can be
               computed then.
    """
    sample_size = len(scores)
    center = np.mean(scores)

    if sample_size < 2:
        return (center, np.nan, np.nan)

    # Critical value of the t-distribution with n - 1 degrees of freedom,
    # scaled by the standard error of the mean.
    upper_quantile = (1 + confidence_level) / 2.
    half_width = stats.t.ppf(upper_quantile, sample_size - 1) * stats.sem(scores)

    return (center, center - half_width, center + half_width)

def _format_mean_std_ci(scores, confidence):
    """Format scores as ``mean ± std [ci_low, ci_high]`` with two decimals."""
    mean, lower, upper = confidence_interval(scores, confidence)
    return f"{mean:.2f} ± {np.std(scores):.2f} [{lower:.2f}, {upper:.2f}]"


def print_and_save_classification_report_conf_intervals(cv_results, save_path, label_names, confidence=0.95, name="classification_report_with_cv.txt"):
    """Print and save a classification report aggregated over the CV folds.

    Every cell has the form ``mean ± std [lower, upper]``, where the bounds are
    the confidence interval returned by ``confidence_interval``.

    Args:
        cv_results: Mapping (e.g. DataFrame) from metric name to the per-fold
            scores. Required keys: ``test_{precision,recall,f1}_<label>`` for
            every lower-cased label, ``test_accuracy`` and
            ``test_{precision,recall,f1}_{macro,weighted}``. A ``test_cost``
            entry, when present, adds a "Total Cost" line.
        save_path: Directory in which the report file is written.
        label_names: Class names, in the order the rows should appear.
        confidence: Confidence level of the intervals.
        name: File name of the report inside ``save_path``.

    Raises:
        KeyError: If a required metric is missing from ``cv_results``.
    """
    title = f"=== Cross-Validation Results (Mean ± Std [{confidence * 100:.0f}% CI]) ==="

    def metric_row(row_label, key_suffix):
        # One report row: precision, recall and f1 for the given key suffix.
        precision_ci, recall_ci, f1_ci = (
            _format_mean_std_ci(cv_results[f'test_{metric}_{key_suffix}'], confidence)
            for metric in ('precision', 'recall', 'f1')
        )
        return f"{row_label:>14} {precision_ci:>27} {recall_ci:>27} {f1_ci:>27}"

    # Header
    report_lines = [
        f"{'':>14} {'precision':>27} {'recall':>27} {'f1-score':>27}",
        "",
    ]

    # Per-class metrics
    for label in label_names:
        report_lines.append(metric_row(label, label.lower()))
    report_lines.append("")

    # Accuracy (shown in the f1-score column, as in sklearn's text report)
    accuracy_ci = _format_mean_std_ci(cv_results['test_accuracy'], confidence)
    report_lines.append(f"{'accuracy':>14} {'':>27} {'':>27} {accuracy_ci:>27}")

    # Macro and weighted averages
    for avg_type in ('macro', 'weighted'):
        report_lines.append(metric_row(avg_type + ' avg', avg_type))

    # The key that is checked must be the key that is read ('test_cost').
    if 'test_cost' in cv_results:
        cost_ci = _format_mean_std_ci(cv_results['test_cost'], confidence)
        report_lines.append("")
        report_lines.append(f"Total Cost: {cost_ci}")

    report_text = "\n".join(report_lines)

    # The report is fully built before the file is opened, so a missing metric
    # never leaves a half-written file behind.
    with open(os.path.join(save_path, name), "w", encoding="utf-8") as f:
        f.write(f"{title}\n\n")
        f.write(report_text)

    print(f"{title}\n")
    print(report_text)

def plot_confusion_matrices(cms, classes, path=None):
    """
    Plots a single confusion matrix showing mean ± standard deviation for each cell.

    Args:
        cms: Sequence of confusion matrices (one per fold). They are stacked
            and reduced along the first axis, so they are expected to share the
            shape ``(len(classes), len(classes))``.
        classes: List of class names, used for both axes' tick labels.
        path: Optional path to save the plot to; nothing is saved when falsy.
    """
    n_classes = len(classes)

    # Element-wise mean and std across the matrices
    cm_mean = np.mean(cms, axis=0)
    cm_std = np.std(cms, axis=0)

    # Create the plot
    plt.figure(figsize=(7, 5))

    # Create annotations with mean ± std format
    annotations = np.empty_like(cm_mean, dtype=object)
    for i in range(n_classes):
        for j in range(n_classes):
            annotations[i, j] = f'{cm_mean[i, j]:.1f} ± {cm_std[i, j]:.2f}'

    sns.heatmap(
        cm_mean,
        annot=annotations,
        fmt='',  # annotations are already formatted strings
        cmap=sns.color_palette("ch:s=-.2,r=.6", as_cmap=True),
        xticklabels=classes,
        yticklabels=classes,
        cbar_kws={'label': 'Mean Count'}
    )
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title('Confusion Matrix (Mean ± Std)')
    plt.tight_layout()

    if path:
        plt.savefig(path, dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()


In [None]:
def calculate_total_cost(y_true, y_pred, cost_mat):
    """
    Sum the misclassification costs of a set of predictions.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        cost_mat: Square cost matrix; ``cost_mat[i, j]`` is the cost of
                  predicting class j when the true class is i.

    Returns:
        Total cost (scalar).
    """
    # Classes are assumed to be 0 .. num_classes-1, matching the matrix size,
    # so the confusion matrix always has the same shape as the cost matrix.
    class_ids = np.arange(cost_mat.shape[0])
    counts = confusion_matrix(y_true, y_pred, labels=class_ids)

    # Weight every (true, predicted) count by its cost and add everything up.
    weighted_counts = counts * cost_mat
    return weighted_counts.sum()

def multiclass_clf(values):
    """
    Map regression values to three class ids:
    - 0 (toxic):   value <= -0.35
    - 1 (neutral): -0.35 < value < 0.35
    - 2 (healthy): value >= 0.35
    """
    # Split off the healthy range first, then separate toxic from neutral.
    below_healthy = values < 0.35
    toxic_or_neutral = np.where(values <= -0.35, 0, 1)
    return np.where(below_healthy, toxic_or_neutral, 2)

def binary_clf(values):
    """
    Map regression values to two class ids:
    - 0 (toxic):   value < -0.35
    - 1 (healthy): value >= -0.35
    """
    is_toxic = values < -0.35
    return np.where(is_toxic, 0, 1)

def training_compute_metrics(eval_pred):
    """Compute the total misclassification cost of an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of arrays holding regression
            values; both are flattened before being classified.

    Returns:
        dict: ``{'cost': total_cost}``, where predictions and labels are mapped
        to classes with ``multiclass_clf`` and weighted with ``COST_MAT``.
    """
    raw_predictions, raw_labels = eval_pred

    # Regression outputs and targets are turned into class ids before costing.
    predicted_classes = multiclass_clf(raw_predictions.flatten())
    expected_classes = multiclass_clf(raw_labels.flatten())

    return {'cost': calculate_total_cost(expected_classes, predicted_classes, COST_MAT)}

In [None]:
def get_trainer(tokenized_train_set, tokenized_eval_set, out_dir):
    """Build the Trainer used for one cross-validation fold.

    The model is a single-output (regression) BERT sequence classifier loaded
    from ``MODEL_CHECKPOINT``. With ``WITH_USER_TYPES`` the user-type-aware
    model and collator are used (``BertWithUserTypeForSequenceClassification``
    and ``DataCollatorWithUserTypePadding`` are expected to be defined
    elsewhere in the notebook).

    Evaluation, checkpointing and logging happen once per epoch; the checkpoint
    with the lowest ``cost`` is restored at the end and training stops early
    after ``EARLY_STOPPING_PATIENCE`` evaluations without improvement.

    Args:
        tokenized_train_set: Training dataset.
        tokenized_eval_set: Validation dataset used for model selection.
        out_dir: Directory for this run's checkpoints.

    Returns:
        The configured ``Trainer`` (not yet trained).
    """
    if WITH_USER_TYPES:
        # BertConfig is not part of the notebook's top-level transformers
        # import, so it is imported where it is needed.
        from transformers import BertConfig

        data_collator = DataCollatorWithUserTypePadding(tokenizer=tokenizer)
        config = BertConfig.from_pretrained(
            MODEL_CHECKPOINT,
            num_labels=1, # our task is regression (num_labels=1).
        )
        # NOTE(review): assumes at most two distinct users per chat - confirm.
        config.user_type_vocab_size = 2
        model = BertWithUserTypeForSequenceClassification.from_pretrained(
            MODEL_CHECKPOINT,
            config=config
        )
    else:
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
        model = BertForSequenceClassification.from_pretrained(
            MODEL_CHECKPOINT,
            num_labels=1,
        )

    training_args = TrainingArguments(
        output_dir=out_dir,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,

        eval_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=SAVE_TOTAL_LIMIT,
        save_strategy="epoch",
        logging_strategy="epoch",
        # Model selection minimises the misclassification cost.
        metric_for_best_model="cost",
        greater_is_better=False,
        report_to="none",

        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        learning_rate=BODY_LR,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_PERCENTAGE,

        # lr_scheduler_type="linear",
        lr_scheduler_type="reduce_lr_on_plateau",
        lr_scheduler_kwargs=LR_SCHEDULER_KWARGS,

        fp16=torch.cuda.is_available(),
        dataloader_num_workers=NUM_WORKERS,
        # remove_unused_columns=False,

        # Same seeding as the final production run, so that changing
        # GLOBAL_SEED affects cross-validation as well.
        seed=GLOBAL_SEED,
        data_seed=GLOBAL_SEED,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_set, # .remove_columns(["chat_ids", "couple_ids"])
        eval_dataset=tokenized_eval_set, # .remove_columns(["chat_ids", "couple_ids"])
        # tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=training_compute_metrics,
        callbacks=[EarlyStoppingCallback(
            early_stopping_patience=EARLY_STOPPING_PATIENCE,
            # early_stopping_threshold=EARLY_STOPPING_THRESHOLD
        )]
    )

    return trainer

In [None]:
def compute_per_message_report(predictions, labels, clf, target_names, cost_mat=None):
    """Build the per-message classification report and confusion matrix.

    The full set of class ids ``0 .. len(target_names)-1`` is always passed to
    sklearn, so a fold in which some class never occurs still yields a report
    with every class and a confusion matrix of constant shape (needed to
    average the matrices across folds).

    Args:
        predictions: Predicted regression values.
        labels: Ground-truth regression values.
        clf: Function mapping regression values to class ids
            (``0 .. len(target_names)-1``).
        target_names: Class names, indexed by class id.
        cost_mat: Optional cost matrix; when given, the total cost is stored
            under the ``'cost'`` key of the report.

    Returns:
        tuple: ``(report_dict, confusion_matrix)``.
    """
    class_ids = np.arange(len(target_names))

    # Model outputs may carry a trailing singleton dimension; sklearn expects 1-D labels.
    pred_classes = np.ravel(clf(predictions))
    true_classes = np.ravel(clf(labels))

    per_msg_report = classification_report(
        true_classes, pred_classes,
        labels=class_ids,
        target_names=target_names,
        output_dict=True,
        zero_division=0
    )

    if cost_mat is not None:
        per_msg_report['cost'] = calculate_total_cost(true_classes, pred_classes, cost_mat)

    conf_matrix = confusion_matrix(true_classes, pred_classes, labels=class_ids)

    return per_msg_report, conf_matrix

In [None]:
def classify_chat(df, clf):
    """Classify every chat by its lowest per-user mean value.

    Args:
        df: DataFrame with the ``chat_ids``, ``user_ids`` and ``values`` columns.
        clf: Function mapping an array of values to class ids.

    Returns:
        The output of ``clf`` for the per-chat scores, ordered by ``chat_ids``.
    """
    per_user_mean = df.groupby(['chat_ids', 'user_ids'])['values'].mean()
    # A chat is scored by its lowest-scoring user; groupby sorts by chat id,
    # so the resulting array follows the chat id order.
    per_chat_score = per_user_mean.groupby(level='chat_ids').min().to_numpy()
    return clf(per_chat_score)

def compute_per_chat_report(df, clf, target_names, cost_mat=None):
    """Build the per-chat classification report and confusion matrix.

    Chats are classified with ``classify_chat``, once from the predictions and
    once from the labels. The full set of class ids
    ``0 .. len(target_names)-1`` is always passed to sklearn, so a fold in
    which some class never occurs still yields a report with every class and a
    confusion matrix of constant shape.

    Args:
        df: DataFrame with the ``chat_ids``, ``user_ids``, ``labels`` and
            ``predictions`` columns.
        clf: Function mapping regression values to class ids
            (``0 .. len(target_names)-1``).
        target_names: Class names, indexed by class id.
        cost_mat: Optional cost matrix; when given, the total cost is stored
            under the ``'cost'`` key of the report.

    Returns:
        tuple: ``(report_dict, confusion_matrix)``.
    """
    class_ids = np.arange(len(target_names))

    # classify_chat() reads the 'values' column, so each source column is renamed in turn.
    pred_classes = classify_chat(df.drop(columns=['labels']).rename(columns={'predictions': 'values'}), clf)
    true_classes = classify_chat(df.drop(columns=['predictions']).rename(columns={'labels': 'values'}), clf)

    chats_report = classification_report(
        true_classes,
        pred_classes,
        labels=class_ids,
        target_names=target_names,
        output_dict=True,
        zero_division=0
    )
    if cost_mat is not None:
        chats_report['cost'] = calculate_total_cost(true_classes, pred_classes, cost_mat)

    conf_matrix = confusion_matrix(true_classes, pred_classes, labels=class_ids)

    return chats_report, conf_matrix

In [None]:
def report_dict_to_dataframe_dict(report, prefix=""):
    """Flatten a (possibly nested) report dict into a single-level dict.

    Nested entries become ``<prefix><metric>_<group>``, where ``<group>`` is
    the first space-separated word of the outer key (``'macro avg'`` ->
    ``'macro'``); scalar entries become ``<prefix><key>``. Every resulting key
    is lower-cased and has ``'-score'`` removed (``'f1-score'`` -> ``'f1'``).

    Args:
        report: Dict whose values are either scalars or dicts of scalars.
        prefix: String prepended to every generated key.

    Returns:
        dict: The flattened mapping.
    """
    def normalise(column):
        return column.lower().replace('-score', '')

    flat = {}
    for key, value in report.items():
        if not isinstance(value, dict):
            flat[normalise(f"{prefix}{key}")] = value
            continue

        group = key.split(' ')[0]
        for metric, score in value.items():
            flat[normalise(f"{prefix}{metric}_{group}")] = score
    return flat

def compute_clf_metrics(predictions, fold_test_dataset, clf, target_names, prefix="", cost_mat=None):
    """Compute per-message and per-chat classification metrics for one fold.

    Args:
        predictions: Prediction output exposing the raw model outputs as
            ``.predictions``.
        fold_test_dataset: Test split providing the ``labels``, ``chat_ids``
            and ``user_ids`` columns.
        clf: Function mapping regression values to class ids.
        target_names: Class names, indexed by class id.
        prefix: String prepended to every metric key.
        cost_mat: Optional cost matrix forwarded to both report builders.

    Returns:
        dict: Flattened metrics keyed ``<prefix>per_msg_*`` and
        ``<prefix>per_chat_*``, plus the two confusion matrices under
        ``<prefix>per_msg_conf_matrix`` and ``<prefix>per_chat_conf_matrix``.
    """
    raw_outputs = predictions.predictions

    message_report, message_cm = compute_per_message_report(
        raw_outputs,
        fold_test_dataset['labels'],
        clf,
        target_names,
        cost_mat=cost_mat
    )

    # Per-chat aggregation needs the chat and user each message belongs to.
    chat_frame = pd.DataFrame({
        'chat_ids': fold_test_dataset['chat_ids'],
        'user_ids': fold_test_dataset['user_ids'],
        'labels': fold_test_dataset['labels'],
        'predictions': raw_outputs.flatten()
    })
    chat_report, chat_cm = compute_per_chat_report(
        chat_frame,
        clf,
        target_names,
        cost_mat=cost_mat
    )

    metrics = {
        **report_dict_to_dataframe_dict(message_report, prefix=f"{prefix}per_msg_"),
        **report_dict_to_dataframe_dict(chat_report, prefix=f"{prefix}per_chat_"),
    }
    metrics[f'{prefix}per_msg_conf_matrix'] = message_cm
    metrics[f'{prefix}per_chat_conf_matrix'] = chat_cm
    return metrics

def evaluate(predictions, fold_test_dataset):
    """Compute all test metrics of one fold.

    Combines the multiclass metrics (with ``COST_MAT``), the binary metrics and
    the regression metrics ``mse``, ``mae``, ``rmse``, ``corr_coef``, ``rse``,
    ``rae`` and ``rrmse``. The relative errors use the mean of the labels as
    the baseline prediction.

    Args:
        predictions: Prediction output exposing the raw model outputs as
            ``.predictions``.
        fold_test_dataset: Test split; its ``labels`` column must support
            ``.cpu().numpy()``.

    Returns:
        dict: All metrics of the fold.
    """
    metrics = compute_clf_metrics(
        predictions,
        fold_test_dataset,
        multiclass_clf,
        MULTICLASS_TARGET_NAMES,
        "multiclass_",
        cost_mat=COST_MAT
    )
    binary_metrics = compute_clf_metrics(
        predictions,
        fold_test_dataset,
        binary_clf,
        BINARY_TARGET_NAMES,
        "binary_"
    )
    metrics.update(binary_metrics)

    y_true = fold_test_dataset['labels'].cpu().numpy()  # tensor -> numpy
    y_pred = predictions.predictions.flatten()

    squared_errors = (y_pred - y_true) ** 2
    absolute_errors = np.abs(y_pred - y_true)
    # Deviations of the labels from the baseline that always predicts their mean.
    baseline_deviation = y_true - np.mean(y_true)

    # Absolute error metrics and linear correlation
    metrics['mse'] = np.mean(squared_errors)
    metrics['mae'] = np.mean(absolute_errors)
    metrics['rmse'] = np.sqrt(metrics['mse'])
    metrics['corr_coef'] = np.corrcoef(y_pred, y_true)[0, 1]

    # Errors relative to the mean-label baseline
    metrics['rse'] = np.sum(squared_errors) / np.sum(baseline_deviation ** 2)
    metrics['rae'] = np.sum(absolute_errors) / np.sum(np.abs(baseline_deviation))
    metrics['rrmse'] = np.sqrt(metrics['rse'])

    return metrics

def do_cross_validation(outer_cv, tokenized_dataset):
    """Train and evaluate one model per fold of a grouped cross-validation.

    All splits are grouped by ``couple_ids``. Within every outer fold the
    training part is split once more into train and validation sets (with
    ``TEST_SIZE`` as the validation fraction); the model is trained with the
    validation set for model selection and then evaluated on the fold's test
    set. The fold's checkpoint directory is deleted afterwards.

    Args:
        outer_cv: Splitter providing ``split(X, y, groups)`` and
            ``get_n_splits()``.
        tokenized_dataset: Dataset with the model inputs, ``labels`` and the
            ``couple_ids``, ``chat_ids`` and ``user_ids`` bookkeeping columns.

    Returns:
        tuple: ``(metrics, log_histories)`` - a DataFrame with one row of
        metrics per fold and the list of per-fold trainer log histories.
    """
    fold_metrics = []
    fold_log_histories = []

    frame = tokenized_dataset.to_pandas()
    outer_splits = outer_cv.split(
        X=frame[['input_ids', 'token_type_ids', 'attention_mask']],
        y=frame['labels'],
        groups=frame['couple_ids']
    )

    for fold_number, (outer_train_idx, outer_test_idx) in enumerate(outer_splits, start=1):
        print(f"Starting fold {fold_number}/{outer_cv.get_n_splits()}")

        # Outer split: training pool vs. held-out test set
        train_pool = tokenized_dataset.select(outer_train_idx)
        test_split = tokenized_dataset.select(outer_test_idx)

        # Inner split of the training pool: train vs. validation
        inner_splitter = GroupShuffleSplit(n_splits=1, test_size=TEST_SIZE, random_state=GLOBAL_SEED)
        inner_train_idx, inner_eval_idx = next(inner_splitter.split(
            X=train_pool,
            y=train_pool['labels'],
            groups=train_pool['couple_ids']
        ))
        eval_split = train_pool.select(inner_eval_idx)
        train_split = train_pool.select(inner_train_idx)

        # The model only sees its input columns; the test split keeps
        # chat_ids / user_ids because the per-chat metrics need them.
        train_split = train_split.remove_columns(['couple_ids', "chat_ids", "user_ids"])
        eval_split = eval_split.remove_columns(['couple_ids', "chat_ids", "user_ids"])
        test_split = test_split.remove_columns(['couple_ids'])

        # Return PyTorch tensors from all splits
        for split in (train_split, eval_split, test_split):
            split.set_format("torch")

        # Each fold writes its checkpoints to its own directory
        fold_output_dir = os.path.join(OUT_DIR, f"fold_{fold_number}")
        trainer = get_trainer(train_split, eval_split, fold_output_dir)
        trainer.train()

        # Test-set evaluation
        predictions = trainer.predict(test_split.remove_columns(["chat_ids", "user_ids"]))
        fold_metrics.append(evaluate(predictions, test_split))

        fold_log_histories.append(trainer.state.log_history)

        print(f"Cleaning up checkpoint directory: {fold_output_dir}")
        shutil.rmtree(fold_output_dir, ignore_errors=True)

    return pd.DataFrame(fold_metrics), fold_log_histories

In [None]:
# Make sure the results directory exists before anything is written to it.
os.makedirs(RESULTS_PATH, exist_ok=True)

# Grouped K-fold as the outer splitter (the grouping column is chosen inside
# do_cross_validation).
outer_cv = GroupKFold(n_splits=N_FOLDS) # , shuffle=True, random_state=GLOBAL_SEED
# cv_results: one row of metrics per fold; all_log_histories: one trainer log per fold.
# NOTE(review): confirm `tokenized_dataset` is defined before this cell runs.
cv_results, all_log_histories = do_cross_validation(outer_cv, tokenized_dataset)

In [None]:
def render_cv_results(cv_results, target_names, out_dir):
    """Write the CSV, text reports and confusion-matrix plots of a CV run.

    Args:
        cv_results: DataFrame with one row per fold and ``per_msg_*`` /
            ``per_chat_*`` metric columns, including ``per_msg_conf_matrix``
            and ``per_chat_conf_matrix``.
        target_names: Class names used in the reports and plots.
        out_dir: Existing directory that receives all output files.
    """
    # Confusion matrices are arrays, so they are left out of the CSV.
    scalar_results = cv_results.drop(columns=['per_msg_conf_matrix', 'per_chat_conf_matrix'])
    scalar_results.to_csv(os.path.join(out_dir, "cv_results.csv"), index=False)

    def select_level(level_prefix):
        # Keep one aggregation level and rename its columns to the `test_*`
        # names expected by the report writer.
        subset = cv_results.filter(like=level_prefix)
        subset.columns = [re.sub(rf'^{level_prefix}', 'test_', col) for col in subset.columns]
        return subset

    results_by_level = {
        "per_msg_": select_level("per_msg_"),
        "per_chat_": select_level("per_chat_"),
    }

    report_files = (
        ("per_msg_", "per_message_classification_report_with_cv.txt"),
        ("per_chat_", "per_chat_classification_report_with_cv.txt"),
    )
    for level_prefix, file_name in report_files:
        print_and_save_classification_report_conf_intervals(
            results_by_level[level_prefix].drop(columns=['test_conf_matrix']),
            out_dir,
            target_names,
            name=file_name
        )

    plot_files = (
        ("per_msg_", "per_msg_confusion_matrix_cv.png"),
        ("per_chat_", "per_chat_confusion_matrix_cv.png"),
    )
    for level_prefix, file_name in plot_files:
        plot_confusion_matrices(
            results_by_level[level_prefix]['test_conf_matrix'].tolist(),
            target_names,
            path=os.path.join(out_dir, file_name)
        )

In [None]:
# Multiclass results: select the columns, strip the prefix and render them.
multiclass_cv_results = cv_results.filter(like='multiclass')
multiclass_cv_results.columns = [re.sub(r'^multiclass_', '', col) for col in multiclass_cv_results.columns]
os.makedirs(os.path.join(RESULTS_PATH, 'multiclass'), exist_ok=True)
render_cv_results(
    multiclass_cv_results,
    MULTICLASS_TARGET_NAMES,
    os.path.join(RESULTS_PATH, 'multiclass')
)

# Binary results: same procedure.
binary_cv_results = cv_results.filter(like='binary')
binary_cv_results.columns = [re.sub(r'^binary_', '', col) for col in binary_cv_results.columns]
os.makedirs(os.path.join(RESULTS_PATH, 'binary'), exist_ok=True)
render_cv_results(
    binary_cv_results,
    BINARY_TARGET_NAMES,
    os.path.join(RESULTS_PATH, 'binary')
)

plot_aggregated_curves(all_log_histories, RESULTS_PATH)

# Regression metrics across folds: mean ± std [95% confidence interval].
# NOTE(review): Series.std() is the sample standard deviation (ddof=1), while
# the classification reports use np.std (ddof=0) - confirm which is intended.
with open(os.path.join(RESULTS_PATH, "regression_metrics.txt"), "w", encoding="utf-8") as f:
    f.write("=== Regression Metrics ===\n\n")
    print("=== Regression Metrics ===\n")
    for metric in ['mse', 'mae', 'rmse', 'corr_coef', 'rse', 'rae', 'rrmse']:
        mean_val, lower, upper = confidence_interval(cv_results[metric], confidence_level=0.95)
        std_val = cv_results[metric].std()
        # Format once so the file and the console always show the same text.
        metric_line = f"{metric}: {mean_val:.4f} ± {std_val:.4f} [{lower:.4f}, {upper:.4f}]"
        f.write(f"{metric_line}\n")
        print(metric_line)

## 🤖 Production BERT Model

In [None]:
def choose_best_nepochs(all_log_histories):
    """Return the epoch with the lowest mean evaluation cost across folds.

    Args:
        all_log_histories: Iterable of log histories (one per fold); each is an
            iterable of dicts. Only entries with both an ``'eval_cost'`` and a
            non-None ``'epoch'`` are used; epochs are rounded to integers.

    Returns:
        int: The epoch whose mean ``eval_cost`` is minimal (the earliest one on
        ties).
    """
    costs_per_epoch = defaultdict(list)
    scored_logs = (
        log
        for history in all_log_histories
        for log in history
        if 'eval_cost' in log and log.get('epoch') is not None
    )
    for log in scored_logs:
        costs_per_epoch[int(round(log['epoch']))].append(log['eval_cost'])

    ordered_epochs = sorted(costs_per_epoch)
    mean_costs = [np.mean(costs_per_epoch[epoch]) for epoch in ordered_epochs]
    best_position = np.argmin(mean_costs)
    optimal_epochs = ordered_epochs[best_position]

    print(f"Optimal number of epochs (min cost): {optimal_epochs}")
    print(f"Mean cost at optimal epoch: {min(mean_costs):.4f}")

    return optimal_epochs

In [None]:
# Train the production model on the full dataset, for the number of epochs
# that minimised the mean validation cost during cross-validation.

# 1. Determine Optimal Number of Epochs
optimal_epochs = choose_best_nepochs(all_log_histories)

# 2. Prepare Full Dataset
# Drop the bookkeeping columns that are not model inputs and return tensors.
final_train_dataset = tokenized_dataset.remove_columns(['couple_ids', "chat_ids", "user_ids"])
final_train_dataset.set_format("torch")

# 3. Configure Trainer for Final Run
FINAL_MODEL_DIR = os.path.join(OUT_DIR, "final_production_model")
os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

# Same model / collator selection as in get_trainer().
if WITH_USER_TYPES:
    # NOTE(review): DataCollatorWithUserTypePadding, BertConfig and
    # BertWithUserTypeForSequenceClassification are not defined or imported in
    # the visible cells - confirm they are available before this cell runs.
    data_collator = DataCollatorWithUserTypePadding(tokenizer=tokenizer)
    config = BertConfig.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=1, # our task is regression (num_labels=1).
    )
    # NOTE(review): assumes at most two distinct users per chat - confirm.
    config.user_type_vocab_size = 2
    final_model = BertWithUserTypeForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT,
        config=config
    )
else:
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    final_model = BertForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=1,
    )

# No evaluation, early stopping or checkpointing here: training simply runs for
# the fixed number of epochs. Unlike get_trainer(), no lr_scheduler_type is
# passed, so the library's default learning-rate schedule applies.
final_training_args = TrainingArguments(
    output_dir=FINAL_MODEL_DIR,
    num_train_epochs=optimal_epochs, # Train for the optimal number of epochs
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=BODY_LR,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_PERCENTAGE,
    logging_strategy="epoch",
    save_strategy="no", # We will save manually at the end
    report_to="none",
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=NUM_WORKERS,
    seed=GLOBAL_SEED,
    data_seed=GLOBAL_SEED
)

final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=final_train_dataset,
    # No eval_dataset needed for the final run
    data_collator=data_collator,
)

final_trainer.train()

# Save the model and the tokenizer to the same directory.
final_trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

In [None]:
# When a /kaggle path exists, pack the whole run directory into
# "<OUT_DIR>.zip" next to it.
if os.path.exists("/kaggle"):
    shutil.make_archive(OUT_DIR, 'zip', OUT_DIR)
    # shutil.rmtree(OUT_DIR)