In [None]:
# =========================================================================================
#
#  Notebook:         Model Training
#  Competition:      Fragments of Feeling: A Contextual Emotion Contest
#  Author:           Md. Abdur Rahman
#
# =========================================================================================
#
# ### Notebook Overview
#
# > Welcome to my training notebook! In this notebook, I will walk you through the entire
# > process of training my emotion classification models for this competition. My strategy
# > revolves around using a powerful pre-trained transformer model, `roberta-large`, and
# > fine-tuning it on the provided data using a robust cross-validation setup. I've also
# > incorporated advanced techniques like Layer-wise Learning Rate Decay (LLRD) to
# > maximize performance. Let's get started!
#
# =========================================================================================


# =========================================================================================
# Imports
# =========================================================================================
#
# > First things first, I'm importing all the necessary libraries. I'll need `pandas` and
# > `numpy` for data manipulation, `torch` for the deep learning framework, and the
# > amazing `transformers` and `datasets` libraries from Hugging Face for the model,
# > tokenizer, and training pipeline. I'm also bringing in `sklearn` for my
# > cross-validation strategy and metrics.
#
# -----------------------------------------------------------------------------------------
import os
import random
import pandas as pd
import numpy as np
import torch
from torch.optim import AdamW
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import gc
import shutil
import warnings

# Silence every Python warning for the rest of the session to keep the notebook output clean.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings raised by the
# libraries imported above are hidden as well.
warnings.filterwarnings("ignore")

# =========================================================================================
# Configuration
# =========================================================================================
#
# > I believe a good practice is to keep all my important parameters and settings in one
# > place. This configuration class, `CFG`, acts as a central control panel for my
# > experiment. It makes it easy to read, modify, and reproduce my work.
#
# -----------------------------------------------------------------------------------------
class CFG:
    """Central control panel: every tunable setting of the experiment lives here."""

    # --- Reproducibility ---
    # Single seed handed to all random number generators.
    seed = 42

    # --- Paths ---
    # Training CSV to read, and the directory that receives the trained fold models.
    train_path = "/kaggle/input/fragments-of-feeling/train_emotions.csv"
    output_model_dir = "/kaggle/working/models/"

    # --- Model & Tokenizer ---
    # Hugging Face checkpoint used for both the tokenizer and the classifier.
    model_name = "FacebookAI/roberta-large"
    # Upper bound on tokens per example; longer inputs are truncated by the tokenizer call.
    max_len = 256

    # --- Training ---
    # Number of cross-validation folds and number of epochs per fold.
    n_splits, num_epochs = 5, 5
    # Per-device batch sizes for training and evaluation.
    train_batch_size, eval_batch_size = 8, 16
    # Step interval used for both evaluation and checkpoint saving.
    eval_steps = 500

    # --- Optimizer & Scheduler ---
    # `encoder_lr` is the rate of the top encoder layer; each layer below it is scaled
    # down by `llrd_decay_rate`. `decoder_lr` is the rate of the classifier head.
    encoder_lr = 1e-5
    decoder_lr = 1e-5
    llrd_decay_rate = 0.9
    weight_decay = 0.01
    # Label smoothing strength passed to the training arguments.
    label_smoothing_factor = 0.1

# =========================================================================================
# Reproducibility
# =========================================================================================
#
# > To ensure my experiments are consistent, I'm defining this helper function
# > `seed_everything`. It sets the seed for `random`, `numpy`, and `torch` on both
# > CPU and CUDA.
#
# -----------------------------------------------------------------------------------------
def seed_everything(seed):
    """
    Seed every random number generator this notebook relies on.

    Covers Python's `random`, the `PYTHONHASHSEED` environment variable, NumPy and
    PyTorch (CPU and CUDA), and puts cuDNN into deterministic mode with its
    benchmark auto-tuner switched off.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # All four seeders take the seed as their only argument, so call them uniformly.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    # Trade some speed for repeatable cuDNN kernel selection.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed all RNGs once, before the fold assignment and model initialization further down.
seed_everything(CFG.seed)
print(f"Seeds set to {CFG.seed}")

# Create the directory for the trained models; exist_ok=True makes re-running this cell harmless.
os.makedirs(CFG.output_model_dir, exist_ok=True)

# =========================================================================================
# Load and Preprocess Data
# =========================================================================================
#
# > Now, let's get our hands on the data. I'll load the training CSV and perform
# > some essential preprocessing steps to get it ready for the model.
#
# -----------------------------------------------------------------------------------------
print("Loading and preprocessing data...")
train_df = pd.read_csv(CFG.train_path)

# Core feature engineering: prepend the title to the sentence so the model sees the
# sentence together with its context.
#
# Missing values are replaced by empty strings first. Without this, NaN + str evaluates
# to NaN and the later string cast turns the whole row into the text 'nan', throwing
# away whichever of the two fields was actually present.
#
# NOTE: the ' [SEP] ' marker is kept exactly as it was so the text format stays
# compatible with anything that consumes these models. RoBERTa's own separator token
# is `</s>`, so this marker is tokenized as ordinary text, not as a special token.
title_text = train_df['title'].fillna('').astype(str)
sentence_text = train_df['sentence'].fillna('').astype(str)
train_df['text'] = title_text + ' [SEP] ' + sentence_text

# The `transformers` Trainer expects the target column to be named 'label',
# so 'emotion_int' is renamed for compatibility.
train_df = train_df.rename(columns={'emotion_int': 'label'})

# Mappings between integer class ids and emotion names, stored in the model config.
# The ids here are simply the alphabetical rank of each distinct 'emotion_final' value.
# NOTE(review): this assumes 'emotion_int' was assigned in that same alphabetical
# order; confirm against the dataset, otherwise the names attached to the saved
# models will not match the integer labels they were trained on.
id2label = dict(enumerate(train_df['emotion_final'].astype('category').cat.categories))
label2id = {v: k for k, v in id2label.items()}
num_labels = len(id2label)

print(f"Number of labels: {num_labels}")
print(f"Labels: {id2label}")

# =========================================================================================
# Cross-Validation Setup
# =========================================================================================
#
# > To build a robust model and get a reliable evaluation of its performance, I'm using
# > k-fold cross-validation. I've chosen `StratifiedKFold` specifically because it
# > ensures that the proportion of each emotion class is the same in each fold. This is
# > crucial for potentially imbalanced datasets like this one.
#
# -----------------------------------------------------------------------------------------
print("Setting up cross-validation folds...")
skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)

# Assign every row to exactly one validation fold.
# `skf.split` yields *positional* row numbers, so the fold ids are written into a plain
# NumPy array by position and attached as a column afterwards. Writing them through
# label-based `.loc` is only correct while the DataFrame has a default 0..n-1 index.
fold_ids = np.full(len(train_df), -1, dtype=np.int64)
for fold, (_, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
    fold_ids[val_idx] = fold
train_df['fold'] = fold_ids

print("Folds created successfully.")
print(train_df['fold'].value_counts())

# =========================================================================================
# Tokenizer, Metrics, and Helper Functions
# =========================================================================================
#
# > Here, I'll set up some core components for the training process. This includes
# > loading the tokenizer and defining my evaluation metric.
#
# -----------------------------------------------------------------------------------------
print(f"Initializing tokenizer: {CFG.model_name}")
# Load the tokenizer that belongs to the checkpoint named in CFG.model_name. It is created
# once at module level and shared by tokenize_function, the data collator and every fold.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

def compute_metrics(eval_pred):
    """
    Metric callback handed to the `Trainer`.

    `eval_pred` unpacks into (logits, labels). The predicted class of each example
    is the position of its largest logit, and the result is reported as the
    macro-averaged F1 score under the key "f1_macro".
    """
    raw_scores, true_labels = eval_pred
    predicted_classes = np.asarray(raw_scores).argmax(axis=-1)
    macro_f1 = f1_score(true_labels, predicted_classes, average="macro")
    return {"f1_macro": macro_f1}

def tokenize_function(examples):
    """
    Tokenize the 'text' field of a batch with the module-level tokenizer.

    Inputs longer than `CFG.max_len` tokens are truncated. No padding is applied here.
    """
    texts = examples['text']
    encoded = tokenizer(texts, max_length=CFG.max_len, truncation=True)
    return encoded
    
# =========================================================================================
# Layer-wise Learning Rate Decay (LLRD) 
# =========================================================================================
#
# > This is one of the more advanced techniques I'm using. Instead of a single learning
# > rate for the whole model, Layer-wise Learning Rate Decay (LLRD) allows me to set
# > different learning rates for different layers.
# >
# > The intuition is that the lower layers of a transformer (like embeddings) learn
# > general language features and should be fine-tuned more gently (with a lower LR),
# > while the higher, more task-specific layers can be trained more aggressively (with a higher LR).
#
# -----------------------------------------------------------------------------------------
def get_llrd_optimizer(model, encoder_lr, decoder_lr, weight_decay, decay_rate):
    """
    Build an AdamW optimizer with layer-wise learning rate decay (LLRD).

    Every parameter gets its own parameter group. The learning rate is chosen from
    the parameter's name:

    * names containing "classifier" or "pooler" use `decoder_lr`;
    * names containing "embeddings" use the lowest rate,
      `encoder_lr * decay_rate ** num_layers`;
    * names containing "encoder.layer.<k>." use
      `encoder_lr * decay_rate ** (num_layers - (k + 1))`, so the top layer trains
      at exactly `encoder_lr`;
    * anything else uses the undecayed `encoder_lr`. Previously such parameters were
      silently left out of the optimizer and therefore never updated.

    Args:
        model: module exposing `named_parameters()` and `config.num_hidden_layers`.
        encoder_lr: learning rate of the top encoder layer.
        decoder_lr: learning rate of the classification head.
        weight_decay: weight decay applied to every parameter group.
        decay_rate: multiplicative factor applied per layer going down the encoder.

    Returns:
        A `torch.optim.AdamW` instance. Encoder groups come first (in parameter
        order), followed by the head groups.
    """
    num_layers = model.config.num_hidden_layers

    # Index 0 is the embeddings, index k is encoder layer k-1; the last entry equals encoder_lr.
    lr_rates = [encoder_lr * (decay_rate ** (num_layers - depth)) for depth in range(num_layers + 1)]

    def _group(param, lr):
        # One parameter group per tensor, all sharing the same weight decay.
        return {"params": [param], "lr": lr, "weight_decay": weight_decay}

    encoder_groups = []
    head_groups = []
    for name, param in model.named_parameters():
        if "classifier" in name or "pooler" in name:
            head_groups.append(_group(param, decoder_lr))
        elif "embeddings" in name:
            encoder_groups.append(_group(param, lr_rates[0]))
        elif "encoder.layer." in name:
            layer_index = int(name.split("encoder.layer.")[1].split(".")[0])
            encoder_groups.append(_group(param, lr_rates[layer_index + 1]))
        else:
            # Not part of a numbered layer, the embeddings or the head: train it at the
            # undecayed encoder rate so it is not dropped from the optimizer.
            encoder_groups.append(_group(param, encoder_lr))

    # AdamW is the standard optimizer for fine-tuning transformers.
    return AdamW(encoder_groups + head_groups)

# =========================================================================================
# Training Loop
# =========================================================================================
#
# > This is the main event! I will now loop through each of my 5 cross-validation folds.
# > In each fold, I'll fine-tune a fresh copy of the pre-trained model, save its best
# > version, and generate out-of-fold (OOF) predictions for later evaluation.
#
# -----------------------------------------------------------------------------------------
# Out-of-fold logits: one row per training example, in the row order of `train_df`.
oof_preds = np.zeros((len(train_df), num_labels))

for fold in range(CFG.n_splits):
    print("\n" + "="*50)
    print(f"==========       Fold: {fold}       ===========")
    print("="*50)

    # --- Data Splitting ---
    # The current fold is the validation set; all other folds form the training set.
    is_val = (train_df['fold'] == fold).to_numpy()
    # Row positions of the validation examples inside the full DataFrame. They must be
    # captured BEFORE reset_index: afterwards each fold's index restarts at 0, and using
    # it to index `oof_preds` makes every fold overwrite the same leading rows while the
    # rest of the matrix stays zero.
    val_positions = np.flatnonzero(is_val)
    train_fold_df = train_df[~is_val].reset_index(drop=True)
    val_fold_df = train_df[is_val].reset_index(drop=True)

    # Convert the pandas DataFrames to Hugging Face `Dataset` objects.
    train_ds = Dataset.from_pandas(train_fold_df)
    val_ds = Dataset.from_pandas(val_fold_df)

    # --- Tokenization ---
    # Tokenize and drop every original column except 'label', leaving only the
    # tokenizer outputs and the target.
    train_tokenized_dataset = train_ds.map(
        tokenize_function,
        batched=True,
        remove_columns=[c for c in train_ds.column_names if c != 'label'],
    )
    val_tokenized_dataset = val_ds.map(
        tokenize_function,
        batched=True,
        remove_columns=[c for c in val_ds.column_names if c != 'label'],
    )

    # --- Model Initialization ---
    # Every fold starts from a fresh copy of the pre-trained checkpoint. The label
    # mappings are stored in the model config; `ignore_mismatched_sizes=True` allows the
    # classification head to be re-created with `num_labels` outputs.
    model = AutoModelForSequenceClassification.from_pretrained(
        CFG.model_name, num_labels=num_labels, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
    )

    # --- Training Arguments ---
    # Scratch directory for this fold's intermediate checkpoints; removed during cleanup.
    fold_tmp_dir = f'./temp_results_fold_{fold}'
    training_args = TrainingArguments(
        output_dir=fold_tmp_dir,
        learning_rate=CFG.encoder_lr,  # Default only; the custom optimizer sets per-group rates.
        per_device_train_batch_size=CFG.train_batch_size,
        per_device_eval_batch_size=CFG.eval_batch_size,
        num_train_epochs=CFG.num_epochs,
        weight_decay=CFG.weight_decay,
        eval_strategy="steps",         # Evaluate periodically during training.
        eval_steps=CFG.eval_steps,
        save_strategy="steps",         # Save checkpoints on the same schedule as evaluation.
        save_steps=CFG.eval_steps,
        load_best_model_at_end=True,   # Reload the best checkpoint once training finishes.
        metric_for_best_model="f1_macro",
        greater_is_better=True,        # A higher F1 score is better.
        report_to="none",              # No external experiment tracking for this run.
        fp16=True,                     # Mixed-precision training for speed and memory.
        gradient_accumulation_steps=2, # Effective train batch of 2 x train_batch_size per device.
        save_total_limit=1,            # Cap the number of checkpoints kept on disk.
        label_smoothing_factor=CFG.label_smoothing_factor,
    )

    # --- LLRD Optimizer ---
    llrd_optimizer = get_llrd_optimizer(
        model, CFG.encoder_lr, CFG.decoder_lr, CFG.weight_decay, CFG.llrd_decay_rate
    )

    # --- Trainer ---
    # The Trainer receives the custom optimizer; passing None for the scheduler lets it
    # create its default one. `DataCollatorWithPadding` pads each batch dynamically to
    # the longest sequence in that batch.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized_dataset,
        eval_dataset=val_tokenized_dataset,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        optimizers=(llrd_optimizer, None),
    )

    # --- Train the model ---
    print("Starting training...")
    trainer.train()

    # --- Save the best model for this fold ---
    # With load_best_model_at_end=True the trainer now holds the best checkpoint
    # (by f1_macro). Save it together with the tokenizer in the final output directory.
    fold_model_path = os.path.join(CFG.output_model_dir, f"fold_{fold}")
    trainer.save_model(fold_model_path)
    tokenizer.save_pretrained(fold_model_path)
    print(f"Best model for fold {fold} saved to {fold_model_path}")

    # --- OOF Predictions ---
    # Predict on this fold's validation set and write the logits back to the rows those
    # examples occupy in the full training set.
    print("Generating out-of-fold predictions...")
    val_preds = trainer.predict(val_tokenized_dataset).predictions
    oof_preds[val_positions] = val_preds

    # --- Cleanup ---
    # Free disk and GPU memory before the next fold: remove the temporary checkpoints,
    # drop the large objects, then clear the CUDA cache and run the garbage collector.
    shutil.rmtree(fold_tmp_dir)
    del model, trainer, llrd_optimizer
    torch.cuda.empty_cache()
    gc.collect()

# =========================================================================================
# OOF Score Calculation
# =========================================================================================
#
# > After the loop has finished training a model for each fold, I can calculate my
# > overall OOF score. This score is calculated using the predictions made for each
# > data point when it was in the validation set. It's a very reliable estimate of
# > how my model will perform on the unseen test data because no model was ever
# > trained on the data it's being evaluated on.
#
# -----------------------------------------------------------------------------------------
print("\n" + "="*50)
print("Calculating Overall OOF Score...")
# Ground-truth labels in DataFrame row order, and the highest-scoring class of each row
# of the OOF matrix. The score is only meaningful if row i of `oof_preds` holds the
# prediction for row i of `train_df`.
oof_labels = train_df['label'].to_numpy()
oof_predictions = oof_preds.argmax(axis=1)
overall_f1 = f1_score(y_true=oof_labels, y_pred=oof_predictions, average='macro')
print(f"Overall Out-of-Fold F1 Score: {overall_f1:.5f}")

# =========================================================================================
# Final Words
# =========================================================================================
#
# > That concludes my training process! I have successfully trained 5 separate models, one
# > for each fold, and saved them to disk. My reliable OOF F1 score gives me confidence
# > in this approach.
# >
# > The next step is to create a Kaggle Dataset from the '/kaggle/working/models/' directory.
# > This dataset will then be used in my inference notebook to make predictions on the
# > test set.
#
# -----------------------------------------------------------------------------------------

2025-08-11 06:29:40.941642: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754893781.127288      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754893781.186734      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Seeds set to 42
Loading and preprocessing data...
Number of labels: 8
Labels: {0: 'anger', 1: 'brain dysfunction', 2: 'emptiness', 3: 'hopelessness', 4: 'loneliness', 5: 'sadness', 6: 'suicide intent', 7: 'worthlessness'}
Setting up cross-validation folds...
Folds created successfully.
fold
1    4564
3    4564
2    4564
0    4564
4    4564
Name: count, dtype: int64
Initializing tokenizer: FacebookAI/roberta-large


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]




Map:   0%|          | 0/18256 [00:00<?, ? examples/s]

Map:   0%|          | 0/4564 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Step,Training Loss,Validation Loss,F1 Macro
500,1.6637,1.356575,0.544542
1000,1.3261,1.264876,0.583018
1500,1.2358,1.225759,0.609069
2000,1.2063,1.223383,0.616612
2500,1.1371,1.212381,0.62941
3000,1.0976,1.209648,0.634389
3500,1.0636,1.201744,0.64911
4000,1.0051,1.193256,0.65257
4500,0.9969,1.189922,0.651016
5000,0.9399,1.196933,0.649998


Best model for fold 0 saved to /kaggle/working/models/fold_0
Generating out-of-fold predictions...





Map:   0%|          | 0/18256 [00:00<?, ? examples/s]

Map:   0%|          | 0/4564 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Step,Training Loss,Validation Loss,F1 Macro
500,1.6423,1.381775,0.523768
1000,1.3305,1.266922,0.587495
1500,1.2228,1.253156,0.601403
2000,1.1877,1.242843,0.613826
2500,1.1401,1.243111,0.613812
3000,1.0854,1.242403,0.625679
3500,1.0579,1.227761,0.626131
4000,0.9911,1.229202,0.63569
4500,0.9805,1.234212,0.634139
5000,0.9438,1.232245,0.637532


Best model for fold 1 saved to /kaggle/working/models/fold_1
Generating out-of-fold predictions...





Map:   0%|          | 0/18256 [00:00<?, ? examples/s]

Map:   0%|          | 0/4564 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Step,Training Loss,Validation Loss,F1 Macro
500,1.7847,1.448791,0.422361
1000,1.3466,1.304773,0.559612
1500,1.2444,1.262462,0.594918
2000,1.2,1.24714,0.600867
2500,1.1605,1.234481,0.614209
3000,1.0843,1.233496,0.624653
3500,1.0714,1.241267,0.621818
4000,0.9999,1.223221,0.630947
4500,1.0035,1.223062,0.63227
5000,0.957,1.225738,0.633977


Best model for fold 2 saved to /kaggle/working/models/fold_2
Generating out-of-fold predictions...





Map:   0%|          | 0/18256 [00:00<?, ? examples/s]

Map:   0%|          | 0/4564 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Step,Training Loss,Validation Loss,F1 Macro
500,1.6798,1.334916,0.515377
1000,1.3219,1.274589,0.590838
1500,1.2274,1.243061,0.604114
2000,1.1999,1.212981,0.625747
2500,1.132,1.21053,0.622942
3000,1.0955,1.205012,0.628892
3500,1.0676,1.197497,0.637522
4000,0.999,1.206664,0.638898
4500,0.9961,1.192133,0.645265
5000,0.9503,1.196486,0.649908


Best model for fold 3 saved to /kaggle/working/models/fold_3
Generating out-of-fold predictions...





Map:   0%|          | 0/18256 [00:00<?, ? examples/s]

Map:   0%|          | 0/4564 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Step,Training Loss,Validation Loss,F1 Macro
500,1.685,1.374036,0.52053
1000,1.3301,1.276313,0.580847
1500,1.2496,1.243703,0.604014
2000,1.1858,1.251642,0.608391
2500,1.1373,1.229368,0.623823
3000,1.0811,1.225147,0.629897
3500,1.0601,1.216304,0.63613
4000,0.9816,1.222151,0.64337
4500,0.9928,1.206807,0.63831
5000,0.9415,1.212418,0.646662


Best model for fold 4 saved to /kaggle/working/models/fold_4
Generating out-of-fold predictions...



Calculating Overall OOF Score...
Overall Out-of-Fold F1 Score: 0.08437

Training complete. All fold models saved.
Models are located in: /kaggle/working/models/
Please create a Kaggle Dataset from this output directory to use in the inference notebook.
