In [170]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
import os
os.environ["WANDB_DISABLED"] = "true"


In [None]:
# --- 2. CONFIGURATION ---
# This is the only section you need to change between experiments.

# --- EXPERIMENT A: Fine-tune the Generalist (RoBERTa-NLI) ---
# MODEL_CHECKPOINT = "FacebookAI/roberta-large-mnli"
# OUTPUT_DIR = "./results/roberta_nli_finetuned"

# --- EXPERIMENT B: Fine-tune the Specialist (CodeBERT) ---
MODEL_CHECKPOINT = "microsoft/codebert-base"
OUTPUT_DIR = "./results/codebert_finetuned"

# # --- EXPERIMENT C: Fine-tune the Specialist (distilroberta) ---
# MODEL_CHECKPOINT = "j-hartmann/emotion-english-distioberta-base"
# OUTPUT_DIR = "./results/roberta_finetuned"

# --- Universal Parameters ---
DATA_FILE = "/kaggle/input/emollm/augmented_training_data_final.csv" # The path to your 2k labeled dataset
TEXT_COLUMN = "message"
LABEL_COLUMN = "emotion"
TEST_SIZE = 0.2
RANDOM_STATE = 42

In [172]:
# --- 3. DATA PREPARATION ---
print("--- Step 1: Loading and Preparing Data ---")

# Load your dataset
try:
    aug_df = pd.read_csv(DATA_FILE)
    print(f"Dataset loaded successfully. Shape: {df.shape}")
except FileNotFoundError:
    print(f"FATAL ERROR: Data file not found at '{DATA_FILE}'. Exiting.")
    exit()

--- Step 1: Loading and Preparing Data ---
Dataset loaded successfully. Shape: (4796, 3)


In [173]:
aug_df.rename(columns = {'reconciled_emotion': 'emotion', 'reconciled_intensity': 'intensity'}, inplace=True)

In [174]:
aug_df.columns

Index(['message', 'emotion'], dtype='object')

In [175]:
print("--- Enforcing data hygiene ---")
# 1. Check for NaN values before cleaning
print(f"Found {aug_df[TEXT_COLUMN].isna().sum()} NaN values in '{TEXT_COLUMN}' column.")
# 2. Convert the entire column to string type to prevent TypeErrors
aug_df[TEXT_COLUMN] = aug_df[TEXT_COLUMN].astype(str)
# 3. Drop any rows where the message is just whitespace or empty
aug_df = aug_df[aug_df[TEXT_COLUMN].str.strip().astype(bool)]
print(f"Shape after cleaning empty/NaN rows: {df.shape}")

--- Enforcing data hygiene ---
Found 24 NaN values in 'message' column.
Shape after cleaning empty/NaN rows: (4796, 3)


In [176]:
# Create the label mappings
# The model needs integer labels, not strings.
labels = aug_df[LABEL_COLUMN].unique().tolist()
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}
NUM_LABELS = len(labels)

print(f"Label mapping created: {label2id}")

# Convert string labels to integer IDs
aug_df['labels'] = aug_df[LABEL_COLUMN].map(label2id)

Label mapping created: {'frustration': 0, 'satisfaction': 1, 'neutral': 2, 'caution': 3}


In [177]:
# Split the data into training and validation sets
train_df, val_df = train_test_split(
    aug_df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=df['labels'] # Ensures balanced classes in splits
)

In [178]:
original_df = pd.read_csv("/kaggle/input/emollm/labeled_2k.csv")
original_df['labels'] = original_df['reconciled_emotion'].map(label2id)
_, test_df = train_test_split(
    original_df, test_size=0.2, random_state=RANDOM_STATE, stratify=original_df['labels']
)


In [179]:
# Convert pandas DataFrames to Hugging Face Dataset objects
raw_datasets = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(val_df)
})

print(f"Data split into training and validation sets.")
print(f"Training set size: {len(raw_datasets['train'])}")
print(f"Validation set size: {len(raw_datasets['validation'])}")

Data split into training and validation sets.
Training set size: 3836
Validation set size: 960


In [180]:
# --- 4. TOKENIZATION ---
print("\n--- Step 2: Tokenizing Data ---")

# Load the tokenizer for the specific model we are fine-tuning
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Define the tokenization function
def tokenize_function(examples):
    return tokenizer(examples[TEXT_COLUMN], truncation=True, padding=False)

# Apply the tokenization to the entire dataset
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Data collator will handle dynamic padding during batch creation.
# This is more efficient than padding all examples to the max length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("Tokenization complete.")


--- Step 2: Tokenizing Data ---


Map:   0%|          | 0/3836 [00:00<?, ? examples/s]

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

Tokenization complete.


In [181]:
# --- 5. MODEL AND TRAINER SETUP ---
print("\n--- Step 3: Loading Model and Defining Training Arguments ---")
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_df['labels']),
    y=train_df['labels'].to_numpy()
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to("cuda")
print(f"Calculated class weights: {class_weights}")


--- Step 3: Loading Model and Defining Training Arguments ---
Calculated class weights: tensor([0.9917, 0.9293, 0.7243, 1.8694], device='cuda:0')


In [182]:
class CustomTrainer(Trainer):
    # THE FIX: Add **kwargs to the function signature to accept and ignore
    # any unexpected arguments passed by the Trainer.
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Apply our pre-calculated class weights
        loss_fct = torch.nn.CrossEntropyLoss(
            weight=class_weights
        )
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


In [183]:
print("\n--- Step 4: Loading Model and Defining Training Arguments ---")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# Define the evaluation metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="macro")
    return {"accuracy": accuracy, "f1_macro": f1}


# Define the training arguments
# These are crucial hyperparameters for your experiment.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=8,                 # A reasonable starting point
    per_device_train_batch_size=8,      # Adjust based on GPU memory
    per_device_eval_batch_size=8,       # Adjust based on GPU memory
    learning_rate=2e-5,                 # A standard learning rate for fine-tuning
    weight_decay=0.01,
    eval_strategy="epoch",        # Evaluate at the end of each epoch
    save_strategy="epoch",              # Save a checkpoint at the end of each epoch
    load_best_model_at_end=True,        # The final model will be the best one found
    metric_for_best_model="f1_macro",   # Use F1 score to determine the "best" model
    push_to_hub=False,                  # Set to True to upload to Hugging Face Hub
    logging_steps=10,
    report_to="none",
    save_total_limit=1,
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2 # The number of epochs to wait for improvement
)


# Initialize the Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

print("Model and Trainer initialized. Ready to fine-tune.")


--- Step 4: Loading Model and Defining Training Arguments ---


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and Trainer initialized. Ready to fine-tune.


  trainer = CustomTrainer(


In [None]:
# --- 6. EXECUTE FINE-TUNING ---
print("\n--- Step 4: Starting Fine-Tuning ---")

# This command starts the training process.
# It will print progress and evaluation metrics.
trainer.train()

print("\n--- Fine-Tuning Complete ---")


--- Step 4: Starting Fine-Tuning ---


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.7286,0.854702,0.621875,0.651912


In [None]:
# --- 7. FINAL EVALUATION ---
print("\n--- Step 5: Final Evaluation on Validation Set ---")

eval_results = trainer.evaluate()
print("Final evaluation results:")
print(eval_results)

# Save the final model and tokenizer
trainer.save_model(f"{OUTPUT_DIR}/final_model")
print(f"Final model saved to {OUTPUT_DIR}/final_model")

In [None]:
test_dataset = Dataset.from_pandas(test_df)
test_dataset_tokenized = test_dataset.map(tokenize_function, batched=True)


In [None]:
predictions = trainer.predict(test_dataset_tokenized)
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = test_dataset_tokenized['labels']
print("--- Generating Confusion Matrix and Classification Report ---")
class_names = list(label2id.keys())

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(true_labels, predicted_labels)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Emotion')
plt.ylabel('Manually Labeled Emotion')
plt.title('Fine-Tuned CodeBERT Performance')
plt.show()


In [None]:
report = classification_report(true_labels, predicted_labels, target_names=class_names)
print("\nClassification Report:\n")
print(report)