In [None]:
import os
import sys
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_from_disk
from transformers import AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from seqeval.metrics import f1_score as ner_f1_score
from seqeval.scheme import IOB2 # For entity evaluation

In [None]:
# Make the project's custom model code importable from this notebook.
# The directory is resolved against the current working directory, so the
# notebook is expected to run from a folder whose parent contains `src/`.
module_path = os.path.abspath(os.path.join(os.pardir, "src", "schedulebot", "nlu"))

# Register the directory only once so that re-running this cell does not
# keep appending duplicates to sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

# `multitask_model` is looked up through the sys.path entry registered above.
from multitask_model import MultitaskModel

## Setup

In [None]:
# --- Login and Configuration ---
# Load environment variables (via python-dotenv) before reading them below.
load_dotenv()

# Credentials and target repository are both taken from the environment;
# either value is None when the corresponding variable is not set.
hf_token = os.environ.get("HF_TOKEN")
hub_model_id = os.environ.get("HUB_MODEL_ID")

# Authenticate against the Hugging Face Hub with the token read above.
login(token=hf_token)

In [None]:
# --- Load Tokenizer ---
# Checkpoint name; reused further down to load the matching model configuration.
model_name = "distilbert-base-uncased"
# Tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# --- Load Processed Dataset ---
# The project root is taken to be the parent of the current working directory.
project_root = str(Path.cwd().resolve().parent)
dataset_dir = os.path.join(project_root, "data", "processed")

# Load the dataset stored under <project_root>/data/processed/hasd_processed
# and print its summary.
hasd_processed_path = os.path.join(dataset_dir, "hasd_processed")
processed_datasets = load_from_disk(hasd_processed_path)
print(processed_datasets)

## Custom Metrics Function
This function is essential for a multitask model. The `Trainer` calls it after every evaluation pass (once per epoch with the settings used below) to compute intent accuracy, weighted intent F1, and entity-level NER F1.

In [None]:
def compute_metrics(eval_pred):
    """Compute intent and NER metrics for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, label_values)``. ``predictions``
            unpacks into ``(intent_logits, ner_logits)`` and ``label_values``
            into ``(intent_labels, ner_labels)``.
            NOTE(review): the argmax axes below imply intent logits shaped
            (batch, num_intents) and NER logits shaped
            (batch, seq_len, num_tags) -- confirm against the model outputs.

    Returns:
        dict with the keys ``"intent_accuracy"``, ``"intent_f1"`` (weighted
        average) and ``"ner_f1"`` (seqeval, strict mode, IOB2 scheme).
    """
    (intent_logits, ner_logits), (intent_labels, ner_labels) = eval_pred

    # --- Intent metrics: highest-scoring class per example ---
    intent_ids = np.argmax(intent_logits, axis=1)
    metrics = {
        "intent_accuracy": accuracy_score(intent_labels, intent_ids),
        "intent_f1": f1_score(intent_labels, intent_ids, average='weighted'),
    }

    # --- NER metrics: highest-scoring tag per token ---
    ner_ids = np.argmax(ner_logits, axis=2)
    # Tag names indexed by tag id, read from the training split's features.
    tag_names = processed_datasets['train'].features['labels'].feature.names

    gold_tags, predicted_tags = [], []
    for label_row, pred_row in zip(ner_labels, ner_ids):
        # Positions labelled -100 are excluded from scoring.
        kept = [(gold, pred) for gold, pred in zip(label_row, pred_row) if gold != -100]
        gold_tags.append([tag_names[gold] for gold, _ in kept])
        predicted_tags.append([tag_names[pred] for _, pred in kept])

    metrics["ner_f1"] = ner_f1_score(gold_tags, predicted_tags, mode='strict', scheme=IOB2)
    return metrics

## Instantiate the model
We now create an instance of our `MultitaskModel`, passing it a configuration object that includes the number of labels for each head.

In [None]:
# --- Create Label Mappings Directly from Data ---

# 1. Intent Labels
# Class names are read from the `intent_label` feature of the training split;
# a name's position in that list is used as its integer id.
intent_label_list = processed_datasets['train'].features['intent_label'].names
id2intent = dict(enumerate(intent_label_list))
intent2id = {label: idx for idx, label in id2intent.items()}

print("--- Intent Vocab ---")
print(f"Number of intents: {len(id2intent)}")
print(f"Mapping (id2intent): {id2intent}")
print(f"Mapping (intent2id): {intent2id}\n")


# 2. NER Labels
# Tag names are read from the per-token `labels` feature of the training split,
# again using list position as the id.
ner_label_list = processed_datasets['train'].features['labels'].feature.names
id2ner = dict(enumerate(ner_label_list))
ner2id = {label: idx for idx, label in id2ner.items()}

print("--- NER Vocab ---")
print(f"Number of NER tags: {len(id2ner)}")
print(f"Mapping (id2ner): {id2ner}")
print(f"Mapping (ner2id): {ner2id}")

In [None]:
# Start from the configuration that ships with the pretrained checkpoint.
config = AutoConfig.from_pretrained(model_name)

# Attach the task-specific settings as extra attributes on the config object
# so that they travel with the configuration when it is saved.
custom_config_fields = {
    "num_intent_labels": len(id2intent),
    "num_ner_labels": len(id2ner),
    "id2label_intent": id2intent,
    "label2id_intent": intent2id,
    "id2label_ner": id2ner,
    "label2id_ner": ner2id,
}
for field_name, field_value in custom_config_fields.items():
    setattr(config, field_name, field_value)

# NOTE(review): the model is built from the config alone; whether pretrained
# encoder weights are loaded depends on MultitaskModel's constructor -- confirm.
model = MultitaskModel(config)

## Training
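We now define the `TrainingArguments` and create a `Trainer` instance to handle the training loop. In this first stage the base transformer is frozen; it is unfrozen again for the fine-tuning stage further down.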

In [None]:
# --- FREEZE THE BASE MODEL ---
# Disable gradient tracking on every parameter of the base transformer.
for weight in model.transformer.parameters():
    weight.requires_grad_(False)

In [None]:
# Collator that pads the batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Stage-1 output goes to <project_root>/models/multitask_model/training.
stage1_output_dir = os.path.join(project_root, "models", "multitask_model", "training")

# Training arguments for the first stage
training_args = TrainingArguments(
    # --- Output ---
    output_dir=stage1_output_dir,
    overwrite_output_dir=True,
    # --- Optimization ---
    num_train_epochs=200,
    learning_rate=1e-4,
    weight_decay=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # --- Logging and evaluation once per epoch ---
    logging_strategy="epoch",
    eval_strategy="epoch",
    report_to="tensorboard",
    # --- Checkpoint selection: track the best eval_loss and reload it at the end ---
    save_strategy="best",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # --- Hub Arguments ---
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="end",
    hub_token=hf_token,
)

# Early stopping with a patience of 10 evaluations.
stage1_early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

# NOTE(review): compute_metrics unpacks the labels as (intent, NER); confirm
# that the Trainer hands the label columns over in that order.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[stage1_early_stopping],
)

In [None]:
# Stage 1: run the training loop with the Trainer configured above
# (the base transformer's parameters were frozen in an earlier cell).
trainer.train()

In [None]:
# Collect the stage-1 Trainer log history into a DataFrame
df = pd.DataFrame(trainer.state.log_history)

# Keep, for each curve, only the log rows in which that loss is present.
train_df = df.loc[df["loss"].notna(), ['epoch', 'loss']]
eval_df = df.loc[df["eval_loss"].notna(), ['epoch', 'eval_loss']]

# --- Plot the loss curves ---
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))

# Training loss (solid) and validation loss (dashed), both against the epoch
ax.plot(train_df["epoch"], train_df["loss"], label="Training Loss", color="dodgerblue")
ax.plot(eval_df["epoch"], eval_df["eval_loss"], label="Validation Loss", color="darkorange", linestyle='--')

# --- Customize and show the plot ---
ax.set_title("Training & Validation Loss Curves", fontsize=16)
# The x values come from the logged `epoch` column, so the axis is labelled
# "Epoch" (it previously read "Steps", which did not match the data).
ax.set_xlabel("Epoch", fontsize=12)
ax.set_ylabel("Loss", fontsize=12)
ax.legend(fontsize=12)
ax.grid(True)
plt.tight_layout()
plt.show()

## Fine-tuning

In [None]:
# --- UNFREEZE THE BASE MODEL ---
# Re-enable gradient tracking on every parameter of the base transformer.
for weight in model.transformer.parameters():
    weight.requires_grad_(True)

In [None]:
# Stage-2 output goes to <project_root>/models/multitask_model/finetuning.
stage2_output_dir = os.path.join(project_root, "models", "multitask_model", "finetuning")

# Training arguments for the fine-tuning stage
training_args_stage2 = TrainingArguments(
    # --- Output ---
    output_dir=stage2_output_dir,
    overwrite_output_dir=True,
    # --- Optimization ---
    num_train_epochs=50,
    learning_rate=1e-6,
    weight_decay=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # --- Logging and evaluation once per epoch ---
    logging_strategy="epoch",
    eval_strategy="epoch",
    report_to="tensorboard",
    # --- Checkpoint selection: track the best eval_loss and reload it at the end ---
    save_strategy="best",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # --- Hub Arguments ---
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="end",
    hub_token=hf_token,
)

# Early stopping with a patience of 5 evaluations.
stage2_early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

# A separate Trainer for stage 2; it receives the same `model` object as stage 1.
trainer_stage2 = Trainer(
    model=model,
    args=training_args_stage2,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[stage2_early_stopping],
)

In [None]:
# Stage 2: run the fine-tuning loop with the stage-2 Trainer configured above
# (the base transformer's parameters were unfrozen in an earlier cell).
trainer_stage2.train()

In [None]:
# Collect the stage-2 Trainer log history into a DataFrame
df = pd.DataFrame(trainer_stage2.state.log_history)

# Keep, for each curve, only the log rows in which that loss is present.
train_df = df.loc[df["loss"].notna(), ['epoch', 'loss']]
eval_df = df.loc[df["eval_loss"].notna(), ['epoch', 'eval_loss']]

# --- Plot the loss curves ---
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))

# Training loss (solid) and validation loss (dashed), both against the epoch
ax.plot(train_df["epoch"], train_df["loss"], label="Training Loss", color="dodgerblue")
ax.plot(eval_df["epoch"], eval_df["eval_loss"], label="Validation Loss", color="darkorange", linestyle='--')

# --- Customize and show the plot ---
ax.set_title("Training & Validation Loss Curves", fontsize=16)
# The x values come from the logged `epoch` column, so the axis is labelled
# "Epoch" (it previously read "Steps", which did not match the data).
ax.set_xlabel("Epoch", fontsize=12)
ax.set_ylabel("Loss", fontsize=12)
ax.legend(fontsize=12)
ax.grid(True)
plt.tight_layout()
plt.show()

## Evaluation

In [None]:
# Run the stage-2 trainer on the test split
test_predictions = trainer_stage2.predict(processed_datasets["test"])

# Predictions and labels each unpack into an (intent, NER) pair
intent_preds_logits, ner_preds_logits = test_predictions.predictions
intent_true_labels, ner_true_labels = test_predictions.label_ids

# Highest logit -> predicted class id (per example for intents, per token for NER)
intent_pred_labels = intent_preds_logits.argmax(axis=1)
ner_pred_labels = ner_preds_logits.argmax(axis=2)

In [None]:
# --- Intent Classification Report and Confusion Matrix ---

print("\n--- Intent Classification Report ---")

# Human-readable intent names, indexed by class id
intent_names = processed_datasets['test'].features['intent_label'].names

# Pin the label set to every known intent id. Without this, a class that is
# absent from both the test labels and the predictions makes the report's
# class count disagree with `target_names`, and shrinks the confusion matrix
# so that it no longer lines up with the tick labels below.
intent_label_ids = list(range(len(intent_names)))

# Per-class precision / recall / F1; classes without samples score 0.
intent_report = classification_report(
    intent_true_labels,
    intent_pred_labels,
    labels=intent_label_ids,
    target_names=intent_names,
    digits=4,
    zero_division=0,
)
print(intent_report)

# Confusion matrix with rows = true intent, columns = predicted intent
intent_cm = confusion_matrix(intent_true_labels, intent_pred_labels, labels=intent_label_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(intent_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=intent_names, yticklabels=intent_names)
plt.xlabel('Predicted Intent')
plt.ylabel('True Intent')
plt.title('Intent Classification Confusion Matrix')
plt.show()

In [None]:
# --- NER Report and Confusion Matrix ---

print("\n--- NER (Token Classification) Report ---")

# Human-readable NER tag names, indexed by tag id
ner_tag_names = processed_datasets['test'].features['labels'].feature.names

# Keep only the positions whose true label is not -100. Boolean-mask indexing
# walks the arrays row by row, so true and predicted tags stay aligned.
keep_mask = np.asarray(ner_true_labels) != -100
true_ner_tags_flat = [ner_tag_names[tag_id] for tag_id in np.asarray(ner_true_labels)[keep_mask]]
pred_ner_tags_flat = [ner_tag_names[tag_id] for tag_id in np.asarray(ner_pred_labels)[keep_mask]]

# Token-level classification report
ner_report = classification_report(true_ner_tags_flat, pred_ner_tags_flat, digits=4)
print(ner_report)

# Token-level confusion matrix, rows and columns ordered like `ner_tag_names`.
# Note: this matrix can be large if there are many entity types.
ner_cm = confusion_matrix(true_ner_tags_flat, pred_ner_tags_flat, labels=ner_tag_names)
plt.figure(figsize=(12, 10))
sns.heatmap(ner_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=ner_tag_names, yticklabels=ner_tag_names)
plt.xlabel('Predicted NER Tag')
plt.ylabel('True NER Tag')
plt.title('NER (Token) Confusion Matrix')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Upload through the stage-2 trainer, then report the target repository.
print("\n--- Pushing final model to Hub ---")
trainer_stage2.push_to_hub()
# NOTE(review): the URL is built from HUB_MODEL_ID as read from the environment;
# it reads ".../None" when that variable is unset -- confirm it is always set.
print(f"✅ Model successfully pushed to https://huggingface.co/{hub_model_id}")