<a href="https://colab.research.google.com/github/agataskrzyniarz1/chatbot-intent-detection-lora-finetuning/blob/main/lora_intent_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, pipeline, EarlyStoppingCallback
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from google.colab import drive
import matplotlib.pyplot as plt
import seaborn as sns
from huggingface_hub import login

In [None]:
#drive.mount('/content/drive')

# Prepare the dataset

In [None]:
# Load the synthetic intent-classification dataset by its Hugging Face Hub id.
dataset = load_dataset("tanaos/synthetic-intent-classifier-dataset-v1")

In [None]:
# Show what was loaded before any re-splitting.
print(dataset)

In [None]:
# Carve validation and test sets out of the original train split:
# first hold out 20%, then cut that hold-out in half.
split1 = dataset["train"].train_test_split(test_size=0.2, seed=42)  # 80% train / 20% held out
split2 = split1["test"].train_test_split(test_size=0.5, seed=42)    # held-out part -> 10% / 10%

datasets_final = DatasetDict(
    train=split1["train"],
    validation=split2["train"],
    test=split2["test"],
)

print(datasets_final)

In [None]:
# Peek at the first 10 rows of the original train split.
dataset["train"][:10]

In [None]:
# Inspect the feature definition of the "labels" column.
datasets_final["train"].features["labels"]

In [None]:
# Intent categories, listed in label-id order (list index == label id).
intent_names = [
    "greeting",
    "farewell",
    "thank_you",
    "affirmation",
    "negation",
    "small_talk",
    "bot_capabilities",
    "feedback_positive",
    "feedback_negative",
    "clarification",
    "suggestion",
    "language_change",
]

# id -> name map and its inverse, name -> id.
id2label = dict(enumerate(intent_names))
label2id = {name: idx for idx, name in id2label.items()}


# Load the base model

In [None]:
# Use the GPU when one is available; fall back to CPU so the notebook still
# runs on runtimes without CUDA instead of failing at `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Derive the class count from the label map so the two cannot drift apart.
num_labels = len(id2label)

# Base encoder with a classification head sized to the number of intents.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,  # adding label categories
).to(device)

In [None]:
# sanity check: print one raw training example with its label id and name
i = 0
ex = datasets_final["train"][i]
label_id = ex["labels"]

print("TEXT:", ex["text"])
print("LABEL ID:", label_id)
print("LABEL NAME:", id2label[label_id])

# Tokenization

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: Batch mapping supplied by `map(..., batched=True)`; its
            'text' entry is what gets tokenized.

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens.
        Sequences are deliberately not padded here: `DataCollatorWithPadding`
        pads each batch to its longest sequence at collation time, so padding
        every example to the full 128 tokens up front would only waste
        memory and compute.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=128
    )

# Tokenize every split; the raw 'text' column is dropped afterwards.
tokenized_datasets = datasets_final.map(
    preprocess_function, batched=True, remove_columns=['text']
)


In [None]:
# Display the tokenized splits and their columns.
tokenized_datasets

In [None]:
# sanity check: print the tokenized form of the first training example
i = 0
example = tokenized_datasets["train"][i]
print(example)


# LoRA configuration

In [None]:
# LoRA adapter settings for the sequence-classification task.
peft_config = LoraConfig(
    r=16,              # adapter rank
    lora_alpha=32,     # LoRA scaling parameter
    lora_dropout=0.1,  # dropout used by the LoRA layers
    bias="none",
    task_type=TaskType.SEQ_CLS,
)

# Wrap the base model with the adapter and report the trainable-parameter count.
# NOTE: this rebinds `model`, so re-running the cell without reloading the base
# model would wrap an already-wrapped model.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


# Training arguments

In [None]:
training_args = TrainingArguments(
    # checkpoint location; keep at most two checkpoints on disk
    output_dir="./checkpoints/intent-lora",
    save_total_limit=2,
    # evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    # optimisation settings
    num_train_epochs=8,
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # reload the checkpoint that scored best on "accuracy" once training ends
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # logging
    logging_steps=50,
    report_to="none",
)


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into `(logits, labels)`.

    Returns:
        A dict with the single key "accuracy".
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted_ids)
    return {"accuracy": accuracy}


# Data collator and trainer

In [None]:
# Collator that pads the examples of each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Early stopping with a patience of 2 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)




# Train the model

In [None]:
# Fine-tune, then write the resulting model and the tokenizer to the same folder.
trainer.train()

best_model_dir = "./best_model/intent-lora"
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)


In [None]:
# save the model on drive
#!cp -r ./best_model/intent-lora /content/drive/MyDrive/intent_detection_fine_tuning/

### Quick evaluation

In [None]:
# Classify a single hand-written utterance with the fine-tuned model.
model.eval()  # switch to evaluation mode
text = "do you speak chinese?"
inputs = tokenizer(text, return_tensors="pt").to(device)

with torch.no_grad():  # no gradients needed for inference
    outputs = model(**inputs)

# The index of the largest logit is the predicted label id.
pred_id = outputs.logits.argmax(dim=-1).item()
pred_label = id2label[pred_id]

print("Predicted label:", pred_label)


# Test set evaluation

In [None]:
# categories, ordered by label id and derived from the shared label map so the
# report and the plot cannot drift from the labels the model was trained with
label_ids = sorted(id2label)
label_names = [id2label[label_id] for label_id in label_ids]

# test set predictions
preds_output = trainer.predict(tokenized_datasets['test'])

y_pred = np.argmax(preds_output.predictions, axis=1)
y_true = preds_output.label_ids

# classification report
# `labels` is passed explicitly so every class gets a row and stays aligned
# with `target_names`, even if a class is missing from both the test split and
# the predictions (otherwise sklearn raises a size-mismatch error).
print("Classification Report:")
print(classification_report(y_true, y_pred, labels=label_ids, target_names=label_names))

# confusion matrix over the full, fixed label set so its rows/columns always
# match the tick labels below
cm = confusion_matrix(y_true, y_pred, labels=label_ids)

plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=label_names, yticklabels=label_names, cmap="Blues")
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.title("Confusion Matrix")
plt.show()


# Push to Hugging Face Hub

In [None]:
# Authenticate with the Hugging Face Hub (uses `login` from huggingface_hub).
login()

### Merge the base model with the LoRA adapter

In [None]:
# load the base model
# Reuse the checkpoint name and label map from the training setup rather than
# repeating the literals, so the merged model always matches what was trained.
model_base = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
).to(device)

# load the adapter saved after training
adapter = PeftModel.from_pretrained(model_base, "./best_model/intent-lora")

# merge base model and adapter into a single standalone model
model_merged = adapter.merge_and_unload()

In [None]:
# push to hub
# Authentication relies on the token stored by `login()`; the deprecated
# `use_auth_token` argument is dropped so both pushes authenticate the same way.
hub_repo_id = "agataskrzyniarz/intent-detection-chatbot"
model_merged.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)