# Fine-tuning albert-base-v2 with LoRA

Upload a CSV named `dataset.csv` with columns `text,label` and set `DATASET_PATH` accordingly.

In [None]:

# If on Colab, uncomment:
# !pip install -U transformers datasets peft accelerate evaluate scikit-learn matplotlib --quiet

import os, numpy as np, pandas as pd, evaluate, matplotlib.pyplot as plt
from datasets import load_dataset, Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
import numpy as np

# --- Configuration ---------------------------------------------------------
DATASET_PATH = "/content/dataset.csv"  # CSV with columns: text,label
NUM_LABELS = None  # set automatically from data
MODEL_NAME = "albert-base-v2"
OUTPUT_DIR = "outputs_albert_lora"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Load and validate the dataset -----------------------------------------
df = pd.read_csv(DATASET_PATH)

# Raise explicitly instead of using `assert`, which is stripped under
# `python -O` and gives no hint about which column is missing.
missing_cols = {"text", "label"} - set(df.columns)
if missing_cols:
    raise ValueError(
        f"{DATASET_PATH} is missing required column(s): {sorted(missing_cols)}"
    )

# Drop incomplete rows: a missing label would otherwise be stringified into a
# spurious "nan" class below, and a missing text cannot be tokenized.
df = df.dropna(subset=["text", "label"])
# The tokenizer expects strings; numeric-looking cells are parsed as numbers.
df["text"] = df["text"].astype(str)

# --- Label vocabulary --------------------------------------------------------
# Labels are compared as strings and sorted so that the label -> id mapping is
# deterministic for a given set of label values.
labels = sorted(df["label"].astype(str).unique().tolist())
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for name, idx in label2id.items()}
NUM_LABELS = len(labels)
df["label_id"] = df["label"].astype(str).map(label2id)

# Split: hold out 20% of all rows for testing, then 20% of what remains for
# validation. Both splits are stratified on the integer label and seeded.
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label_id"],
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df["label_id"],
)

# HF Dataset: keep only the two columns the tokenization step reads.
train_ds, val_ds, test_ds = [
    Dataset.from_pandas(frame[["text", "label_id"]])
    for frame in (train_df, val_df, test_df)
]

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)


def preprocess(ex):
    """Tokenize a batch of examples and attach the labels.

    ``ex`` is a batch mapping (the function is applied with ``batched=True``)
    holding a ``"text"`` and a ``"label_id"`` column. Texts are truncated to
    at most 256 tokens; no padding is applied here. The returned encoding
    carries the label ids under the ``"labels"`` key.
    """
    encoded = tokenizer(ex["text"], truncation=True, max_length=256)
    encoded["labels"] = ex["label_id"]
    return encoded


# Tokenize every split, dropping all of the original columns so only the
# tokenizer output and "labels" remain.
train_ds, val_ds, test_ds = [
    split.map(preprocess, batched=True, remove_columns=split.column_names)
    for split in (train_ds, val_ds, test_ds)
]

# Padding is left to the collator, which is handed to the Trainer.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Sequence-classification head on top of the pretrained encoder, with the
# label mappings stored in the model config.
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_LABELS, id2label=id2label, label2id=label2id
)

# LoRA adapters on the attention projections and the feed-forward layers.
# ALBERT names its feed-forward linears `ffn` / `ffn_output`; it has no
# `fc1` / `fc2` modules, so those names never matched anything and the
# feed-forward layers were silently left without adapters.
# NOTE: these names are ALBERT-specific; revisit them if MODEL_NAME changes.
lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "key", "value", "dense", "ffn", "ffn_output"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS,
)
model = get_peft_model(base_model, lora_cfg)
model.print_trainable_parameters()

# NOTE(review): these two `evaluate` metrics are loaded but not referenced by
# compute_metrics below, which relies on scikit-learn instead.
metric_f1 = evaluate.load("f1")
metric_acc = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return weighted F1 and accuracy for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the argmax over the last axis of ``logits``. The result is a dict with
    the keys ``"f1"`` and ``"accuracy"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {
        "f1": f1_score(labels, predicted, average="weighted"),
        "accuracy": accuracy_score(labels, predicted),
    }

import inspect

import torch

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases, and the old spelling is rejected once removed. Pick whichever
# keyword the installed version accepts so the cell runs on both.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_kwarg = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"

args = TrainingArguments(
    OUTPUT_DIR,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=50,
    # Mixed precision needs a CUDA device; requesting it on a CPU-only
    # runtime makes TrainingArguments raise, so enable it only when available.
    fp16=torch.cuda.is_available(),
    **{_eval_kwarg: "epoch"},
)

# Likewise, Trainer's `tokenizer` argument is deprecated in favour of
# `processing_class` in newer releases; use the name that is supported.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tok_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collator,
    compute_metrics=compute_metrics,
    **{_tok_kwarg: tokenizer},
)

trainer.train()
# Final evaluation on the held-out test split (validation is used per epoch).
eval_res = trainer.evaluate(test_ds)
print("Test metrics:", eval_res)

# Confusion matrix on the test split; rows are true classes, columns are
# predicted classes, both ordered by label id.
preds = np.argmax(trainer.predict(test_ds).predictions, axis=-1)
cm = confusion_matrix(test_ds["labels"], preds, labels=list(range(NUM_LABELS)))
plt.figure()
plt.imshow(cm, interpolation='nearest')
plt.title('Confusion Matrix'); plt.colorbar()
# Show class names on the axes instead of bare indices.
class_names = [id2label[i] for i in range(NUM_LABELS)]
plt.xticks(range(NUM_LABELS), class_names, rotation=45, ha="right")
plt.yticks(range(NUM_LABELS), class_names)
plt.ylabel('True'); plt.xlabel('Pred')
# tight_layout must run after the labels and ticks are set; calling it earlier
# ignores them and they can end up clipped in the saved image.
plt.tight_layout()
cm_path = os.path.join(OUTPUT_DIR, "confusion_matrix.png")
plt.savefig(cm_path, dpi=150)

# Save final metrics as a small text report
with open(os.path.join(OUTPUT_DIR, "report.txt"), "w", encoding="utf-8") as f:
    for k, v in eval_res.items():
        f.write(f"{k}: {v}\n")
print("Saved report and confusion matrix to", OUTPUT_DIR)
