In [None]:
%pip install -q transformers accelerate peft bitsandbytes datasets rouge-score huggingface_hub matplotlib textblob

from huggingface_hub import notebook_login
notebook_login()

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')  

from textblob import download_corpora
download_corpora.download_all()



In [None]:
# === Core Python ===
import os
import time
from collections import defaultdict

# === Data Handling ===
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# === NLP / Metrics ===
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from textblob import TextBlob

# === Deep Learning ===
import torch

# === HuggingFace Datasets ===
from datasets import load_dataset

# === Transformers & Training ===
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    TrainerCallback
)

# === PEFT (Parameter-Efficient Fine-Tuning) ===
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PromptTuningConfig
)


# Data preprocessing

In [None]:
# Load the raw support tweets. IDs are read as strings so a reply's
# `in_response_to_tweet_id` can be matched exactly against `tweet_id`.
df = pd.read_csv("data/support_data.csv", dtype={
    "tweet_id": str,
    "in_response_to_tweet_id": str
})

# Keep only rows that have text, their own ID, and a parent tweet ID.
df = df.dropna(subset=["text", "in_response_to_tweet_id", "tweet_id"])

df["tweet_id"] = df["tweet_id"].astype(str)
df["in_response_to_tweet_id"] = df["in_response_to_tweet_id"].astype(str)

# tweet_id -> text / inbound flag, used to look up the tweet a reply points at.
msg_lookup = df.set_index("tweet_id")["text"].to_dict()
inbound_lookup = df.set_index("tweet_id")["inbound"].to_dict()

# Pair every outbound tweet with the inbound tweet it answers.
# The three needed columns are zipped directly instead of using
# DataFrame.iterrows(), which builds a Series object for every row.
pairs = []
for in_response_to, is_inbound, reply_text in zip(
    df["in_response_to_tweet_id"], df["inbound"], df["text"]
):
    if in_response_to not in msg_lookup:
        continue

    # Explicit comparisons are kept on purpose: only a flag equal to False
    # (reply) whose parent flag equals True (customer) forms a pair.
    if is_inbound == False and inbound_lookup.get(in_response_to) == True:
        pairs.append({"input": msg_lookup[in_response_to], "output": reply_text})

print(f"✅ Collected {len(pairs)} input-output pairs")


# Shuffle deterministically and cap the working set at 25,000 pairs.
df_clean = pd.DataFrame(pairs).sample(frac=1.0, random_state=42).iloc[:25000]

# 80 / 10 / 10 split: hold out 20 %, then halve it into validation and test.
train_df, temp_df = train_test_split(df_clean, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Persist each split as JSON lines (one record per line).
os.makedirs("data", exist_ok=True)
train_df.to_json("data/train.json", orient="records", lines=True)
val_df.to_json("data/val.json", orient="records", lines=True)
test_df.to_json("data/test.json", orient="records", lines=True)

print(f"✅ Saved: {len(train_df)} train, {len(val_df)} val, {len(test_df)} test")


# Model setup

In [None]:
# NOTE(review): the un-versioned ID "mistralai/Mistral-7B-Instruct" does not appear
# to exist on the Hugging Face Hub; the instruct checkpoints are published with a
# version suffix. Pinned to v0.2 here — change the suffix if another release was meant.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 weights with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # The training cells pad their batches, which fails without a pad token.
    # Prefer the unknown token over EOS: the language-modeling collator excludes
    # pad positions from the loss, so reusing EOS as padding would also hide
    # every end-of-sequence target from training.
    tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)


# inference logic and engine

In [None]:

# Substrings counted by score_priority(); each one found in a message adds
# one point to that message's priority score.
keywords = [
    "cancel",
    "refund",
    "charge",
    "billing",
    "dispute",
    "issue",
    "angry",
    "unacceptable",
    "lawsuit",
    "legal",
    "scam",
    "fraud",
    "complaint",
    "escalate",
    "speak to manager",
    "terrible",
    "disappointed",
]

def score_priority(user_input, keyword_list=None):
    """Rate a customer message as "HIGH", "MEDIUM" or "LOW" priority.

    The score is the number of entries of ``keyword_list`` that occur as
    case-insensitive substrings of ``user_input``: three or more hits give
    "HIGH", exactly two give "MEDIUM", anything else gives "LOW".

    Args:
        user_input: The customer message. ``None`` is treated as an empty message.
        keyword_list: Optional iterable of lowercase substrings to look for.
            Defaults to the module-level ``keywords`` list.

    Returns:
        One of the strings "HIGH", "MEDIUM" or "LOW".
    """
    if keyword_list is None:
        keyword_list = keywords

    # Lowercase once instead of once per keyword.
    text = (user_input or "").lower()
    score = sum(word in text for word in keyword_list)

    if score >= 3:
        return "HIGH"
    if score == 2:
        return "MEDIUM"
    return "LOW"

def analyze_sentiment(text):
    """Bucket the TextBlob polarity of ``text`` into a coarse sentiment label.

    Returns "POSITIVE" when the polarity is above 0.3, "NEGATIVE" when it is
    below -0.3, and "NEUTRAL" for everything in between (bounds included).
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0.3:
        return "POSITIVE"
    if score < -0.3:
        return "NEGATIVE"
    return "NEUTRAL"

def generate_response(user_input, model, tokenizer):
    """Generate an agent reply for one customer message.

    The message is tagged with its priority (``score_priority``) and sentiment
    (``analyze_sentiment``), embedded in a fixed prompt, and passed to
    ``model.generate`` with at most 100 new tokens.

    Args:
        user_input: The customer message text.
        model: A causal language model exposing ``generate`` and ``device``.
        tokenizer: The tokenizer matching ``model``.

    Returns:
        The generated reply only, stripped of surrounding whitespace. The
        prompt (system line, tags and customer message) is not included.
    """
    priority = score_priority(user_input)
    sentiment = analyze_sentiment(user_input)

    prompt = (
        f"[SYSTEM: This agent must reply in a calm, helpful tone.]\n"
        f"[PRIORITY: {priority}] [SENTIMENT: {sentiment}] "
        f"Customer message: {user_input}\nAgent response:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=100)

    # For a causal LM the generated sequence starts with the prompt tokens;
    # drop them so the caller gets the reply rather than the prompt echoed back.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return response.strip()


# training logging

In [None]:
class LossTracker(TrainerCallback):
    """Trainer callback that keeps a copy of every log record emitted during a run.

    Each stored record is a plain dict containing the logged values plus a
    "step" entry, so the records can be plotted against the training step.
    """

    def __init__(self):
        # Chronological list of log records (one dict per logging event).
        self.logs = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store a snapshot of ``logs`` tagged with the current global step."""
        if logs:
            # Copy so later mutation of the Trainer's dict cannot alter history.
            record = dict(logs)
            # plot_loss() reads record["step"]; the dict handed to callbacks does not
            # necessarily carry it, so take it from the trainer state when absent.
            record.setdefault("step", state.global_step)
            self.logs.append(record)

# One independent tracker per training run.
(
    qlora_callback_r8,
    qlora_callback_r4,
    prompt_callback_20,
    prompt_callback_10,
) = (LossTracker() for _ in range(4))

# Fine tuning method 1: qlora

In [None]:
# -- Shared: Preprocess and Dataset --
# Each JSON-lines file is loaded on its own and read back from the "train" split.
train_data, val_data = (
    load_dataset("json", data_files=split_path)["train"]
    for split_path in ("data/train.json", "data/val.json")
)

def preprocess(example):
    """Tokenize one customer/agent pair as a single causal-LM training sequence.

    Args:
        example: A record with "input" (customer message) and "output"
            (agent reply) fields.

    Returns:
        The tokenizer output for the formatted text, truncated to 512 tokens.
        Sequences are left unpadded; the data collator pads each batch to the
        length of its longest member.
    """
    text = f"Customer message: {example['input']}\nAgent response: {example['output']}"

    # End the target with EOS so the model is shown where an agent reply stops.
    if tokenizer.eos_token:
        text += tokenizer.eos_token

    # No padding="max_length": padding every example to 512 tokens wastes
    # compute when per-batch padding is done by the collator anyway.
    return tokenizer(text, truncation=True, max_length=512)

# Tokenize every example of both splits with preprocess().
train_data, val_data = train_data.map(preprocess), val_data.map(preprocess)

# -- Training Args (shared) --
# This single TrainingArguments instance is passed to every Trainer in the
# notebook, so all runs share the same output_dir.
training_args = TrainingArguments(
    # optimisation
    learning_rate=2e-4,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    fp16=True,
    # evaluation and logging cadence
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=50,
    # checkpointing
    output_dir="./models/qlora/",
    save_steps=200,
    save_total_limit=1
)

# mlm=False: batches are built for causal language modeling, not masked-LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# -- Config 1: QLoRA with r=8 --
qlora_callback_r8 = LossTracker()

# Load a dedicated quantized copy for this adapter. get_peft_model() injects the
# LoRA layers into the model object it receives, so wrapping the shared
# `base_model` would leave these adapters attached to it for every later use
# (the r=4 ablation and the "Base" evaluation). Costs one extra 4-bit copy in memory.
qlora_base_r8 = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
qlora_model_r8 = prepare_model_for_kbit_training(qlora_base_r8)
lora_config_r8 = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
qlora_model_r8 = get_peft_model(qlora_model_r8, lora_config_r8)

trainer_r8 = Trainer(
    model=qlora_model_r8,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    callbacks=[qlora_callback_r8]
)
trainer_r8.train()

# -- Config 2: QLoRA with r=4 (ablation) --
qlora_callback_r4 = LossTracker()

# Start from a freshly loaded quantized model. get_peft_model() modifies the model
# it is given in place, so reusing `base_model` here would stack the r=4 adapters
# on whatever LoRA layers an earlier run already injected and the ablation would
# no longer measure r=4 in isolation. Costs one extra 4-bit copy in memory.
qlora_base_r4 = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
qlora_model_r4 = prepare_model_for_kbit_training(qlora_base_r4)
lora_config_r4 = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
qlora_model_r4 = get_peft_model(qlora_model_r4, lora_config_r4)

trainer_r4 = Trainer(
    model=qlora_model_r4,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    callbacks=[qlora_callback_r4]
)
trainer_r4.train()

# -- Loss curves --
# The QLoRA loss curves are drawn in the plotting cell further down, together with
# the prompt-tuning ones. plot_loss() is only defined in that cell, so calling it
# here raised NameError on a fresh Restart & Run All.


# fine tuning method 2: prompt tuning

In [None]:

# -- Config 1: 20 virtual tokens --
prompt_config_20 = PromptTuningConfig(
    task_type="CAUSAL_LM",
    prompt_tuning_init="TEXT",
    # TEXT initialisation needs the text to initialise the virtual tokens from;
    # without it the config cannot be built.
    prompt_tuning_init_text="Reply to the customer message as a calm, helpful support agent.",
    num_virtual_tokens=20,
    tokenizer_name_or_path=model_name
)
# Fresh quantized model so this run does not share weights or adapters with others.
model_prompt_20 = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
model_prompt_20 = get_peft_model(model_prompt_20, prompt_config_20)
prompt_callback_20 = LossTracker()
trainer_prompt_20 = Trainer(
    model=model_prompt_20,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    callbacks=[prompt_callback_20]
)
trainer_prompt_20.train()

# -- Config 2: 10 virtual tokens (ablation) --
prompt_config_10 = PromptTuningConfig(
    task_type="CAUSAL_LM",
    prompt_tuning_init="TEXT",
    # TEXT initialisation needs the text to initialise the virtual tokens from;
    # without it the config cannot be built.
    prompt_tuning_init_text="Reply to the customer message as a calm, helpful support agent.",
    num_virtual_tokens=10,
    tokenizer_name_or_path=model_name
)
# Fresh quantized model so this run does not share weights or adapters with others.
model_prompt_10 = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
model_prompt_10 = get_peft_model(model_prompt_10, prompt_config_10)
prompt_callback_10 = LossTracker()
trainer_prompt_10 = Trainer(
    model=model_prompt_10,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    callbacks=[prompt_callback_10]
)
trainer_prompt_10.train()

In [None]:
def plot_loss(logs, title):
    """Plot the training-loss curve recorded by a LossTracker.

    Args:
        logs: List of log dicts; only records containing a "loss" key are plotted.
        title: Title of the figure.

    Records are plotted against their "step" value. A record without a "step"
    key is placed at its 1-based position among the loss records instead of
    raising KeyError.
    """
    loss_records = [record for record in logs if "loss" in record]
    steps = [
        record.get("step", position)
        for position, record in enumerate(loss_records, start=1)
    ]
    losses = [record["loss"] for record in loss_records]

    plt.plot(steps, losses, label="Train Loss")
    plt.xlabel("Step")
    plt.ylabel("Loss")
    plt.title(title)
    plt.legend()  # the line label is only shown once a legend is drawn
    plt.grid()
    plt.show()

# One loss figure per training run, in the order the runs were trained.
for tracker, figure_title in (
    (qlora_callback_r8, "QLoRA (r=8) Training Loss"),
    (qlora_callback_r4, "QLoRA (r=4) Training Loss"),
    (prompt_callback_20, "Prompt Tuning (20 Tokens) Training Loss"),
    (prompt_callback_10, "Prompt Tuning (10 Tokens) Training Loss"),
):
    plot_loss(tracker.logs, figure_title)


# Evaluation (pre and post fine-tuning)

In [None]:
def evaluate_model(model, name):
    """Score ``model`` on data/test.json with BLEU, ROUGE and per-sample latency.

    For every test record the model completes the prompt
    "Customer message: ...\\nAgent response:" (at most 100 new tokens) and the
    completion is compared with the reference reply.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        name: Label used in the printed report.

    Returns:
        Dict with the mean "BLEU", "ROUGE-1", "ROUGE-2" and "ROUGE-L" (F-scores)
        and "Latency" (wall-clock seconds per sample, including scoring time).

    Raises:
        ValueError: If the test file contains no examples.
    """
    test_data = load_dataset("json", data_files="data/test.json")['train']
    n = len(test_data)
    if n == 0:
        raise ValueError("data/test.json contains no examples to evaluate.")

    # Created locally: no module-level `rouge` object exists in this notebook.
    rouge = Rouge()
    total_bleu = 0
    total_rouge = {"rouge-1": 0, "rouge-2": 0, "rouge-l": 0}
    start = time.time()

    for ex in test_data:
        prompt = f"Customer message: {ex['input']}\nAgent response:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        output = model.generate(**inputs, max_new_tokens=100)

        # The generated sequence begins with the prompt tokens; score only the
        # continuation, otherwise the customer message counts as prediction text.
        prompt_length = inputs["input_ids"].shape[1]
        pred = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

        total_bleu += sentence_bleu([ex['output'].split()], pred.split())

        try:
            scores = rouge.get_scores(pred, ex['output'])[0]
        except ValueError:
            # Rouge rejects a hypothesis without scorable text (e.g. an empty
            # generation); such a sample contributes zero to every ROUGE total.
            scores = None
        if scores is not None:
            for k in total_rouge:
                total_rouge[k] += scores[k]["f"]

    total_time = time.time() - start

    metrics = {
        "BLEU": total_bleu / n,
        "ROUGE-1": total_rouge["rouge-1"] / n,
        "ROUGE-2": total_rouge["rouge-2"] / n,
        "ROUGE-L": total_rouge["rouge-l"] / n,
        "Latency": total_time / n
    }

    print(f"--- {name} MODEL EVALUATION ---")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    return metrics


# Evaluate every model once; keys are the short names used in later tables and plots.
results = {
    key: evaluate_model(candidate, name=report_label)
    for key, candidate, report_label in (
        ("Base", base_model, "Base"),
        ("QLoRA-8", qlora_model_r8, "QLoRA-Tuned (r=8)"),
        ("QLoRA-4", qlora_model_r4, "QLoRA-Tuned (r=4)"),
        ("Prompt-20", model_prompt_20, "Prompt-Tuned-20"),
        ("Prompt-10", model_prompt_10, "Prompt-Tuned-10"),
    )
}

# aggregating metrics and visualization

## metric plotting

In [None]:
# One row per model, one column per text-overlap metric (latency is left out).
score_columns = ["BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L"]
df = pd.DataFrame(results).T[score_columns]

# Grouped bar chart; the returned axes are styled directly.
ax = df.plot(kind="bar", figsize=(10, 6), ylim=(0, 1))
ax.set_title("LLM Fine-Tuning Performance Comparison")
ax.set_ylabel("Score")
plt.xticks(rotation=0)
ax.grid(axis="y")
ax.legend(loc="lower right")
plt.show()


## sample output comparison

In [None]:
def show_comparisons(n):
    """Print the replies of all five models side by side for the first test records.

    Args:
        n: Number of records from data/test.json to show. Values larger than
            the test set are clamped to its size.
    """
    test_data = load_dataset("json", data_files="data/test.json")['train']
    models = (
        ("Base", base_model),
        ("QLoRA-8", qlora_model_r8),
        ("QLoRA-4", qlora_model_r4),
        ("Prompt-20", model_prompt_20),
        ("Prompt-10", model_prompt_10),
    )

    def get_response(model, prompt):
        """Return only the text ``model`` generates after ``prompt``."""
        # Tokenize per model so the tensors live on that model's own device.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        output = model.generate(**inputs, max_new_tokens=100)
        # Drop the echoed prompt tokens that precede the continuation.
        prompt_length = inputs["input_ids"].shape[1]
        return tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    for i in range(min(n, len(test_data))):
        ex = test_data[i]
        prompt = f"Customer message: {ex['input']}\nAgent response:"

        print(f"\nInput: {ex['input']}")
        print(f"Target: {ex['output']}")
        for label, model in models:
            print(f"{label}: {get_response(model, prompt)}")
        print("-" * 60)

show_comparisons(3)


## latency table

In [None]:
# Per-sample latency of each evaluated model, rounded to two decimals.
latencies = {}
for run_name, run_metrics in results.items():
    latencies[run_name] = round(run_metrics["Latency"], 2)

print("⏱️ Inference Latency per Sample (seconds):")
for run_name, seconds in latencies.items():
    print(f"{run_name}: {seconds}s")


## trainable parameter count comparisons (why QLoRA and prompt tuning are efficient)

In [None]:
def print_trainable_params(model, label):
    """Print how many of ``model``'s parameter elements require gradients.

    Args:
        model: Object whose ``parameters()`` yields items exposing ``numel()``
            and ``requires_grad``.
        label: Prefix for the printed line.

    Prints the trainable count, the total count and the trainable percentage
    (four decimals).
    """
    trainable = 0
    total = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size

    pct = (trainable / total) * 100
    print(f"{label} - Trainable params: {trainable:,} / {total:,} ({pct:.4f}%)")

# Report the trainable share of each fine-tuned model.
for tuned_model, tuned_label in (
    (qlora_model_r8, "QLoRA (r=8)"),
    (qlora_model_r4, "QLoRA (r=4)"),
    (model_prompt_20, "Prompt Tuning (20)"),
    (model_prompt_10, "Prompt Tuning (10)"),
):
    print_trainable_params(tuned_model, tuned_label)


## response category breakdown (error/response breakdown based on input intent)

In [None]:
def categorize_input(text):
    """Assign a customer message to "complaint", "question" or "other".

    A message is a "complaint" when its lowercased text contains any of the
    marker substrings below; otherwise it is a "question" when it contains a
    question mark, and "other" in every remaining case.
    """
    complaint_markers = ("cancel", "angry", "bad", "refund", "issue")
    lowered = text.lower()
    for marker in complaint_markers:
        if marker in lowered:
            return "complaint"
    return "question" if "?" in text else "other"

# Test split with a "category" field derived from each customer message.
test_data = (
    load_dataset("json", data_files="data/test.json")["train"]
    .map(lambda example: {"category": categorize_input(example["input"])})
)

def evaluate_by_category(model, name):
    """Print mean BLEU and ROUGE-L of ``model`` per input category.

    Iterates the module-level ``test_data``, whose records must carry a
    "category" field, generates a reply for each record (at most 100 new
    tokens) and averages the scores within every category.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        name: Label printed above the per-category lines.
    """
    rouge = Rouge()
    scores = defaultdict(lambda: {"bleu": [], "rouge-l": []})

    for ex in test_data:
        prompt = f"Customer message: {ex['input']}\nAgent response:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        output = model.generate(**inputs, max_new_tokens=100)

        # Score only the generated continuation, not the echoed prompt tokens.
        prompt_length = inputs["input_ids"].shape[1]
        pred = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

        bucket = scores[ex["category"]]
        bucket["bleu"].append(sentence_bleu([ex["output"].split()], pred.split()))
        try:
            rouge_score = rouge.get_scores(pred, ex["output"])[0]["rouge-l"]["f"]
        except ValueError:
            # Rouge rejects a hypothesis without scorable text (e.g. an empty
            # generation); count that sample as a zero instead of aborting.
            rouge_score = 0.0
        bucket["rouge-l"].append(rouge_score)

    print(f"\n{name} Evaluation by Category")
    for cat in scores:
        b = sum(scores[cat]["bleu"]) / len(scores[cat]["bleu"])
        r = sum(scores[cat]["rouge-l"]) / len(scores[cat]["rouge-l"])
        print(f"{cat}: BLEU={b:.4f}, ROUGE-L={r:.4f}")

# Labels name the exact configuration; the r=8 and r=4 runs previously both
# printed as "QLoRA", which made their reports indistinguishable.
evaluate_by_category(qlora_model_r8, "QLoRA (r=8)")
evaluate_by_category(qlora_model_r4, "QLoRA (r=4)")
evaluate_by_category(model_prompt_20, "Prompt Tuning (20)")
evaluate_by_category(model_prompt_10, "Prompt Tuning (10)")


## token-level error breakdown (worst BLEU and ROUGE-L samples)

In [None]:

def get_worst_samples(model, tokenizer, test_data, metric="bleu", n=5):
    """Return the ``n`` lowest-scoring test examples for ``model``.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        test_data: Iterable of records with "input" and "output" fields.
        metric: "bleu" (sentence BLEU) or "rouge" (ROUGE-L F-score).
        n: Number of samples to return.

    Returns:
        List of ``(score, input, target, prediction)`` tuples, lowest score first.

    Raises:
        ValueError: If ``metric`` is neither "bleu" nor "rouge".
    """
    # Validate before generating anything, so a typo does not cost a model call.
    if metric not in ("bleu", "rouge"):
        raise ValueError("Unsupported metric.")

    rouge = Rouge()
    scored_samples = []

    for ex in test_data:
        prompt = f"Customer message: {ex['input']}\nAgent response:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        output = model.generate(**inputs, max_new_tokens=100)

        # Keep only the generated continuation; the sequence starts with the prompt.
        prompt_length = inputs["input_ids"].shape[1]
        pred = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

        if metric == "bleu":
            score = sentence_bleu([ex["output"].split()], pred.split())
        else:
            try:
                score = rouge.get_scores(pred, ex["output"])[0]["rouge-l"]["f"]
            except ValueError:
                # Rouge rejects a hypothesis without scorable text (e.g. an empty
                # generation); treat it as the worst possible score.
                score = 0.0

        scored_samples.append((score, ex["input"], ex["output"], pred))

    scored_samples.sort(key=lambda x: x[0])
    return scored_samples[:n]

# Reload the plain test split for the worst-case analysis.
test_data = load_dataset("json", data_files="data/test.json")["train"]

# Five lowest-BLEU samples per fine-tuned model.
(
    worst_bleu_qlora_r8,
    worst_bleu_qlora_r4,
    worst_bleu_prompt_20,
    worst_bleu_prompt_10,
) = (
    get_worst_samples(tuned_model, tokenizer, test_data, metric="bleu", n=5)
    for tuned_model in (qlora_model_r8, qlora_model_r4, model_prompt_20, model_prompt_10)
)

# Five lowest-ROUGE-L samples per fine-tuned model.
(
    worst_rouge_qlora_r8,
    worst_rouge_qlora_r4,
    worst_rouge_prompt_20,
    worst_rouge_prompt_10,
) = (
    get_worst_samples(tuned_model, tokenizer, test_data, metric="rouge", n=5)
    for tuned_model in (qlora_model_r8, qlora_model_r4, model_prompt_20, model_prompt_10)
)

# Display results
def print_worst_cases(samples, method, metric):
    """Print a readable report of low-scoring samples.

    Args:
        samples: Iterable of ``(score, input, target, prediction)`` tuples.
        method: Name of the fine-tuning method, shown in the header.
        metric: Metric name; it is upper-cased in the header and score lines.
    """
    metric_label = metric.upper()
    divider = "-" * 60

    print(f"\n--- Worst {metric_label} Cases for {method} ---")
    for sample in samples:
        score, inp, target, pred = sample
        print(
            f"🔹 Input: {inp}",
            f"✅ Target: {target}",
            f"🧠 Prediction: {pred}",
            f"📉 {metric_label} Score: {score:.4f}",
            divider,
            sep="\n",
        )

# Print the BLEU reports for all four methods first, then the ROUGE reports.
for metric_name, worst_by_method in (
    ("bleu", (
        (worst_bleu_qlora_r8, "QLoRA (r=8)"),
        (worst_bleu_qlora_r4, "QLoRA (r=4)"),
        (worst_bleu_prompt_20, "Prompt Tuning (20)"),
        (worst_bleu_prompt_10, "Prompt Tuning (10)"),
    )),
    ("rouge", (
        (worst_rouge_qlora_r8, "QLoRA (r=8)"),
        (worst_rouge_qlora_r4, "QLoRA (r=4)"),
        (worst_rouge_prompt_20, "Prompt Tuning (20)"),
        (worst_rouge_prompt_10, "Prompt Tuning (10)"),
    )),
):
    for worst_samples, method_label in worst_by_method:
        print_worst_cases(worst_samples, method_label, metric_name)
