# In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score
import numpy as np
from tqdm import tqdm
from scipy.stats import ttest_rel

# Device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and models
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use EOS as the padding token (needed by any tokenizer call that pads).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(DEVICE)
# PeftModel.from_pretrained injects the LoRA layers into the module it is
# handed, in place. Wrapping `base_model` itself would leave the "base" model
# running with the adapter active, so the base vs. fine-tuned comparison
# would compare the adapter against itself. Load a second, independent copy
# of the backbone for the adapter so `base_model` stays untouched.
adapter_backbone = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
fine_tuned_model = PeftModel.from_pretrained(adapter_backbone, "./tinyllama_lora7_final").to(DEVICE)
base_model.eval()
fine_tuned_model.eval()

# Load dataset
dataset = load_dataset("json", data_files={"test": "test.jsonl"}, split="test", encoding="utf-8")

# Prompt formatting
def format_prompt(instruction):
    """Wrap *instruction* in the instruction/response prompt template.

    The returned string ends right after the ``### Response:`` header line,
    so a reference answer or a model completion can be appended directly.
    """
    instruction_section = f"### Instruction:\n{instruction}"
    response_header = "### Response:\n"
    return instruction_section + "\n\n" + response_header

# Perplexity calculation
@torch.no_grad()
def compute_perplexity(model, dataset, tokenizer, max_length=512):
    """Return the token-level perplexity of *model* over *dataset*.

    For every example the formatted prompt and the reference output are
    concatenated and scored as one sequence, so prompt tokens contribute to
    the perplexity as well as the reference output.

    Args:
        model: Causal language model; called with the tokenizer output plus
            ``labels`` and expected to return an object with a ``loss``.
        dataset: Iterable of mappings with "instruction" and "output" keys.
        tokenizer: Tokenizer used to encode each sequence.
        max_length: Sequences longer than this many tokens are truncated.

    Returns:
        ``exp`` of the token-weighted mean loss, or ``nan`` when no token
        could be scored (empty dataset or only single-token sequences).
    """
    total_loss, total_tokens = 0.0, 0
    for example in tqdm(dataset, desc="Computing Perplexity"):
        prompt = format_prompt(example["instruction"]) + example["output"]
        # One sequence per forward pass, so no padding is needed. This also
        # avoids masking labels by pad id, which would hide genuine EOS
        # tokens because the pad token is the EOS token in this script.
        inputs = tokenizer(prompt, max_length=max_length, truncation=True, return_tensors="pt").to(DEVICE)
        input_ids = inputs.input_ids
        # Hugging Face causal LMs shift the labels internally: token i is
        # predicted from tokens < i, so the returned loss is a mean over
        # (sequence length - 1) targets. Weight by that count, not by the
        # number of label tokens.
        num_targets = input_ids.size(1) - 1
        if num_targets <= 0:
            # Nothing to predict; the loss would be undefined.
            continue
        loss = model(**inputs, labels=input_ids).loss.item()
        total_loss += loss * num_targets
        total_tokens += num_targets
    if total_tokens == 0:
        # Perplexity over zero tokens is undefined; avoid ZeroDivisionError.
        return float("nan")
    return np.exp(total_loss / total_tokens)

# Response generation
def generate_response(model, instruction, max_new_tokens=200):
    """Sample a completion from *model* for *instruction*.

    The instruction is wrapped with ``format_prompt``, truncated to 1024
    tokens, and passed to ``model.generate`` with nucleus sampling
    (``top_p=0.9``, ``temperature=0.6``), so repeated calls may return
    different text.

    Args:
        model: Model exposing ``generate``.
        instruction: Instruction text to answer.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated text with the prompt removed and surrounding
        whitespace stripped, or the sentinel
        ``"[ERROR: Failed to generate response]"`` if generation raised.
    """
    import logging

    prompt = format_prompt(instruction)
    try:
        inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(DEVICE)
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.6,
            # Explicit pad id: avoids generate() falling back to EOS with a
            # warning on every call.
            pad_token_id=tokenizer.pad_token_id,
        )
        # The returned sequence starts with the prompt tokens. Slice them off
        # by length instead of splitting the decoded text on the response
        # marker, which would drop part of the answer whenever the model
        # emits "### Response:" itself.
        prompt_length = inputs["input_ids"].shape[1]
        completion_ids = outputs[0][prompt_length:]
        return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()
    except Exception:
        # Best-effort: keep the evaluation running, but leave a trace so the
        # sentinel string is not silently scored as a real answer.
        logging.getLogger(__name__).exception("Generation failed for instruction %r", instruction)
        return "[ERROR: Failed to generate response]"

# Evaluation metrics
# BERTScorer instances keyed by (model_type, lang). ``bert_score.score`` loads
# the scoring model on every call; caching a scorer loads it only once.
_BERT_SCORERS = {}


def _get_bert_scorer(model_type, lang):
    """Return a cached ``BERTScorer`` for *model_type*/*lang*, loading it on first use."""
    cache_key = (model_type, lang)
    if cache_key not in _BERT_SCORERS:
        # Imported here so the file-level import block needs no change.
        from bert_score import BERTScorer

        _BERT_SCORERS[cache_key] = BERTScorer(model_type=model_type, lang=lang)
    return _BERT_SCORERS[cache_key]


def compute_metrics(reference, hypothesis, rouge):
    """Score *hypothesis* against *reference* with BLEU, ROUGE and BERTScore.

    Args:
        reference: Gold answer text.
        hypothesis: Model-generated text.
        rouge: Scorer object exposing ``score(target, prediction)``.

    Returns:
        A ``(bleu, rouge_scores, bert_f1)`` tuple: the smoothed sentence-level
        BLEU over whitespace tokens, whatever ``rouge.score`` returns, and the
        BERTScore F1 as a Python float.
    """
    bleu = sentence_bleu(
        [reference.split()],
        hypothesis.split(),
        smoothing_function=SmoothingFunction().method1,
    )
    rouge_result = rouge.score(reference, hypothesis)
    scorer = _get_bert_scorer("microsoft/deberta-xlarge-mnli", "en")
    _, _, f1 = scorer.score([hypothesis], [reference], verbose=False)
    return bleu, rouge_result, f1.item()

# Evaluation process: perplexity of each model on the test set (base first)
results = {"base": {}, "fine_tuned": {}}
for tag, lm in (("base", base_model), ("fine_tuned", fine_tuned_model)):
    results[tag]["perplexity"] = compute_perplexity(lm, dataset, tokenizer)

# Containers for per-example metrics, one list per model
bleu_scores = {name: [] for name in ("base", "fine_tuned")}
rouge_scores = {name: {"rouge1": [], "rougeL": []} for name in ("base", "fine_tuned")}
bert_scores = {name: [] for name in ("base", "fine_tuned")}
rouge = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

# Main evaluation loop
for example in tqdm(dataset, desc="Evaluating Responses"):
    instruction = example["instruction"]
    reference = example["output"]

    # Generate with both models before scoring anything (base first).
    responses = {
        "base": generate_response(base_model, instruction),
        "fine_tuned": generate_response(fine_tuned_model, instruction),
    }
    # Score both responses before recording any of them, so the per-model
    # lists always grow together and stay paired by example.
    scored = {tag: compute_metrics(reference, text, rouge) for tag, text in responses.items()}

    for tag, (bleu, rouge_result, bert_f1) in scored.items():
        bleu_scores[tag].append(bleu)
        for rouge_key in ("rouge1", "rougeL"):
            rouge_scores[tag][rouge_key].append(rouge_result[rouge_key].fmeasure)
        bert_scores[tag].append(bert_f1)

# Aggregate scores: mean of every per-example metric, per model
for tag in ("base", "fine_tuned"):
    results[tag]["bleu"] = np.mean(bleu_scores[tag])
    results[tag]["rouge1"] = np.mean(rouge_scores[tag]["rouge1"])
    results[tag]["rougeL"] = np.mean(rouge_scores[tag]["rougeL"])
    results[tag]["bertscore"] = np.mean(bert_scores[tag])

# Print final results: one section per model, one line per metric
print("\nQuantitative Evaluation Results:")
for model in ("base", "fine_tuned"):
    heading = model.title()
    print(f"{heading} Model:")
    for metric, value in results[model].items():
        label = metric.capitalize()
        print(f"  {label}: {value:.4f}")

# Paired t-tests: each test pairs the base and fine-tuned score of the same
# example; only the p-value is reported.
print("\nStatistical Significance (Paired t-tests):")
paired_samples = [
    ("BLEU", bleu_scores["base"], bleu_scores["fine_tuned"]),
    ("ROUGE-1", rouge_scores["base"]["rouge1"], rouge_scores["fine_tuned"]["rouge1"]),
    ("BERTScore", bert_scores["base"], bert_scores["fine_tuned"]),
]
for label, base_sample, ft_sample in paired_samples:
    _, p_value = ttest_rel(base_sample, ft_sample)
    print(f"{label} p-value: {p_value:.4f}")

# Robustness Test (Out-of-Domain Prompts): print both models' answers to a
# few hand-written questions side by side for manual inspection.
print("\nRobustness Test (Out-of-Domain Instructions):")
ood_questions = [
    "How can CSS be optimized to reduce carbon emissions?",
    "How does video loading impact a webpage's energy consumption?",
    "What is a carbon footprint, and how can it be reduced in web design?",
]
for number, question in enumerate(ood_questions, start=1):
    base_answer = generate_response(base_model, question)
    ft_answer = generate_response(fine_tuned_model, question)
    print(f"\nQuestion {number}: {question}")
    print(f"Base: {base_answer}")
    print(f"Fine-tuned: {ft_answer}")
