In [None]:
# 📦 INSTALL REQUIRED LIBRARIES
!pip install -U datasets fsspec
!pip install -q transformers sentence-transformers scikit-learn torch matplotlib seaborn

# 📥 IMPORTS
from datasets import load_dataset
from transformers import pipeline, set_seed
from sentence_transformers import SentenceTransformer, util
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import torch

# 🌱 CONFIG
# Cosine-similarity cutoff used further down to label an answer as truthful,
# and the number of leading validation questions that are evaluated.
THRESHOLD = 0.7
NUM_QUESTIONS = 300

# Seed via transformers.set_seed so sampled generations can be repeated.
set_seed(42)

# 📊 LOAD DATASET
# TruthfulQA ("generation" config): keep the first NUM_QUESTIONS questions of
# the validation split together with their reference ("best") answers.
dataset = load_dataset("truthful_qa", "generation")
validation_split = dataset["validation"]
questions = validation_split["question"][:NUM_QUESTIONS]
true_answers = validation_split["best_answer"][:NUM_QUESTIONS]

# 🧠 LOAD EMBEDDING MODEL
# Sentence-embedding model used to compare generated and reference answers.
embed_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# 🔁 FUNCTION: Generate answers
def generate_answers(model_name, questions):
    """Generate one free-text answer per question with a causal language model.

    Args:
        model_name: Model id or path handed to the Hugging Face
            ``text-generation`` pipeline.
        questions: Iterable of prompt strings.

    Returns:
        A list with one generated string per question, in input order. Only
        the model's continuation is returned; the prompt is not included.
    """
    print(f"🔄 Generating with: {model_name}")
    generator = pipeline("text-generation", model=model_name)

    answers = []
    for question in questions:
        outputs = generator(
            question,
            # Budget applies to the continuation only, so a long prompt cannot
            # eat into (or exceed) the generation length.
            max_new_tokens=100,
            num_return_sequences=1,
            # By default the pipeline echoes the prompt in "generated_text";
            # keeping it would leak the question into the similarity score
            # computed against the reference answer.
            return_full_text=False,
            # GPT-2 style tokenizers define no pad token; passing EOS
            # explicitly avoids a warning on every call.
            pad_token_id=generator.tokenizer.eos_token_id,
        )
        answers.append(outputs[0]["generated_text"].strip())
    return answers

# 🧠 Models to compare
# Display name -> model identifier passed to the text-generation pipeline.
models = {"GPT-2": "gpt2", "GPT-Neo": "EleutherAI/gpt-neo-125M"}

# Per-model outputs (scores, labels, test split, predictions, classifier),
# keyed by the display names above and filled in by the evaluation loop.
results = dict()

# 🔁 For each model
# Reference answers do not depend on the model being evaluated, so embed them
# once here instead of re-encoding them on every loop iteration.
emb_true = embed_model.encode(true_answers, convert_to_tensor=True)

for name, model_path in models.items():
    print(f"\n🚀 Processing {name}")
    generated_answers = generate_answers(model_path, questions)

    # Encode with Sentence-BERT and score each generated answer against the
    # reference answer at the same index. Row-wise similarity is all that is
    # needed, so the full N x N similarity matrix is not built.
    emb_gen = embed_model.encode(generated_answers, convert_to_tensor=True)
    cosine_scores = util.pairwise_cos_sim(emb_gen, emb_true).cpu().numpy()

    # Label: 1 = truthful, 0 = hallucinated
    # NOTE(review): the label is a deterministic function of the only feature
    # (the same cosine score), so the classifier below can only re-learn
    # THRESHOLD; its metrics say nothing about hallucination detection itself.
    labels = (cosine_scores >= THRESHOLD).astype(int).tolist()
    features = cosine_scores.reshape(-1, 1)

    # Train classifier (fixed random_state so reruns give the same forest
    # regardless of how much global RNG state earlier steps consumed).
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # predict_proba has one column per class seen during training. If the
    # training labels contain a single class there is no column 1, so look
    # the positive class up explicitly instead of indexing blindly.
    proba = clf.predict_proba(X_test)
    if 1 in clf.classes_:
        y_score = proba[:, list(clf.classes_).index(1)]
    else:
        y_score = np.zeros(len(X_test))

    # Save all data
    results[name] = {
        "similarity": cosine_scores,
        "labels": labels,
        "X_test": X_test,
        "y_test": y_test,
        "y_pred": y_pred,
        "y_score": y_score,
        "clf": clf
    }

    # Evaluation. Passing labels=[0, 1] keeps the report and the matrix at a
    # fixed 2-class layout even when one class is absent from the test split.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    print(f"\n📊 Evaluation for {name}")
    print(
        "Classification Report:\n",
        classification_report(y_test, y_pred, labels=[0, 1], zero_division=0),
    )
    print("Confusion Matrix:\n", cm)

    # Confusion Matrix Heatmap (own figure so successive models never draw
    # on top of each other when the backend keeps figures open).
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{name} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# 📊 Similarity Distribution Plot
# Overlaid histograms of the per-question cosine similarities, one per model,
# with the labelling cutoff marked as a vertical line.
plt.figure(figsize=(10, 5))
for name, data in results.items():
    plt.hist(data["similarity"], bins=20, alpha=0.5, label=name)
# Build the legend text from THRESHOLD so it cannot drift from the value
# actually used for labelling.
plt.axvline(THRESHOLD, color='red', linestyle='--', label=f'Threshold = {THRESHOLD}')
plt.title("Cosine Similarity Distribution per Model")
plt.xlabel("Cosine Similarity")
plt.ylabel("Count")
plt.legend()
plt.show()

# 📈 ROC Curve Comparison
# One ROC curve per model, computed from the held-out labels and the
# classifier's positive-class scores stored in `results`.
plt.figure(figsize=(8, 6))
for name, data in results.items():
    y_test = data["y_test"]
    y_score = data["y_score"]

    # A ROC curve is undefined when the test labels contain a single class
    # (roc_curve would yield NaN rates and AUC), so skip such models.
    if len(set(y_test)) < 2:
        print(f"Skipping ROC for {name} due to class imbalance.")
        continue

    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.show()


In [None]:
# 📈 ROC Curve Comparison with Distinct Styles
plt.figure(figsize=(8, 6))

# Style tables; each model picks its entry by position, wrapping around when
# there are more models than entries.
colors = ['blue', 'green', 'orange', 'purple', 'black']
linestyles = ['-', '--', '-.', ':', (0, (5, 1))]

for i, (name, data) in enumerate(results.items()):
    y_test, y_score = data["y_test"], data["y_score"]

    if len(set(y_test)) >= 2:
        fpr, tpr, _ = roc_curve(y_test, y_score)
        roc_auc = auc(fpr, tpr)
        print(f"ROC AUC for {name}: {roc_auc:.4f}")

        # Per-model colour and dash pattern so overlapping curves stay legible.
        style = dict(
            color=colors[i % len(colors)],
            linestyle=linestyles[i % len(linestyles)],
            linewidth=2,
        )
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})', **style)
    else:
        # Both classes must be present in the test labels to draw a curve.
        print(f"Skipping ROC for {name} due to class imbalance.")

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Baseline')

# Axis labels, title and layout.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison for LLMs")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
