**Dependencies**

In [None]:
# Install the third-party packages used by this notebook (IPython shell escape).
!pip install numpy pandas scikit-learn torch sentence-transformers nltk
import nltk

# Tokenizer data and the stop-word list, fetched through NLTK's downloader.
nltk.download('punkt')
nltk.download('stopwords')

# Extra tokenizer resource requested by newer NLTK versions.
nltk.download('punkt_tab')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

**Fine-Tuning**

In [None]:
# -------------------------------
# 1️⃣ Install dependencies
# -------------------------------
!pip install transformers datasets faiss-gpu accelerate sentence-transformers tqdm -q

# -------------------------------
# 2️⃣ Imports
# -------------------------------
import random
import torch
from torch import nn, optim
from datasets import load_dataset
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from tqdm import tqdm

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# -------------------------------
# 3️⃣ Load SQuAD v1.1 (20k examples)
# -------------------------------
dataset = load_dataset("squad")
# Keep the first 20,000 training rows and the first 500 validation rows.
train_data = dataset['train'].select(range(20000))
val_data   = dataset['validation'].select(range(500))
print(f"Training examples: {len(train_data)}, Validation examples: {len(val_data)}")

# -------------------------------
# 4️⃣ Prepare DPR-format examples
# -------------------------------
def prepare_dpr_format(dataset, num_negatives=1):
    """Build DPR training records: question, positive passage, random negatives.

    Args:
        dataset: Iterable of mappings with ``'question'`` and ``'context'`` keys.
        num_negatives: How many negative passages to draw for each question.

    Returns:
        A list of dicts with keys ``'question'``, ``'positive_ctx'`` and
        ``'negative_ctxs'``. The negatives are distinct passages, none of
        which equals the positive passage.

    Raises:
        ValueError: If ``num_negatives`` is negative or larger than the number
            of distinct passages other than the positive one.
    """
    # Read the dataset once; every later step works on plain Python tuples.
    rows = [(ex['question'], ex['context']) for ex in dataset]

    # Distinct passages in first-seen order. Sampling from this list keeps a
    # passage that backs many questions from being drawn twice as a negative.
    unique_contexts = list(dict.fromkeys(ctx for _, ctx in rows))

    max_negatives = len(unique_contexts) - 1
    if rows and not 0 <= num_negatives <= max_negatives:
        raise ValueError(
            f"num_negatives={num_negatives} is outside the range 0..{max_negatives} "
            "allowed by the number of distinct contexts"
        )

    examples = []
    for question, positive_ctx in rows:
        # Draw one spare passage, then drop the positive if it was drawn.
        # This avoids rebuilding a filtered copy of the corpus per example.
        drawn = random.sample(unique_contexts, num_negatives + 1)
        negatives = [c for c in drawn if c != positive_ctx][:num_negatives]
        examples.append({
            'question': question,
            'positive_ctx': positive_ctx,
            'negative_ctxs': negatives
        })
    return examples

# Turn both splits into question / positive / negatives records.
train_examples = prepare_dpr_format(train_data)
val_examples = prepare_dpr_format(val_data)

# -------------------------------
# 5️⃣ Load DPR encoders & tokenizers
# -------------------------------
# Both encoders start from the `facebook/dpr-*-single-nq-base` checkpoints and are moved to `device`.
q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base").to(device)
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(device)

# Each tokenizer is loaded from the same checkpoint name as its encoder.
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# -------------------------------
# 6️⃣ Encode examples on GPU
# -------------------------------
def encode_examples(examples, q_tokenizer, ctx_tokenizer, max_length=512):
    """Tokenize every question and positive passage and move them to ``device``.

    Args:
        examples: Records with ``'question'`` and ``'positive_ctx'`` keys
            (``'negative_ctxs'`` is not read here).
        q_tokenizer: Callable tokenizer applied to the questions.
        ctx_tokenizer: Callable tokenizer applied to the positive passages.
        max_length: Truncation limit passed to both tokenizers.

    Returns:
        ``(q_inputs, ctx_inputs)``: the two tokenizer outputs after ``.to(device)``.
    """
    # Identical settings for both sides: pad within the call, truncate, return PyTorch tensors.
    shared_kwargs = dict(
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    questions = [record['question'] for record in examples]
    passages = [record['positive_ctx'] for record in examples]

    q_inputs = q_tokenizer(questions, **shared_kwargs).to(device)
    ctx_inputs = ctx_tokenizer(passages, **shared_kwargs).to(device)
    return q_inputs, ctx_inputs

train_q, train_ctx = encode_examples(train_examples, q_tokenizer, ctx_tokenizer)
val_q, val_ctx = encode_examples(val_examples, q_tokenizer, ctx_tokenizer)

# -------------------------------
# 7️⃣ Fine-tuning loop on GPU
# -------------------------------
optimizer = optim.Adam(list(q_encoder.parameters()) + list(ctx_encoder.parameters()), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()
batch_size = 8
num_epochs = 2
gradient_accumulation_steps = 2
eval_batch_size = 32  # validation is encoded in chunks of this size

num_train = len(train_q['input_ids'])
num_val = len(val_q['input_ids'])
# Ceiling division. `n // batch_size + 1` counts one batch too many whenever
# n is a multiple of batch_size, which skews the average loss and the bar total.
num_batches = (num_train + batch_size - 1) // batch_size

for epoch in range(num_epochs):
    q_encoder.train()
    ctx_encoder.train()
    total_loss = 0
    # Start every epoch from clean gradients.
    optimizer.zero_grad()

    # The loss treats every other passage in the batch as a negative. SQuAD
    # has several questions per paragraph and the rows for one paragraph
    # appear to sit next to each other, so sequential batches would contain
    # the positive passage again as a "negative". Shuffling makes that rare.
    order = torch.randperm(num_train, device=train_q['input_ids'].device)

    print(f"\nEpoch {epoch+1}/{num_epochs}")
    batch_starts = tqdm(range(0, num_train, batch_size), total=num_batches, desc="Training")
    for step, start in enumerate(batch_starts, start=1):
        batch_idx = order[start:start + batch_size]
        # The encoded tensors already live on `device`; index them directly.
        q_batch = {k: v[batch_idx] for k, v in train_q.items()}
        ctx_batch = {k: v[batch_idx] for k, v in train_ctx.items()}

        q_emb = q_encoder(**q_batch).pooler_output
        ctx_emb = ctx_encoder(**ctx_batch).pooler_output

        # Row i should score highest against column i (its own passage).
        scores = torch.matmul(q_emb, ctx_emb.T)
        labels = torch.arange(scores.size(0), device=scores.device)
        loss = loss_fn(scores, labels) / gradient_accumulation_steps
        loss.backward()

        # Also step on the last batch so leftover accumulated gradients are
        # applied instead of leaking into the next epoch.
        if step % gradient_accumulation_steps == 0 or step == num_batches:
            optimizer.step()
            optimizer.zero_grad()

        # Undo the accumulation scaling so the logged value is the batch loss.
        total_loss += loss.item() * gradient_accumulation_steps

    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1} average training loss: {avg_loss:.4f}")

    # -------------------------------
    # Validation on GPU
    # -------------------------------
    q_encoder.eval()
    ctx_encoder.eval()
    with torch.no_grad():
        # Encode in chunks to avoid one forward pass over the whole split;
        # the loss is still computed on the full score matrix.
        q_chunks, ctx_chunks = [], []
        for start in range(0, num_val, eval_batch_size):
            stop = start + eval_batch_size
            q_chunks.append(q_encoder(**{k: v[start:stop] for k, v in val_q.items()}).pooler_output)
            ctx_chunks.append(ctx_encoder(**{k: v[start:stop] for k, v in val_ctx.items()}).pooler_output)
        q_emb_val = torch.cat(q_chunks)
        ctx_emb_val = torch.cat(ctx_chunks)
        val_scores = torch.matmul(q_emb_val, ctx_emb_val.T)
        val_labels = torch.arange(val_scores.size(0), device=val_scores.device)
        val_loss = loss_fn(val_scores, val_labels).item()
        print(f"Epoch {epoch+1} validation loss: {val_loss:.4f}")

# -------------------------------
# 8️⃣ Save fine-tuned models
# -------------------------------
# Only the encoder weights/config are written; the tokenizers are not saved here.
q_encoder.save_pretrained("/content/dpr_question_encoder_squad_20k_gpu")
ctx_encoder.save_pretrained("/content/dpr_ctx_encoder_squad_20k_gpu")
print("Fine-tuned DPR models saved successfully!")

# -------------------------------
# 9️⃣ Example usage on GPU
# -------------------------------
query = "What is machine learning?"
q_inputs = q_tokenizer(query, return_tensors="pt").to(device)
# Inference only: switch to eval mode and skip building an autograd graph.
q_encoder.eval()
with torch.no_grad():
    q_emb = q_encoder(**q_inputs).pooler_output
print("Query embedding shape:", q_emb.shape)


**Saving to disk**

In [None]:
from google.colab import drive
import shutil
import os

# -------------------------------
# 1️⃣ Mount Google Drive
# -------------------------------
drive.mount('/content/drive')

# -------------------------------
# 2️⃣ Define paths (corrected!)
# -------------------------------
# These are the directories written by save_pretrained in the fine-tuning cell.
q_encoder_path = "/content/dpr_question_encoder_squad_20k_gpu"
ctx_encoder_path = "/content/dpr_ctx_encoder_squad_20k_gpu"

# Destination folder on the mounted Drive; created if it does not exist yet.
drive_folder = "/content/drive/MyDrive/DPR_Models"
os.makedirs(drive_folder, exist_ok=True)

# -------------------------------
# 3️⃣ Function to zip a folder safely
# -------------------------------
def zip_folder(src_path, dst_folder, zip_name):
    """Zip the contents of ``src_path`` into ``<dst_folder>/<zip_name>.zip``.

    Prints the archive location on success. If ``src_path`` is not an existing
    directory, prints a "folder not found" message and does nothing else.

    Args:
        src_path: Directory whose contents are archived.
        dst_folder: Directory that receives the archive.
        zip_name: Archive file name without the ``.zip`` suffix.
    """
    if not os.path.isdir(src_path):
        print(f"❌ Folder not found: {src_path}")
        return

    # make_archive appends ".zip" itself, so pass the suffix-less base name.
    # Join it directly: stripping ".zip" with str.replace would also rewrite
    # any ".zip" inside dst_folder and put the archive in the wrong place.
    base_name = os.path.join(dst_folder, zip_name)
    zip_path = shutil.make_archive(base_name=base_name, format='zip', root_dir=src_path)
    print(f"{zip_name} successfully saved to: {zip_path}")

# -------------------------------
# 4️⃣ Zip the models
# -------------------------------
# One archive per encoder directory, written into the Drive folder.
zip_folder(q_encoder_path, drive_folder, "dpr_question_encoder_squad_20k_gpu")
zip_folder(ctx_encoder_path, drive_folder, "dpr_ctx_encoder_squad_20k_gpu")


**Indexing**

In [None]:
# Number of SQuAD training rows whose contexts are indexed by the retrievers.
NUM_TRAIN_EXAMPLES = 85000
# Number of SQuAD validation questions loaded by the evaluation cells.
NUM_VALIDATION_EXAMPLES = 2000
# Number of passages requested from a retriever per query.
TOP_K = 7


In [None]:
# ================================
# BM25 + Fine-Tuned DPR + Hybrid Retriever
# ================================

!pip install datasets scikit-learn nltk torch tqdm transformers --quiet

import re, math, os, pickle
import numpy as np
import torch
from collections import Counter, defaultdict
from typing import List
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# NLTK setup
nltk.download('punkt')
nltk.download('stopwords')
# Newer NLTK versions also need 'punkt_tab'; fetch it here so this cell
# works without the dependencies cell having been run first.
nltk.download('punkt_tab')

# ================================
# CONFIG
# ================================
DATASET_SELECTION = "v1"
#NUM_TRAIN_EXAMPLES = 200  # Adjust as needed
# File that caches the fitted retrievers between runs.
PICKLE_FILE = f"retrievers_{DATASET_SELECTION}.pkl"
#TOP_K = 5

# Directories written by the fine-tuning cell.
CTX_MODEL_PATH = "/content/dpr_ctx_encoder_squad_20k_gpu"
Q_MODEL_PATH = "/content/dpr_question_encoder_squad_20k_gpu"

# ================================
# Text Preprocessor
# ================================
class TextPreprocessor:
    """Normalise text for BM25: lower-case, letters only, stop-words removed, stemmed."""

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def preprocess_text(self, text: str) -> str:
        """Lower-case ``text``, delete everything but ASCII letters and whitespace, collapse whitespace."""
        lowered = str(text).lower()
        letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
        return ' '.join(letters_only.split())

    def tokenize_and_stem(self, text: str):
        """Return the stems of all tokens that are not stop-words and are longer than two characters."""
        stems = []
        for token in word_tokenize(self.preprocess_text(text)):
            if token in self.stop_words or len(token) <= 2:
                continue
            stems.append(self.stemmer.stem(token))
        return stems

# ================================
# BM25 Retriever
# ================================
class BM25Retriever:
    """BM25 ranking over an in-memory list of documents.

    ``fit`` tokenizes the corpus and builds per-term postings (document ids
    and term frequencies), so a query only touches the documents that contain
    one of its terms instead of re-counting every document's tokens.
    """

    def __init__(self, k1=1.5, b=0.75):
        # k1 scales term-frequency saturation; b scales length normalisation.
        self.k1, self.b = k1, b
        self.preprocessor = TextPreprocessor()

    def fit(self, corpus: List[str]):
        """Tokenize ``corpus`` and compute IDF, document lengths and postings."""
        self.corpus = corpus
        self.tokens = [self.preprocessor.tokenize_and_stem(doc) for doc in corpus]
        df = defaultdict(int)
        for tks in self.tokens:
            for tok in set(tks):
                df[tok] += 1
        self.idf = {tok: math.log((len(corpus)-f+0.5)/(f+0.5)+1) for tok, f in df.items()}
        self.doc_len = [len(tks) for tks in self.tokens]
        self.avgdl = np.mean(self.doc_len)
        self._build_postings()

    def _build_postings(self):
        """Build term -> (doc ids, term frequencies) arrays and the per-document length term."""
        doc_ids = defaultdict(list)
        term_freqs = defaultdict(list)
        for i, tks in enumerate(self.tokens):
            for tok, tf in Counter(tks).items():
                doc_ids[tok].append(i)
                term_freqs[tok].append(tf)
        self.postings = {
            tok: (np.asarray(ids, dtype=np.int64), np.asarray(term_freqs[tok], dtype=np.int64))
            for tok, ids in doc_ids.items()
        }
        # Guard the division; with a zero/NaN average there are no postings to score anyway.
        avgdl = self.avgdl if self.avgdl > 0 else 1.0
        doc_len = np.asarray(self.doc_len, dtype=np.float64)
        self.len_norm = self.k1 * (1 - self.b + self.b * doc_len / avgdl)

    def get_scores(self, query: str):
        """Return one BM25 score per corpus document for ``query``."""
        # Instances unpickled from an older index have no postings yet.
        if not hasattr(self, 'postings') or not hasattr(self, 'len_norm'):
            self._build_postings()
        q_toks = self.preprocessor.tokenize_and_stem(query)
        scores = np.zeros(len(self.corpus))
        # A term repeated in the query contributes once per occurrence.
        for tok in q_toks:
            posting = self.postings.get(tok)
            if posting is None:
                continue
            ids, tf = posting
            idf = self.idf.get(tok, 0)
            scores[ids] += idf*(tf*(self.k1+1))/(tf + self.len_norm[ids])
        return scores

    def retrieve(self, query, k=TOP_K):
        """Return the ``k`` best documents as ``{'document', 'score'}`` dicts, best first."""
        s = self.get_scores(query)
        idx = np.argsort(s)[::-1][:k]
        return [{'document': self.corpus[i], 'score': s[i]} for i in idx]

# ================================
# DPR Retriever (Fine-Tuned) - Fixed
# ================================
class DPRRetriever:
    """Dense retriever that encodes passages and queries with the fine-tuned DPR encoders."""

    def __init__(self, ctx_model_path=CTX_MODEL_PATH, q_model_path=Q_MODEL_PATH, batch_size=16):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Tokenizers are loaded from the original hub checkpoints
        self.ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
            "facebook/dpr-ctx_encoder-single-nq-base"
        )
        self.q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
            "facebook/dpr-question_encoder-single-nq-base"
        )

        # Encoder weights are loaded from the fine-tuned model directories
        ctx_encoder = DPRContextEncoder.from_pretrained(ctx_model_path)
        self.ctx_encoder = ctx_encoder.to(self.device)
        q_encoder = DPRQuestionEncoder.from_pretrained(q_model_path)
        self.q_encoder = q_encoder.to(self.device)

        self.batch_size = batch_size
        self.max_length = 512  # passages and queries are truncated to this length

    def _tokenize(self, tokenizer, texts):
        """Tokenize ``texts`` with padding and truncation and move the tensors to ``self.device``."""
        encoded = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return encoded.to(self.device)

    def fit(self, corpus: List[str]):
        """Encode ``corpus`` batch by batch and keep the embeddings on the CPU."""
        self.corpus = corpus
        step = self.batch_size
        batch_starts = tqdm(
            range(0, len(corpus), step),
            total=math.ceil(len(corpus)/step),
            desc="Encoding DPR corpus",
            unit="batch",
        )
        chunks = []
        for start in batch_starts:
            inputs = self._tokenize(self.ctx_tokenizer, corpus[start:start+step])
            with torch.no_grad():
                pooled = self.ctx_encoder(**inputs).pooler_output
            chunks.append(pooled.cpu())

        self.embeddings = torch.cat(chunks, dim=0)

    def retrieve(self, query, k=TOP_K):
        """Return the ``k`` passages with the highest cosine similarity to ``query``, best first."""
        inputs = self._tokenize(self.q_tokenizer, [query])

        with torch.no_grad():
            q_emb = self.q_encoder(**inputs).pooler_output.cpu()

        sims = cosine_similarity(q_emb.numpy(), self.embeddings.numpy())[0]
        best_first = np.argsort(sims)[::-1][:k]
        hits = []
        for i in best_first:
            hits.append({'document': self.corpus[i], 'score': sims[i]})
        return hits

# ================================
# Hybrid Retriever
# ================================
class HybridRetriever:
    """Blend BM25 and DPR rankings with a single weight ``alpha``.

    Each retriever's scores are min-max scaled to [0, 1] before mixing,
    because BM25 sums and cosine similarities are on different scales.
    A passage that occurs several times in the corpus is scored once; its
    duplicates are no longer added together.
    """

    def __init__(self, bm25: BM25Retriever, dpr: DPRRetriever, alpha: float = 0.5):
        self.bm25 = bm25
        self.dpr = dpr
        # Weight of the DPR score; BM25 gets ``1 - alpha``.
        self.alpha = alpha

    @staticmethod
    def _normalized_by_document(results):
        """Map each distinct document text to its min-max scaled score."""
        per_doc = {}
        for r in results:
            doc, score = r['document'], float(r['score'])
            # Keep the best score per text instead of summing duplicates.
            if doc not in per_doc or score > per_doc[doc]:
                per_doc[doc] = score
        if not per_doc:
            return per_doc
        lo, hi = min(per_doc.values()), max(per_doc.values())
        span = hi - lo
        if span == 0:
            # All scores equal: no ranking information from this retriever.
            return dict.fromkeys(per_doc, 0.0)
        return {doc: (score - lo) / span for doc, score in per_doc.items()}

    def retrieve(self, query, k=TOP_K):
        """Return the ``k`` best documents as ``{'document', 'score'}`` dicts, best first."""
        bm25_scores = self._normalized_by_document(
            self.bm25.retrieve(query, k=len(self.bm25.corpus))
        )
        dpr_scores = self._normalized_by_document(
            self.dpr.retrieve(query, k=len(self.dpr.corpus))
        )
        combined_scores = {doc: (1 - self.alpha) * score for doc, score in bm25_scores.items()}
        for doc, score in dpr_scores.items():
            combined_scores[doc] = combined_scores.get(doc, 0.0) + self.alpha * score
        sorted_docs = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:k]
        return [{'document': doc, 'score': score} for doc, score in sorted_docs]

# ================================
# Load / Index
# ================================
if os.path.exists(PICKLE_FILE):
    # NOTE: pickle.load can execute arbitrary code from the file. Only load a
    # cache that this notebook wrote itself.
    with open(PICKLE_FILE, "rb") as f:
        retrievers = pickle.load(f)
    print(f"✅ Loaded retrievers from {PICKLE_FILE}. You can skip indexing.")
else:
    print("Loading SQuAD dataset...")
    squad_train = load_dataset("squad", split=f"train[:{NUM_TRAIN_EXAMPLES}]")
    # There is one row per question, so a paragraph shared by several
    # questions shows up several times. Index each distinct paragraph once
    # (first-seen order) so results are not filled with copies of one passage.
    docs = list(dict.fromkeys(item['context'] for item in squad_train))

    print("Indexing BM25...")
    bm25 = BM25Retriever()
    bm25.fit(docs)

    print("Indexing fine-tuned DPR...")
    dpr = DPRRetriever()
    dpr.fit(docs)

    print("Creating Hybrid Retriever...")
    hybrid = HybridRetriever(bm25, dpr, alpha=0.5)

    retrievers = {"bm25": bm25, "dpr": dpr, "hybrid": hybrid}

    # The pickle stores the whole retriever objects, including the DPR encoders.
    with open(PICKLE_FILE, "wb") as f:
        pickle.dump(retrievers, f)
    print(f"✅ Retrievers indexed and saved to {PICKLE_FILE}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading SQuAD dataset...
Indexing BM25...
Indexing fine-tuned DPR...


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Encoding DPR corpus: 100%|██████████| 5000/5000 [18:54<00:00,  4.41batch/s]


Creating Hybrid Retriever...
✅ Retrievers indexed and saved to retrievers_v1.pkl


**Retriever-Only Comparison**

In [None]:
import pickle
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ================================
# Configuration
# ================================
DATASET_SELECTION = "v1"
#NUM_VALIDATION_EXAMPLES = 10
#TOP_K = 5
PICKLE_FILE = "retrievers_v1.pkl"
SEMANTIC_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
SEM_SIM_THRESHOLD = 0.6  # Threshold for hit

# ================================
# Load retrievers
# ================================
# pickle resolves classes by name when loading, so the retriever class
# definitions must already exist in this session.
with open(PICKLE_FILE, "rb") as f:
    retrievers = pickle.load(f)
print(f"✅ Retrievers loaded from {PICKLE_FILE}")

# ================================
# Load validation data
# ================================
# val_data holds (dataset tag, example) pairs; NUM_VALIDATION_EXAMPLES comes from an earlier cell.
val_data = []
if DATASET_SELECTION in ["v1", "both"]:
    squad_v1_val = load_dataset("squad", split=f"validation[:{NUM_VALIDATION_EXAMPLES}]")
    val_data.extend([("v1", item) for item in squad_v1_val])

if DATASET_SELECTION in ["v2", "both"]:
    squad_v2_val = load_dataset("squad_v2", split=f"validation[:{NUM_VALIDATION_EXAMPLES}]")
    val_data.extend([("v2", item) for item in squad_v2_val])

# ================================
# Normalize scores function
# ================================
def normalize_scores(scores):
    """Min-max scale ``scores`` into [0, 1].

    An empty input is returned unchanged. If every score is equal, a list of
    ``0.0`` of the same length is returned.
    """
    if len(scores) == 0:
        return scores
    lowest = min(scores)
    highest = max(scores)
    if highest - lowest == 0:
        return [0.0 for _ in scores]
    scaled = []
    for value in scores:
        scaled.append((value - lowest) / (highest - lowest))
    return scaled

# ================================
# Load semantic similarity model
# ================================
semantic_model = SentenceTransformer(SEMANTIC_MODEL)

# ================================
# Evaluation
# ================================
results = []

for dataset_name, example in tqdm(val_data, desc="Evaluating queries"):
    question = example['question']
    gold_answer = example['answers']['text'][0] if example['answers']['text'] else ""
    # The gold answer is the same for every retriever and rank: encode it once per question.
    gold_emb = semantic_model.encode([gold_answer])

    for retriever_type in ["bm25", "dpr", "hybrid"]:
        retriever = retrievers.get(retriever_type)
        if retriever is None:
            continue

        retrieved_docs = retriever.retrieve(question, k=TOP_K)
        if not retrieved_docs:
            continue
        raw_scores = [r['score'] for r in retrieved_docs]
        norm_scores = normalize_scores(raw_scores)

        # Encode all retrieved passages in one batch, not one call per passage.
        doc_embs = semantic_model.encode([r['document'] for r in retrieved_docs])
        sem_sims = cosine_similarity(gold_emb, doc_embs)[0]

        for rank, (r, score, sem_sim) in enumerate(zip(retrieved_docs, norm_scores, sem_sims), 1):
            # Hit if semantic similarity >= threshold
            hit = 1 if sem_sim >= SEM_SIM_THRESHOLD else 0

            results.append({
                "Dataset": dataset_name,
                "Retriever": retriever_type,
                "Question": question,
                "Gold Answer": gold_answer,
                "Rank": rank,
                "Retrieved": r['document'][:200],
                "Score": score,
                "Hit": hit,
                "Semantic Sim": sem_sim
            })

# ================================
# Convert results to DataFrame
# ================================
df_results = pd.DataFrame(results)

# ================================
# Compute metrics per retriever
# ================================
metrics = []
for retriever_type in df_results["Retriever"].unique():
    rows = df_results.loc[df_results["Retriever"] == retriever_type]
    # 1 for a question when any of its retrieved passages was a hit.
    any_hit = rows.groupby("Question")["Hit"].max()

    # Precision: share of retrieved passages that are hits.
    precision = rows["Hit"].sum() / len(rows)
    # Recall: share of distinct questions with at least one hit.
    recall = any_hit.sum() / rows["Question"].nunique()
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    metrics.append(dict([
        ("Retriever", retriever_type),
        ("Precision", precision),
        ("Recall", recall),
        ("F1", f1),
        ("Accuracy", recall),  # reported as the same value as recall
        ("Avg Semantic Sim", rows["Semantic Sim"].mean()),
    ]))

df_metrics = pd.DataFrame(metrics)

print("\n📊 Retriever Metrics (F1 / Precision / Recall / Semantic Similarity):")
print(df_metrics)

# ================================
# Save results
# ================================
for frame, csv_name in (
    (df_results, "retriever_results_semantic.csv"),
    (df_metrics, "retriever_metrics_semantic.csv"),
):
    frame.to_csv(csv_name, index=False)
print("\n✅ Detailed results saved to CSV")


✅ Retrievers loaded from retrievers_v1.pkl


Evaluating queries:   0%|          | 4/2000 [00:21<2:54:19,  5.24s/it]

In [None]:
import pickle

# Load retrievers
# (the retriever classes must already be defined in this session for unpickling to work)
with open("/content/retrievers_v1.pkl", "rb") as f:
    retrievers = pickle.load(f)

# Sample query
query = "What is machine learning?"

# Test each retriever
# Each retriever runs with its own default k; an error in one is printed and the loop continues.
for name, retriever in retrievers.items():
    print(f"\n🔎 Testing retriever: {name}")
    try:
        results = retriever.retrieve(query)
        print(f"   Returned {len(results)} results")
        for r in results[:3]:  # show only top 3 results
            print("   ➜", r)
    except Exception as e:
        print(f"   ⚠️ Error running {name}: {e}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



🔎 Testing retriever: bm25
   Returned 5 results
   ➜ {'document': "All of Notre Dame's undergraduate students are a part of one of the five undergraduate colleges at the school or are in the First Year of Studies program. The First Year of Studies program was established in 1962 to guide incoming freshmen in their first year at the school before they have declared a major. Each student is given an academic advisor from the program who helps them to choose classes that give them exposure to any major in which they are interested. The program also includes a Learning Resource Center which provides time management, collaborative learning, and subject tutoring. This program has been recognized previously, by U.S. News & World Report, as outstanding.", 'score': np.float64(5.987318973268757)}
   ➜ {'document': "All of Notre Dame's undergraduate students are a part of one of the five undergraduate colleges at the school or are in the First Year of Studies program. The First Year of Studies p

In [None]:
# Compare scores for the top result of each retriever
# Reuses `retrievers` and `query` from the previous cell. BM25 scores and
# cosine similarities are on different scales, so the printed numbers are
# not directly comparable across retrievers.
for name, retriever in retrievers.items():
    results = retriever.retrieve(query)
    if results:
        top_score = results[0]['score']
        print(f"{name}: top result score = {top_score}")


bm25_v1: top result score = 13.621926586355245
dpr_v1: top result score = 0.506264865398407
hybrid_v1: top result score = 37.050485626545935


**LLM + Retriever**

In [None]:
# ================================
# Install packages
# ================================
!pip install sentence-transformers datasets scikit-learn nltk torch tqdm transformers rouge-score sacrebleu --quiet

import pickle, re, math
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from typing import List
from tqdm import tqdm
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
import sacrebleu

# NLTK setup
nltk.download('punkt')
nltk.download('stopwords')
# Newer NLTK versions also need 'punkt_tab'; fetch it here so this cell
# works without the dependencies cell having been run first.
nltk.download('punkt_tab')

# ================================
# Configuration
# ================================
DATASET_SELECTION = "v1"
#NUM_VALIDATION_EXAMPLES = 5   # small for testing
#TOP_K = 5
PICKLE_FILE = "retrievers_v1.pkl"
LLM_MODEL = "EleutherAI/gpt-neo-125M"
SEMANTIC_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
MAX_CONTEXT_TOKENS = 500  # limit tokens sent to LLM to avoid crashing

# ================================
# Text preprocessor (for BM25)
# ================================
class TextPreprocessor:
    """Normalise text for BM25: lower-case, letters only, stop-words removed, stemmed."""

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def preprocess_text(self, text: str) -> str:
        """Lower-case ``text``, delete everything but ASCII letters and whitespace, collapse whitespace."""
        lowered = str(text).lower()
        letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
        return ' '.join(letters_only.split())

    def tokenize_and_stem(self, text: str):
        """Return the stems of all tokens that are not stop-words and are longer than two characters."""
        stems = []
        for token in word_tokenize(self.preprocess_text(text)):
            if token in self.stop_words or len(token) <= 2:
                continue
            stems.append(self.stemmer.stem(token))
        return stems

# ================================
# Retriever Classes
# ================================
class BM25Retriever:
    """BM25 ranking over an in-memory list of documents.

    ``fit`` tokenizes the corpus and builds per-term postings (document ids
    and term frequencies), so a query only touches the documents that contain
    one of its terms instead of re-counting every document's tokens.
    """

    def __init__(self, k1=1.5, b=0.75):
        # k1 scales term-frequency saturation; b scales length normalisation.
        self.k1, self.b = k1, b
        self.preprocessor = TextPreprocessor()

    def fit(self, corpus: List[str]):
        """Tokenize ``corpus`` and compute IDF, document lengths and postings."""
        self.corpus = corpus
        self.tokens = [self.preprocessor.tokenize_and_stem(doc) for doc in corpus]
        df = defaultdict(int)
        for tks in self.tokens:
            for tok in set(tks):
                df[tok] += 1
        self.idf = {tok: math.log((len(corpus)-f+0.5)/(f+0.5)+1) for tok, f in df.items()}
        self.doc_len = [len(tks) for tks in self.tokens]
        self.avgdl = np.mean(self.doc_len)
        self._build_postings()

    def _build_postings(self):
        """Build term -> (doc ids, term frequencies) arrays and the per-document length term."""
        doc_ids = defaultdict(list)
        term_freqs = defaultdict(list)
        for i, tks in enumerate(self.tokens):
            for tok, tf in Counter(tks).items():
                doc_ids[tok].append(i)
                term_freqs[tok].append(tf)
        self.postings = {
            tok: (np.asarray(ids, dtype=np.int64), np.asarray(term_freqs[tok], dtype=np.int64))
            for tok, ids in doc_ids.items()
        }
        # Guard the division; with a zero/NaN average there are no postings to score anyway.
        avgdl = self.avgdl if self.avgdl > 0 else 1.0
        doc_len = np.asarray(self.doc_len, dtype=np.float64)
        self.len_norm = self.k1 * (1 - self.b + self.b * doc_len / avgdl)

    def get_scores(self, query: str):
        """Return one BM25 score per corpus document for ``query``."""
        # Instances unpickled from an older index have no postings yet.
        if not hasattr(self, 'postings') or not hasattr(self, 'len_norm'):
            self._build_postings()
        q_toks = self.preprocessor.tokenize_and_stem(query)
        scores = np.zeros(len(self.corpus))
        # A term repeated in the query contributes once per occurrence.
        for tok in q_toks:
            posting = self.postings.get(tok)
            if posting is None:
                continue
            ids, tf = posting
            idf = self.idf.get(tok, 0)
            scores[ids] += idf*(tf*(self.k1+1))/(tf + self.len_norm[ids])
        return scores

    def retrieve(self, query, k=3):
        """Return the ``k`` best documents as ``{'document', 'score'}`` dicts, best first."""
        s = self.get_scores(query)
        idx = np.argsort(s)[::-1][:k]
        return [{'document': self.corpus[i], 'score': s[i]} for i in idx]

class DPRRetriever:
    """Dense retriever that ranks passages by cosine similarity to the query.

    New instances embed text with a SentenceTransformer model. This class
    shares its name with the fine-tuned DPR retriever that the indexing cell
    pickles, and pickle resolves classes by name, so objects loaded from that
    cache become instances of this class while carrying ``q_encoder`` /
    ``q_tokenizer`` instead of ``model``. ``retrieve`` supports both layouts.
    """

    def __init__(self, model_name="paraphrase-MiniLM-L3-v2", batch_size=16):
        self.model = SentenceTransformer(model_name)
        self.batch_size = batch_size

    def fit(self, corpus: List[str]):
        """Embed ``corpus`` in batches of ``batch_size`` and keep the stacked embeddings."""
        self.corpus = corpus
        all_embs = []
        for i in range(0, len(corpus), self.batch_size):
            batch = corpus[i:i+self.batch_size]
            all_embs.append(self.model.encode(batch, convert_to_tensor=True))
        self.embeddings = torch.cat(all_embs, dim=0)

    def _embed_query(self, query):
        """Return the query embedding as a ``(1, dim)`` NumPy array."""
        q_encoder = getattr(self, "q_encoder", None)
        if q_encoder is None:
            return self.model.encode([query], convert_to_tensor=True).cpu().numpy()

        # Instance restored from the fine-tuned DPR index: encode with its own
        # question encoder, on whatever device the restored weights live on.
        inputs = self.q_tokenizer(
            [query],
            padding=True,
            truncation=True,
            max_length=getattr(self, "max_length", 512),
            return_tensors="pt"
        ).to(next(q_encoder.parameters()).device)
        with torch.no_grad():
            return q_encoder(**inputs).pooler_output.cpu().numpy()

    def retrieve(self, query, k=3):
        """Return the ``k`` most similar passages as ``{'document', 'score'}`` dicts, best first."""
        q_emb = self._embed_query(query)
        sims = cosine_similarity(q_emb, self.embeddings.cpu().numpy())[0]
        idx = np.argsort(sims)[::-1][:k]
        return [{'document': self.corpus[i], 'score': sims[i]} for i in idx]

class HybridRetriever:
    """Blend BM25 and DPR rankings with a single weight ``alpha``.

    Each retriever's scores are min-max scaled to [0, 1] before mixing,
    because BM25 sums and cosine similarities are on different scales.
    A passage that occurs several times in the corpus is scored once; its
    duplicates are no longer added together.
    """

    def __init__(self, bm25: "BM25Retriever", dpr: "DPRRetriever", alpha: float = 0.5):
        self.bm25 = bm25
        self.dpr = dpr
        # Weight of the DPR score; BM25 gets ``1 - alpha``.
        self.alpha = alpha

    @staticmethod
    def _normalized_by_document(results):
        """Map each distinct document text to its min-max scaled score."""
        per_doc = {}
        for r in results:
            doc, score = r['document'], float(r['score'])
            # Keep the best score per text instead of summing duplicates.
            if doc not in per_doc or score > per_doc[doc]:
                per_doc[doc] = score
        if not per_doc:
            return per_doc
        lo, hi = min(per_doc.values()), max(per_doc.values())
        span = hi - lo
        if span == 0:
            # All scores equal: no ranking information from this retriever.
            return dict.fromkeys(per_doc, 0.0)
        return {doc: (score - lo) / span for doc, score in per_doc.items()}

    def retrieve(self, query, k=3):
        """Return the ``k`` best documents as ``{'document', 'score'}`` dicts, best first."""
        bm25_scores = self._normalized_by_document(
            self.bm25.retrieve(query, k=len(self.bm25.corpus))
        )
        dpr_scores = self._normalized_by_document(
            self.dpr.retrieve(query, k=len(self.dpr.corpus))
        )
        combined_scores = {doc: (1 - self.alpha) * score for doc, score in bm25_scores.items()}
        for doc, score in dpr_scores.items():
            combined_scores[doc] = combined_scores.get(doc, 0.0) + self.alpha * score
        sorted_docs = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:k]
        return [{'document': doc, 'score': score} for doc, score in sorted_docs]

# ================================
# Load retrievers
# ================================
# Unpickling binds the stored objects to the retriever classes defined above in this cell.
with open(PICKLE_FILE, "rb") as f:
    retrievers = pickle.load(f)
print(f"✅ Retrievers loaded from {PICKLE_FILE}")

# ================================
# Load validation data
# ================================
# val_data holds (dataset tag, example) pairs; NUM_VALIDATION_EXAMPLES comes from an earlier cell.
val_data = []
if DATASET_SELECTION in ["v1", "both"]:
    squad_v1_val = load_dataset("squad", split=f"validation[:{NUM_VALIDATION_EXAMPLES}]")
    val_data.extend([("v1", item) for item in squad_v1_val])

# ================================
# Load LLM
# ================================
print("Loading lightweight LLM (GPT-Neo 125M)...")
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
model = AutoModelForCausalLM.from_pretrained(LLM_MODEL, device_map="auto")
# The pipeline default of 200 new tokens is passed again explicitly in generate_answer.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)  # CPU safe

def generate_answer(context: str, question: str):
    """Ask the LLM to answer ``question`` from ``context`` and return its continuation.

    Args:
        context: Retrieved passage text. Only the first ``MAX_CONTEXT_TOKENS``
            whitespace-separated words are kept (a word count, not a model
            token count).
        question: The question appended to the prompt.

    Returns:
        The text generated after the prompt, stripped of surrounding whitespace.
    """
    # truncate context to avoid token overflow
    words = context.split()[:MAX_CONTEXT_TOKENS]
    truncated_context = " ".join(words)
    prompt = f"Context: {truncated_context}\n\nQuestion: {question}\nAnswer:"
    # return_full_text=False returns only the continuation. Splitting the
    # full text on "Answer:" and taking the last piece would drop the real
    # answer whenever the model writes another "Question: ... Answer: ...".
    out = generator(
        prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,
    )
    return out[0]['generated_text'].strip()

# ================================
# Hallucination metric
# ================================
def hallucination_rate(answer: str, context: str):
    """Return the share of distinct answer words that do not appear in the context.

    Words are lower-cased and split on whitespace. An answer with no words
    scores 1.0.
    """
    answer_words = set(answer.lower().split())
    context_words = set(context.lower().split())
    if not answer_words:
        return 1.0
    supported = answer_words.intersection(context_words)
    return 1 - len(supported) / len(answer_words)

# ================================
# Load semantic similarity model
# ================================
semantic_model = SentenceTransformer(SEMANTIC_MODEL)
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# ================================
# Evaluation
# ================================
results = []

for dataset_name, example in tqdm(val_data, desc="Evaluating queries with LLM"):
    question = example['question']
    gold_answer = example['answers']['text'][0] if example['answers']['text'] else ""
    # These depend only on the question, so compute them once per question.
    gold_emb = semantic_model.encode([gold_answer])
    gold_tokens = set(gold_answer.lower().split())

    for retriever_type in ["bm25", "dpr", "hybrid"]:
        retriever_key = f"{retriever_type}_{dataset_name}"
        # The indexing cell stores plain keys ("bm25"); older caches used a
        # dataset suffix ("bm25_v1"). Accept both so the loop is not silently empty.
        retriever = retrievers.get(retriever_key)
        if retriever is None:
            retriever = retrievers.get(retriever_type)
        if retriever is None:
            continue

        retrieved = retriever.retrieve(question, k=TOP_K)
        # Raw retriever scores despite the name; only their mean is reported.
        norm_scores = [r['score'] for r in retrieved]
        context = " ".join([r['document'] for r in retrieved])
        llm_answer = generate_answer(context, question)
        hall = hallucination_rate(llm_answer, context)

        # F1 over distinct lower-cased words
        pred_tokens = set(llm_answer.lower().split())
        common = gold_tokens & pred_tokens
        f1 = 2*len(common)/(len(gold_tokens)+len(pred_tokens)) if (len(gold_tokens)+len(pred_tokens)>0) else 0.0

        # BLEU
        bleu = sacrebleu.corpus_bleu([llm_answer], [[gold_answer]]).score

        # ROUGE-L
        rouge_l = rouge.score(gold_answer, llm_answer)['rougeL'].fmeasure

        # Semantic similarity
        sem_sim = cosine_similarity(
            gold_emb,
            semantic_model.encode([llm_answer])
        )[0][0]

        # An empty gold answer is a substring of everything; never count it as a hit.
        hit = 1 if gold_answer and gold_answer.lower() in llm_answer.lower() else 0

        results.append({
            "Dataset": dataset_name,
            "Retriever": retriever_key + "+LLM",
            "Question": question,
            "Gold Answer": gold_answer,
            "Retrieved Context": context[:200],
            "LLM Answer": llm_answer[:200],
            "Score": np.mean(norm_scores),
            "Hit": hit,
            "Hallucination": hall,
            "F1": f1,
            "BLEU": bleu,
            "ROUGE-L": rouge_l,
            "Semantic Sim": sem_sim
        })

# ================================
# Convert results to DataFrame
# ================================
df_results = pd.DataFrame(results)
metrics = []

# One summary row per "<retriever>+LLM" label.
for retriever_key in df_results["Retriever"].unique():
    rows = df_results.loc[df_results["Retriever"] == retriever_key]
    # 1 for a question when any of its rows was a hit.
    any_hit = rows.groupby("Question")["Hit"].max()
    question_hit_rate = any_hit.sum() / rows["Question"].nunique()

    summary = {
        "Retriever": retriever_key,
        "Precision": rows["Hit"].sum() / len(rows),
        "Recall": question_hit_rate,
        "F1": rows["F1"].mean(),
        "Accuracy": question_hit_rate,  # reported as the same value as recall
    }
    # The remaining columns are plain means of the per-row values.
    for column in ("Hallucination", "BLEU", "ROUGE-L", "Semantic Sim"):
        summary[column] = rows[column].mean()
    metrics.append(summary)

df_metrics = pd.DataFrame(metrics)

print("\n📊 Retriever+LLM Metrics (Extended):")
print(df_metrics)

# ================================
# Save results
# ================================
for frame, csv_name in (
    (df_results, "retriever_llm_results_light.csv"),
    (df_metrics, "retriever_llm_metrics_light.csv"),
):
    frame.to_csv(csv_name, index=False)
print("\n✅ Results saved to CSV")
