In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np

# Load dataset (5% sample from squad_v2)
dataset = load_dataset("squad_v2", split="train")

# Shuffle with a fixed seed so the sample is reproducible, then keep the
# first 5% of the shuffled rows.
sample_size = int(len(dataset) * 0.05)
dataset_sample = dataset.shuffle(seed=42).select(range(sample_size))

# Pull the three columns out of the sampled rows; entry i of each comes from
# the same row of dataset_sample.
contexts, questions, answers = (
    dataset_sample[column] for column in ("context", "question", "answers")
)

# Embedding model (e5-small)
embedder = SentenceTransformer("intfloat/e5-small")

# E5 models are trained with role prefixes: text that goes into the index must
# be encoded as "passage: <text>" (the query side uses "query: <text>").
# Only the text handed to the encoder is prefixed; `contexts` itself keeps the
# raw passages so they can be shown or fed to the generator unchanged.
context_embeddings = embedder.encode(
    [f"passage: {text}" for text in contexts],
    batch_size=16,
    convert_to_numpy=True,
    # With unit-length passage vectors, ranking by L2 distance is equivalent
    # to ranking by cosine similarity regardless of the query vector's norm.
    normalize_embeddings=True,
    show_progress_bar=True,
)

# FAISS requires a C-contiguous float32 matrix of shape (n_vectors, dim).
context_embeddings = np.ascontiguousarray(context_embeddings, dtype=np.float32)

# Exact (brute-force) L2 index: row i of the index corresponds to contexts[i].
dim = context_embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(context_embeddings)

# Generator model (flan-t5-small), wrapped in a text2text-generation pipeline
# so it can be called directly on a prompt string.
generator = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",
)

def rag_answer(question, top_k=3):
    """Answer *question* using retrieved passages as context for the generator.

    The question is embedded, the nearest passages are looked up in the
    module-level FAISS ``index``, and the question plus those passages are
    passed to the module-level ``generator``.

    Args:
        question: Natural-language question string.
        top_k: Maximum number of distinct passages to retrieve.

    Returns:
        A ``(answer, retrieved)`` tuple: the generated answer text and the
        list of passage strings (at most ``top_k``, nearest first) that were
        placed in the prompt.
    """
    # E5 models are trained with role prefixes; a search query must be
    # encoded as "query: <text>" to land in the right region of the space.
    q_emb = embedder.encode(["query: " + question], convert_to_numpy=True)

    # The same passage text can appear in several rows of `contexts` (SQuAD
    # attaches many questions to one paragraph), so over-fetch and keep only
    # the first occurrence of each passage.
    fetch_k = min(index.ntotal, top_k * 4)

    retrieved = []
    if fetch_k > 0:
        _, hit_ids = index.search(q_emb, fetch_k)
        for hit in hit_ids[0]:
            # FAISS pads the result with -1 when fewer than k vectors exist;
            # indexing contexts[-1] would silently return the last passage.
            if hit < 0:
                continue
            passage = contexts[int(hit)]
            if passage in retrieved:
                continue
            retrieved.append(passage)
            if len(retrieved) == top_k:
                break

    prompt = question + " \n\n" + " ".join(retrieved)
    answer = generator(prompt, max_length=128, num_return_sequences=1)[0]['generated_text']
    return answer, retrieved

# Example query: run one question through the pipeline and show the answer
# together with the passages it was generated from.
q = "Who wrote the Declaration of Independence?"
ans, ctxs = rag_answer(q)

print(f"Q: {q}")
print(f"Answer: {ans}")
print(f"Retrieved Contexts: {ctxs}")


### Initial PoC Results (5% Sample, e5-small + flan-t5-small)

- **Query:** "Who wrote the Declaration of Independence?"  
- **Answer:** *James Madison*  
- **Retrieved Contexts:** The top-k results mostly contained passages about James Madison, not Thomas Jefferson.  

**Interpretation**  
- The pipeline works end-to-end: embedding → vector search → generation.  
- However, the retrieved contexts did not include Jefferson, so the generator produced an incorrect answer (Madison).  
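- This points to a limitation in retrieval rather than in the generator itself: the generator answered from the passages it was given. Because the index covers only a 5% sample of SQuAD v2, a passage about Jefferson may not be in the corpus at all, so corpus coverage should be checked alongside retrieval quality.  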

**Implications**  
- The initial RAG PoC is functional but highlights the need for accuracy improvements.  
- Potential next steps include:  
  - Tuning retrieval parameters (e.g., increasing *k*)  
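  - Trying different FAISS index types (note: `IndexFlatL2` is already an exact search, so other index types mainly trade accuracy for speed and memory rather than improve accuracy)  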
  - Adding a reranker to improve context selection  
  - Prompt engineering for better grounding in retrieved passages  

👉 This result should be recorded in `docs/initial_poc_results.md` as part of the **“Vector Search + Top-k Initial Report.”**
