In [None]:
# 📌 Cell 1: Imports & Setup
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Model identifiers and on-disk index location used by the cells below
GEN_MODEL = "Qwen/Qwen2.5-14B"
EMBED_MODEL = "intfloat/e5-base-v2"
INDEX_DIR = "./faiss_index"

# Open the persisted FAISS index, embedding queries with EMBED_MODEL.
# NOTE(review): allow_dangerous_deserialization=True is an explicit opt-in;
# presumably the saved index is unpickled on load, so only point INDEX_DIR
# at an index you built yourself or otherwise trust.
retriever = FAISS.load_local(
    INDEX_DIR,
    HuggingFaceEmbeddings(model_name=EMBED_MODEL),
    allow_dangerous_deserialization=True,
)

# Load the Qwen2.5 base checkpoint (not the chat-tuned variant) in half
# precision, letting device_map="auto" decide where the weights are placed.
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    GEN_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()  # inference only: switch off training-mode layers


In [None]:
# 📌 Cell 2: Helper — Format Prompt with Retrieved Docs
def build_prompt(question, docs, max_docs=3):
    """Assemble the text prompt sent to the generator model.

    Args:
        question: The question to place at the end of the prompt.
        docs: Sequence of objects exposing a ``page_content`` attribute.
        max_docs: Only the first ``max_docs`` entries of ``docs`` are used.

    Returns:
        A string containing a fixed instruction header, the selected
        documents labelled ``Document 1``, ``Document 2``, ... and separated
        by blank lines, then the question followed by a trailing ``Answer:``
        cue for the model to continue from.
    """
    selected = docs[:max_docs]
    context = "\n\n".join(
        f"Document {number}:\n{entry.page_content}"
        for number, entry in enumerate(selected, start=1)
    )

    return (
        "You are a historical researcher analyzing government records.\n"
        "\n"
        "Use the following documents to answer the question.\n"
        "\n"
        f"{context}\n"
        "\n"
        f"Question: {question}\n"
        "Answer:"
    )


In [None]:
# 📌 Cell 3: Ask a Question
def ask_question(question, k=5, max_tokens=512):
    """Retrieve context for a question, generate an answer, and print it.

    Args:
        question: Natural-language question. It is used both as the
            similarity-search query and as the question in the prompt.
        k: Number of documents requested from the vector store.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        None. The generated answer is printed to stdout.
    """
    # Retrieve docs
    docs = retriever.similarity_search(question, k=k)

    # Build prompt
    # NOTE(review): build_prompt is called with its default max_docs (3), so
    # values of k above 3 fetch extra documents that never reach the prompt.
    prompt = build_prompt(question, docs)

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate (sampling, so repeated calls may give different answers)
    output = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens. The returned sequence starts
    # with the prompt tokens; dropping them by token count is robust, whereas
    # slicing the decoded text by len(prompt) breaks whenever decoding does
    # not reproduce the prompt character-for-character.
    generated_tokens = output[0][prompt_token_count:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    print("🧠 Model Answer:\n", answer.strip())
