# Vector Databases and Retrieval (RAG) - Interactive Notebook

This notebook provides hands-on examples to accompany the vector retrieval lesson:
- Generate embeddings and chunk text
- Build and persist Chroma and FAISS indexes
- Use retrievers (similarity, MMR, threshold)
- Hybrid search with BM25 + vectors via RRF
- Minimal RAG chain with citations
- Quick evaluation of retrieval quality

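Ensure your `.env` defines `OPENAI_API_KEY` (used for both embeddings and the LLM) and that `chromadb` and `faiss-cpu` are installed. The token-based splitter additionally needs `tiktoken`, and the BM25 retriever needs `rank_bm25`.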

## 0) Setup and Imports

In [None]:
import os
import json
from collections import defaultdict
from pathlib import Path
from typing import List, Dict, Any

import numpy as np

from dotenv import load_dotenv
# Read the local .env file (where OPENAI_API_KEY is expected) before any
# client is constructed.
load_dotenv()

# LangChain core
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
# LLMChain is not part of langchain_core (it has no `chains` module); it is
# provided by the main `langchain` package.
from langchain.chains import LLMChain

# Models & embeddings
# Optional dependency: record availability in a flag instead of failing the
# whole notebook when the OpenAI integration is missing.
try:
    from langchain_openai import ChatOpenAI, OpenAIEmbeddings
    openai_available = True
except Exception as e:
    print("OpenAI integrations not available:", e)
    openai_available = False

# Vector stores
from langchain_community.vectorstores import Chroma, FAISS

# Text splitters
from langchain_text_splitters import RecursiveCharacterTextSplitter, TokenTextSplitter

# BM25 retriever (for hybrid)
# Optional dependency, handled the same way as the OpenAI integration above.
try:
    from langchain_community.retrievers import BM25Retriever
    bm25_available = True
except Exception as e:
    print("BM25 retriever not available:", e)
    bm25_available = False

print("Setup complete.")

## 1) Embeddings (OpenAI) - Quick Demo

In [None]:
# `embeddings` is the shared embedding client used by the later cells; it
# stays None unless the client was created AND a probe query succeeded.
embeddings = None
if openai_available:
    try:
        candidate = OpenAIEmbeddings()
        # Embed one short query as a probe so a bad or missing API key is
        # detected here instead of in a later cell.
        vec = candidate.embed_query("LangChain makes RAG easy")
    except Exception as e:
        # Leave `embeddings` as None so cells guarded by
        # `embeddings is not None` skip cleanly instead of failing again.
        print("Failed to initialize embeddings (check OPENAI_API_KEY):", e)
    else:
        embeddings = candidate
        print("Embedding length:", len(vec), "sample:", vec[:5])
else:
    print("OpenAI embeddings unavailable. Continuing with code that does not require embeddings.")

## 2) Chunking Strategies (Recursive and Token-based)

In [None]:
# Sample passage shared by both splitters (three sentences joined by spaces).
text = " ".join([
    "LangChain is a framework for developing applications powered by LLMs.",
    "It provides prompts, chains, agents, memory, and integrations.",
    "Vector databases enable semantic search via embeddings.",
])

# Character-based recursive splitting: separators are tried in the listed
# order, from paragraph breaks down to single characters.
recursive = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=400,
    chunk_overlap=60,
)
rec_chunks = recursive.split_text(text)
print("Recursive chunks:", len(rec_chunks))
for position, chunk in enumerate(rec_chunks, start=1):
    print(f"{position}: {len(chunk)} chars")

# Token-based splitting; guarded because constructing or running the
# splitter may fail (the error is printed rather than raised).
try:
    token_splitter = TokenTextSplitter(
        encoding_name="cl100k_base",
        chunk_size=80,
        chunk_overlap=20,
    )
    tok_chunks = token_splitter.split_text(text)
    print("Token chunks:", len(tok_chunks))
    for position, chunk in enumerate(tok_chunks, start=1):
        print(f"{position}: {len(chunk)} chars")
except Exception as e:
    print("TokenTextSplitter unavailable:", e)

## 3) Build a Persistent Chroma Index

In [None]:
import hashlib

persist_dir = "./chroma_rag"
Path(persist_dir).mkdir(exist_ok=True)

# Sample docs
docs = [
    Document(page_content="LangChain supports prompts, chains, agents, and memory.",
             metadata={"source": "notes", "topic": "langchain"}),
    Document(page_content="Vector databases enable semantic search via embeddings.",
             metadata={"source": "notes", "topic": "vectors"}),
    Document(page_content="RAG combines retrieval with generation to ground responses in data.",
             metadata={"source": "notes", "topic": "rag"}),
]

# Stable, content-derived IDs. The collection is persisted on disk, so
# re-running this cell without explicit IDs would store the same passages
# again under fresh IDs; with stable IDs a re-run targets the same records.
# NOTE: duplicates written by earlier runs (without IDs) are not removed.
doc_ids = [
    hashlib.sha256(d.page_content.encode("utf-8")).hexdigest() for d in docs
]

vectordb = None
if embeddings is not None:
    try:
        vectordb = Chroma(
            collection_name="demo",
            embedding_function=embeddings,
            persist_directory=persist_dir
        )
        vectordb.add_documents(docs, ids=doc_ids)
        vectordb.persist()
        try:
            # `_collection` is a private attribute; fall back to a generic
            # message if it is not accessible.
            cnt = vectordb._collection.count()
            print("Chroma collection count:", cnt)
        except Exception:
            print("Chroma persisted.")
    except Exception as e:
        print("Chroma error:", e)
else:
    print("Skipping Chroma build (no embeddings).")

### Metadata Filtering with Chroma (if available)

In [None]:
# Similarity search restricted by a metadata filter on the `topic` field.
if vectordb is None:
    print("Vectordb not initialized.")
else:
    try:
        topic_filter = {"topic": "vectors"}
        results = vectordb.similarity_search("semantic search", k=3, filter=topic_filter)
        print("Filtered results:")
        for doc in results:
            preview = doc.page_content[:60]  # first 60 characters only
            print(doc.metadata, "|", preview)
    except Exception as e:
        print("Filtering error:", e)

## 4) FAISS: Build, Search, and Persist/Load (In-memory index)

In [None]:
# Build a FAISS index from the sample docs, query it, then save it to disk
# and load it back.
faiss_db = None
if embeddings is None:
    print("Skipping FAISS (no embeddings).")
else:
    try:
        faiss_db = FAISS.from_documents(docs, embeddings)
        hits = faiss_db.similarity_search("What enables semantic search?", k=2)
        print("FAISS hits:")
        for hit in hits:
            print(hit.metadata, "|", hit.page_content[:60])

        # Persist and load
        faiss_db.save_local("./faiss_demo")
        # NOTE(review): the deserialization opt-in is acceptable here because
        # the folder was written a line above; do not reuse this call for
        # index folders from untrusted sources.
        faiss_loaded = FAISS.load_local(
            "./faiss_demo",
            embeddings,
            allow_dangerous_deserialization=True,
        )
        reloaded_hits = faiss_loaded.similarity_search("semantic", k=2)
        print("FAISS loaded; test query count:", len(reloaded_hits))
    except Exception as e:
        print("FAISS error:", e)

## 5) Retriever Strategies: Similarity, MMR, Threshold (Chroma)

In [None]:
# Retrievers stay None unless they are successfully created below; later
# cells check for None before using them.
sim_retriever = mmr_retriever = thr_retriever = None

if vectordb is None:
    print("Chroma not available; skipping retriever demos.")
else:
    # 1) Plain similarity search.
    try:
        sim_retriever = vectordb.as_retriever(search_kwargs={"k": 4})
        sim_docs = sim_retriever.get_relevant_documents("What does LangChain provide?")
        print("Similarity retriever results:")
        for doc in sim_docs:
            print("-", doc.page_content)
    except Exception as e:
        print("Similarity retriever error:", e)

    # 2) MMR search.
    try:
        mmr_kwargs = {"k": 5, "lambda_mult": 0.5}
        mmr_retriever = vectordb.as_retriever(search_type="mmr", search_kwargs=mmr_kwargs)
        mmr_docs = mmr_retriever.get_relevant_documents("semantic search and vectors")
        print("\nMMR retriever results:")
        for doc in mmr_docs:
            print("-", doc.page_content)
    except Exception as e:
        print("MMR retriever error:", e)

    # 3) Score-threshold search; only the number of results is reported.
    try:
        thr_kwargs = {"score_threshold": 0.2, "k": 8}
        thr_retriever = vectordb.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs=thr_kwargs,
        )
        thr_docs = thr_retriever.get_relevant_documents("obscure topic that might not exist")
        print("\nThreshold retriever results (may be empty):", len(thr_docs))
    except Exception as e:
        print("Threshold retriever error:", e)

## 6) Hybrid Search: BM25 + Vectors via RRF (Reciprocal Rank Fusion)

In [None]:
def rrf(ranks: List[int], k: int = 60) -> float:
    """Return the Reciprocal Rank Fusion score for one item.

    Args:
        ranks: The item's rank in each result list it appears in.
        k: Constant added to every rank before taking the reciprocal.

    Returns:
        The sum of ``1 / (k + rank)`` over all given ranks.
    """
    contributions = [1.0 / (k + rank) for rank in ranks]
    return sum(contributions)

# Build the BM25 index up front. The retriever class can be importable while
# its backend is not, so a build failure disables hybrid search with a
# message instead of raising.
bm25 = None
if bm25_available and (vectordb is not None):
    try:
        # Index the Document objects (not bare texts) so keyword-only hits
        # keep their metadata.
        bm25 = BM25Retriever.from_documents(docs)
        bm25.k = 4
    except Exception as e:
        print("BM25 index build failed (is `rank_bm25` installed?):", e)

hybrid_enabled = bm25 is not None

if hybrid_enabled:

    def hybrid_search(query: str, top_k: int = 5):
        """Fuse BM25 and vector results with Reciprocal Rank Fusion.

        Args:
            query: Search text sent to both retrievers.
            top_k: Number of vector hits requested and maximum number of
                fused documents returned.

        Returns:
            Up to ``top_k`` documents ordered by descending RRF score. Ties
            keep first-seen order (BM25 hits first, then vector hits), so
            the result is deterministic across runs.
        """
        bm25_hits = bm25.get_relevant_documents(query)
        vec_hits = [d for d, _ in vectordb.similarity_search_with_score(query, k=top_k)]

        # Key on the passage text itself rather than on hash(text): no
        # collision risk, and ordering does not depend on string hashing.
        rank_map = defaultdict(list)
        doc_by_text = {}
        for hits in (bm25_hits, vec_hits):
            seen_in_list = set()
            for d in hits:
                content = d.page_content
                if content in seen_in_list:
                    # Each ranker contributes at most one rank per passage;
                    # a duplicate stored in the index must not add score.
                    continue
                seen_in_list.add(content)
                rank_map[content].append(len(seen_in_list))  # 1-based rank
                doc_by_text[content] = d

        # sorted() is stable, so equal scores keep insertion order.
        ranked = sorted(rank_map, key=lambda content: rrf(rank_map[content]), reverse=True)
        return [doc_by_text[content] for content in ranked[:top_k]]

    combo = hybrid_search("semantic search")
    print("Hybrid search results:")
    for d in combo:
        print("-", d.page_content)
else:
    print("Hybrid search disabled (BM25 or Chroma missing).")

## 7) Minimal RAG Chain with Citations (if LLM available)

In [None]:
# Chat model for the RAG step; stays None when the OpenAI integration is
# missing or the client cannot be constructed.
llm = None
if openai_available:
    try:
        llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
    except Exception as e:
        print("ChatOpenAI unavailable:", e)

# Prompt that tells the model to answer only from the supplied context.
# The template mixes literal line breaks with `\n` escapes; the string is not
# raw, so both end up as newlines in the rendered prompt.
rag_prompt = PromptTemplate(
    template=(
        """You are a helpful assistant. Answer only using the provided context. 
If the answer is not in the context, say 'I don't know'.

Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"""
    ),
    input_variables=["context", "question"],
)

def answer(question: str, retriever=None) -> Dict[str, Any]:
    """Answer *question* from retrieved context and report the sources used.

    Args:
        question: The user question; also used as the retrieval query.
        retriever: Object exposing ``get_relevant_documents(query)``. When
            omitted, the module-level ``sim_retriever`` is looked up at call
            time, so re-building that retriever takes effect without
            redefining this function.

    Returns:
        ``{"error": ...}`` when no retriever is available;
        ``{"context", "citations", "note"}`` when no LLM is configured;
        otherwise ``{"answer", "citations"}``. ``citations`` is the list of
        metadata dicts of the retrieved documents.
    """
    if retriever is None:
        # Late binding: a default bound in the signature would freeze
        # whatever value `sim_retriever` had when this cell first ran.
        retriever = sim_retriever
    if retriever is None:
        return {"error": "retriever not available"}

    ctx_docs = retriever.get_relevant_documents(question)
    context = "\n\n".join(d.page_content for d in ctx_docs)
    citations = [d.metadata for d in ctx_docs]

    if llm is None:
        return {
            "context": context,
            "citations": citations,
            "note": "LLM unavailable; showing retrieved context only.",
        }

    # Pipe the prompt into the model directly instead of going through
    # LLMChain.run, so this function needs no chain class.
    reply = (rag_prompt | llm).invoke({"context": context, "question": question})
    # Chat models return a message object; plain-text models return a string.
    return {"answer": getattr(reply, "content", reply), "citations": citations}

# Smoke test: prints whichever dict answer() returns (error, context-only, or
# answer with citations).
print(answer("What components does LangChain provide?"))

## 8) Quick Retrieval Evaluation (Hit Rate@k)

In [None]:
# Tiny gold set for the retrieval check: maps each evaluation question to the
# passage text that a successful retrieval must contain.
gold = {
    "What is LangChain?": "LangChain supports prompts, chains, agents, and memory.",
    "How do we search semantically?": "Vector databases enable semantic search via embeddings.",
}

def hit_rate(retriever, k: int = 3, gold_set=None) -> float:
    """Return Hit Rate@k: the share of questions whose expected passage is retrieved.

    Args:
        retriever: Object exposing ``get_relevant_documents(query)``; ``None``
            yields ``0.0``.
        k: Only the first ``k`` retrieved documents are inspected.
        gold_set: Optional mapping of question -> expected passage text.
            Defaults to the module-level ``gold`` mapping.

    Returns:
        A value in ``[0.0, 1.0]``; ``0.0`` when there is nothing to evaluate.
    """
    if retriever is None:
        return 0.0
    expected = gold if gold_set is None else gold_set
    if not expected:
        # Avoid dividing by zero on an empty gold set.
        return 0.0

    hits = 0
    for question, must_have in expected.items():
        top_docs = retriever.get_relevant_documents(question)[:k]
        wanted = must_have.lower()
        # Match per document, not against the joined text, so an expected
        # passage can never "match" across the boundary of two documents.
        if any(wanted in d.page_content.lower() for d in top_docs):
            hits += 1
    return hits / len(expected)

# Report Hit Rate@3 for the similarity and MMR retrievers; hit_rate returns
# 0.0 for a retriever that is None.
print("Similarity@3:", hit_rate(sim_retriever, k=3))
print("MMR@3:", hit_rate(mmr_retriever, k=3))

## 9) Notes on Persistence and Freshness
- Chroma: with Chroma versions before 0.4, call `persist()` after writes; newer versions persist automatically. In both cases, reopen the index with the same `persist_directory`.
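- FAISS: use `save_local` and `load_local`. Loading unpickles the stored docstore, so pass `allow_dangerous_deserialization=True` only for index folders you created yourself.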
- Use stable document IDs and upsert semantics to avoid duplicates.
- Track content hashes to detect when to rebuild chunks.
- Keep chunk sizes between ~200–1000 tokens; use overlaps when necessary.