# 02_vector_db_retrieval.ipynb

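**Goal:** Pure vector-DB retrieval: embed text chunks, index them with FAISS, and retrieve them by k-nearest-neighbour (KNN) search. Ingests PDFs plus Markdown/plain-text (`.md`/`.txt`) notes.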

In [None]:
# %pip install pdfplumber sentence-transformers faiss-cpu numpy pandas
from pathlib import Path
import pdfplumber, re, unicodedata, json, numpy as np, faiss, pandas as pd
from sentence_transformers import SentenceTransformer

# Directories used by this notebook: two input folders and one output folder.
DATA = Path("data")
KNOW = Path("knowledge")
ART = Path("artifacts")
ART.mkdir(exist_ok=True)  # create the output folder; no error if it is already there

def normalize_text(t: str) -> str:
    """Return *t* cleaned up for chunking.

    Steps, in order: NFKC Unicode normalization, non-breaking spaces turned
    into plain spaces, a hyphen directly followed by a newline removed (so a
    word split across a line break is joined back together), and runs of
    spaces/tabs collapsed to a single space. Newlines are otherwise kept.
    """
    canonical = unicodedata.normalize("NFKC", t).replace("\xa0", " ")
    rejoined = re.sub(r"-\n", "", canonical)
    return re.sub(r"[ \t]+", " ", rejoined)

def extract_pdf_text(p: Path) -> str:
    """Return the text of every page of the PDF at *p*, joined by blank lines.

    Pages for which ``extract_text()`` gives a falsy result (e.g. ``None``)
    contribute an empty string, so the page count is preserved in the output.
    """
    page_texts = []
    with pdfplumber.open(str(p)) as pdf:
        for page in pdf.pages:
            page_texts.append(page.extract_text() or "")
    return "\n\n".join(page_texts)

def chunk_words(text: str, size=900, overlap=150):
    """Yield overlapping chunks of whitespace-separated words from *text*.

    Args:
        text: Input text; it is split on any whitespace.
        size: Maximum number of words per chunk. Must be at least 1.
        overlap: Number of words shared between consecutive chunks. When
            ``overlap >= size`` the window still advances by one word.

    Yields:
        Each chunk as a single space-joined string. Empty or whitespace-only
        text yields nothing.

    Raises:
        ValueError: If ``size`` is less than 1. Because this is a generator,
            the error surfaces when iteration starts.
    """
    if size < 1:
        raise ValueError(f"size must be >= 1, got {size}")
    toks = text.split()
    step = max(1, size - overlap)
    for i in range(0, len(toks), step):
        yield " ".join(toks[i:i + size])
        # Once a window has reached the last word, any further window would be
        # a strict subset of this one (pure overlap) - stop instead of emitting
        # redundant tail fragments.
        if i + size >= len(toks):
            break

def _chunk_records(source: str, text: str) -> list:
    """Split *text* with ``chunk_words`` and wrap each chunk in a doc record.

    Each record has an ``id`` of the form ``"<source>#c<i>"``, the chunk
    ``text``, and ``meta`` holding the source name and chunk number.
    """
    return [
        {"id": f"{source}#c{i}", "text": c, "meta": {"source": source, "chunk": i}}
        for i, c in enumerate(chunk_words(text))
    ]


# All chunk records, PDFs first and then notes, each group in sorted path order.
docs = []

for p in sorted(DATA.glob("*.pdf")):
    docs.extend(_chunk_records(p.name, normalize_text(extract_pdf_text(p))))

for p in sorted(list(KNOW.glob("*.md")) + list(KNOW.glob("*.txt"))):
    # Decode explicitly as UTF-8: the default for read_text() depends on the
    # platform locale, which makes ingestion of the same notes non-portable.
    docs.extend(_chunk_records(p.name, normalize_text(p.read_text(encoding="utf-8"))))

print("Total chunks:", len(docs))

In [None]:
model_name = "all-MiniLM-L6-v2"

# Stop here with a clear message when nothing was ingested; otherwise the cell
# would fail further down (reading the embedding dimension) with a far less
# helpful error.
if not docs:
    raise RuntimeError(
        "No chunks to index: add PDFs under 'data/' or .md/.txt notes under 'knowledge/'."
    )

embedder = SentenceTransformer(model_name)

# One embedding row per chunk, in the same order as `docs`, so a FAISS row id
# can be used directly as an index into `docs`. Embeddings are requested
# normalized; the index below scores by inner product.
X = np.ascontiguousarray(
    embedder.encode([d["text"] for d in docs], normalize_embeddings=True),
    dtype=np.float32,
)
index = faiss.IndexFlatIP(X.shape[1])
index.add(X)

# Persist everything under the ART directory: raw embeddings, the FAISS index,
# the chunk records (one JSON object per line) and the embedding settings.
np.save(ART / "embeddings.npy", X)
faiss.write_index(index, str(ART / "embeddings.faiss"))
with open(ART / "docs.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(d) + "\n" for d in docs)
with open(ART / "embed_model.json", "w", encoding="utf-8") as f:
    json.dump({"model": model_name, "normalized": True}, f)

print("Indexed:", len(docs), "dim:", X.shape[1])

In [None]:
# KNN retrieval demo
def retrieve(query: str, k=6):
    """Return up to *k* indexed chunks that score highest against *query*.

    Args:
        query: Free-text query; it is embedded with the same ``embedder`` and
            normalization setting used to build ``index``.
        k: Maximum number of hits wanted. It is capped at the number of
            vectors in the index.

    Returns:
        A list of ``{"score": float, "doc": <record from docs>}`` dicts in the
        order FAISS returns them. The list is empty when ``k <= 0`` or the
        index holds no vectors.
    """
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with label -1, and docs[-1] would silently return the last chunk.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    q = embedder.encode([query], normalize_embeddings=True).astype("float32")
    D, I = index.search(q, k)
    return [
        {"score": float(score), "doc": docs[i]}
        for score, i in zip(D[0], I[0])
        if i >= 0  # defensive: skip any padding label that still slips through
    ]

# Run a few sample queries and print the top three hits for each one.
sample_queries = ["KeyError target", "baseline models", "logistic regression"]
for q in sample_queries:
    print("\n=== QUERY:", q, "===")
    for hit in retrieve(q, k=3):
        meta = hit["doc"]["meta"]
        print(f"score={hit['score']:.3f}  source={meta['source']}  chunk={meta['chunk']}")