# 01 — Ingest + Chunk + Index

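Step-by-step build of a TF-IDF index: load the `.txt` documents from `data/docs`, split each one into chunks, and fit a TF-IDF vectorizer on the chunk texts.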

In [None]:
from pathlib import Path
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Load every regular *.txt file under data/docs as (file name, text) pairs.
# glob() yields paths in arbitrary order, so the paths are sorted to keep the
# document order (and hence the row order of the index built from it)
# reproducible; is_file() skips directories whose name happens to match.
# Undecodable bytes are dropped (errors="ignore") rather than raising.
docs = [
    (p.name, p.read_text(encoding="utf-8", errors="ignore"))
    for p in sorted(Path("data/docs").glob("*.txt"))
    if p.is_file()
]
docs


In [None]:
def chunk_text(text: str, max_chars: int = 700, overlap: int = 80) -> list:
    """Split *text* into chunks of at most ``max_chars`` characters.

    The text is split into paragraphs on blank lines. Consecutive paragraphs
    are packed into one chunk (joined by a blank line) for as long as they
    fit. A paragraph longer than ``max_chars`` is cut into fixed-size windows,
    each starting ``overlap`` characters before the previous window ended; the
    final, short-enough remainder may be packed with following paragraphs.

    Args:
        text: Raw document text.
        max_chars: Maximum length of a chunk; must be positive.
        overlap: Number of characters shared by consecutive windows of an
            oversized paragraph; must satisfy ``0 <= overlap < max_chars``.

    Returns:
        The chunks as a list of strings in document order; an empty list when
        *text* is empty or whitespace-only.

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap`` is outside
            ``[0, max_chars)``. An overlap that is not smaller than the window
            would keep the window from advancing (endless loop); a negative
            one would silently skip text.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if not 0 <= overlap < max_chars:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_chars, "
            f"got overlap={overlap}, max_chars={max_chars}"
        )

    # Distance the window advances over an oversized paragraph; > 0 by the
    # validation above, so the splitting loop always terminates.
    step = max_chars - overlap

    parts = re.split(r"\n\s*\n", text.strip())
    chunks, buf = [], ""
    for part in parts:
        part = part.strip()
        if not part:
            continue
        # +2 accounts for the "\n\n" separator inserted between paragraphs.
        if len(buf) + len(part) + 2 <= max_chars:
            buf = (buf + "\n\n" + part).strip()
        else:
            # The paragraph does not fit: flush what has been packed so far.
            if buf:
                chunks.append(buf)
            # Cut an oversized paragraph into overlapping windows.
            while len(part) > max_chars:
                chunks.append(part[:max_chars])
                part = part[step:]
            # The (now short enough) remainder starts the next chunk.
            buf = part
    if buf:
        chunks.append(buf)
    return chunks

# Flatten all documents into one list of chunk records. Each record keeps the
# source file name, an id of the form "<doc_id>::chunk<i>" (i is the chunk's
# position within its document), and the chunk text.
chunks = [
    {"doc_id": doc_id, "chunk_id": f"{doc_id}::chunk{i}", "text": ch}
    for doc_id, text in docs
    for i, ch in enumerate(chunk_text(text))
]

# Preview: number of chunks and the first record. Falls back to None when no
# chunks were produced so an empty corpus does not raise IndexError here.
len(chunks), (chunks[0] if chunks else None)


In [None]:
# Configure the vectorizer: English stop words removed, unigrams and bigrams
# as features, and no minimum document-frequency cut-off (min_df=1).
vec = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=1,
)
# Learn the vocabulary from the chunk texts and build the TF-IDF matrix in
# one pass; X has one row per entry of `chunks`, in the same order.
X = vec.fit_transform([record["text"] for record in chunks])
X.shape
