In [1]:
"""
Demonstrate how chunk_size affects retrieval.
NOW includes score interpretation ranges and gap analysis.

Score Ranges:
0.75 – 1.0   -> Very strong semantic match
0.60 – 0.75  -> Good match
0.45 – 0.60  -> Weak/moderate
< 0.40       -> Probably irrelevant
"""

import math
import logging
from collections import Counter

# ---------------- LOGGING ----------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)
log = logging.getLogger("chunking-eval")

# ---------------- SAMPLE DATA ----------------
DOC = (
    "MCP lets models call tools. Tools can be databases, filesystems, or APIs.\n"
    "When you chunk documents poorly, retrieval returns incomplete context.\n"
    "Smaller chunks improve precision but may lose context.\n"
    "Larger chunks preserve context but may reduce recall.\n"
    "Overlap can help continuity between chunks.\n"
)

QUERY = "How does chunk size affect retrieval context?"

# ---------------- SCORE INTERPRETATION ----------------
def interpret_score(score: float) -> str:
    if score >= 0.75:
        return "Very strong semantic match"
    elif score >= 0.60:
        return "Good match"
    elif score >= 0.45:
        return "Weak / moderate match"
    else:
        return "Probably irrelevant"

# ---------------- CHUNKING ----------------
def chunk_text(text: str, chunk_size: int, overlap: int):
    words = text.split()
    chunks = []
    i = 0

    while i < len(words):
        chunk = words[i:i+chunk_size]
        chunks.append(" ".join(chunk))
        i += max(1, chunk_size - overlap)

    return chunks

# ---------------- VECTOR ----------------
def bow_vector(text: str) -> Counter:
    return Counter(w.lower().strip(".,!?") for w in text.split())

def cosine(a: Counter, b: Counter) -> float:
    dot = sum(a[k] * b.get(k, 0) for k in a)
    na = math.sqrt(sum(v*v for v in a.values()))
    nb = math.sqrt(sum(v*v for v in b.values()))
    return dot / (na * nb + 1e-9)

# ---------------- RETRIEVAL ----------------
def top_chunks(chunks, query, k=3):
    qv = bow_vector(query)
    scored = []

    for idx, c in enumerate(chunks):
        cv = bow_vector(c)
        score = cosine(cv, qv)
        scored.append((score, idx + 1, c))

    scored.sort(reverse=True, key=lambda x: x[0])
    return scored[:k]

# ---------------- RUN EXPERIMENT ----------------
def run(chunk_size: int, overlap: int):
    print("\n" + "=" * 70)
    print(f"chunk_size={chunk_size}, overlap={overlap}")

    chunks = chunk_text(DOC, chunk_size, overlap)
    print(f"Total chunks created: {len(chunks)}")

    hits = top_chunks(chunks, QUERY, k=3)

    print("\nTop Matches:")
    for score, idx, chunk in hits:
        interpretation = interpret_score(score)
        print(f"\nChunk #{idx}")
        print(f"Score: {score:.3f}")
        print(f"Interpretation: {interpretation}")
        print(f"Text: {chunk}")

    # Score gap analysis
    if len(hits) >= 2:
        gap = hits[0][0] - hits[1][0]
        print(f"\nScore Gap (Top1 - Top2): {gap:.3f}")
        if gap > 0.15:
            print("→ Strong separation (good chunk discrimination)")
        elif gap > 0.05:
            print("→ Moderate separation")
        else:
            print("→ Weak separation (chunking may need tuning)")

# ---------------- MAIN ----------------
def main():
    run(chunk_size=10, overlap=2)
    run(chunk_size=25, overlap=5)
    run(chunk_size=40, overlap=10)

if __name__ == "__main__":
    main()


chunk_size=10, overlap=2
Total chunks created: 6

Top Matches:

Chunk #2
Score: 0.239
Interpretation: Probably irrelevant
Text: databases, filesystems, or APIs. When you chunk documents poorly, retrieval

Chunk #3
Score: 0.239
Interpretation: Probably irrelevant
Text: poorly, retrieval returns incomplete context. Smaller chunks improve precision but

Chunk #4
Score: 0.202
Interpretation: Probably irrelevant
Text: precision but may lose context. Larger chunks preserve context but

Score Gap (Top1 - Top2): 0.000
→ Weak separation (chunking may need tuning)

chunk_size=25, overlap=5
Total chunks created: 3

Top Matches:

Chunk #1
Score: 0.218
Interpretation: Probably irrelevant
Text: MCP lets models call tools. Tools can be databases, filesystems, or APIs. When you chunk documents poorly, retrieval returns incomplete context. Smaller chunks improve precision

Chunk #2
Score: 0.182
Interpretation: Probably irrelevant
Text: context. Smaller chunks improve precision but may lose context. La