# 01 — Data Exploration & Ingestion Sanity Checks

This notebook walks through the CertiRAG ingestion pipeline and explores the
resulting chunk / span structures. Use it to:

1. Load documents and inspect chunking behaviour.
2. Visualise span boundaries inside each chunk.
3. Verify BM25 + dense index construction.
4. Spot-check retrieval quality on sample queries.
5. Confirm that chunks survive a JSON serialisation round-trip.

In [None]:
import sys, os
sys.path.insert(0, os.path.abspath(".."))

from certirag.config import CertiRAGConfig, ExecutionMode
from certirag.ingest.chunker import DocumentChunker
from certirag.ingest.indexer import BM25Index, DenseIndex, ChunkStore
from certirag.schemas.evidence import EvidenceChunk, EvidenceSpan
from certirag.utils import set_all_seeds

import textwrap, json
from pprint import pprint

# Seed before anything else runs. Presumably set_all_seeds seeds every RNG the
# project relies on — TODO confirm against certirag.utils.
set_all_seeds(42)
# Single config object reused by the chunking cell below; built in LITE execution mode.
cfg = CertiRAGConfig(execution_mode=ExecutionMode.LITE)
# Echo the mode and the two chunking settings so the run records what was used.
print(f"Mode: {cfg.execution_mode}  |  Chunk size: {cfg.chunk_size}  |  Overlap: {cfg.chunk_overlap}")

## 1. Chunking a sample document

In [None]:
# Fixed passage used as the only input document in this notebook; .strip()
# removes the leading/trailing newlines introduced by the triple quotes.
SAMPLE_DOC = """
The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris.
It is named after the engineer Gustave Eiffel, whose company designed and built the tower.
Locally nicknamed \"La dame de fer\", it was constructed from 1887 to 1889 as the centrepiece
of the 1889 World's Fair. Although initially criticised by some of France's leading artists
and intellectuals, it has become a global cultural icon of France and one of the most
recognisable structures in the world. The tower is 330 metres tall, about the same height
as an 81-storey building, and the tallest structure in Paris. Its base is square,
measuring 125 metres on each side.
""".strip()

# Chunk the passage with the shared config; `chunks` feeds every later cell.
chunker = DocumentChunker(config=cfg)
chunks = chunker.chunk_document(SAMPLE_DOC, doc_id="eiffel_tower", source="wikipedia")

# Assemble the whole report first, then emit it in one print call.
# Per chunk: id, text length, span count, then one line per span with its
# offsets and the first 60 characters of its sentence.
report_lines = [f"Generated {len(chunks)} chunk(s)\n"]
for i, chunk in enumerate(chunks):
    report_lines.append(f"--- Chunk {i} ({chunk.chunk_id}) ---")
    report_lines.append(f"  Text length : {len(chunk.text)} chars")
    report_lines.append(f"  Spans       : {len(chunk.spans)}")
    report_lines.extend(
        f"    [{span.span_id}] ({span.start}:{span.end}) → {span.sentence[:60]}..."
        for span in chunk.spans
    )
print("\n".join(report_lines))

## 2. Span boundary visualisation

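We wrap each sentence span in the chunk text as `[span_id|text]` and highlight it with ANSI colour codes, cycling through five colours; text that falls outside every span is printed uncoloured.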

In [None]:
# ANSI foreground colour escapes, cycled across consecutive spans.
COLORS = ["\033[92m", "\033[94m", "\033[93m", "\033[95m", "\033[96m"]
# ANSI escape that restores the terminal's default attributes.
RESET = "\033[0m"

def visualise_spans(chunk: "EvidenceChunk") -> None:
    """Print chunk text with coloured span boundaries.

    Spans are processed in ascending ``start`` order. Each span is printed as
    ``[span_id|text]`` wrapped in the next colour from ``COLORS`` (cycling when
    there are more spans than colours). Text lying between spans, or after the
    last one, is printed uncoloured.

    Overlapping or nested spans are each shown in full, but the cursor that
    tracks already-emitted text never moves backwards, so text covered by an
    earlier span is not printed a second time as trailing text.

    Args:
        chunk: Object exposing ``text`` (str) and ``spans`` (iterable of
            objects with ``span_id``, ``start`` and ``end``). The annotation is
            quoted so the definition does not need the class at def time.

    Returns:
        None. The rendered string is written to stdout.
    """
    parts = []
    cursor = 0  # furthest offset of chunk.text that has already been emitted
    for idx, span in enumerate(sorted(chunk.spans, key=lambda s: s.start)):
        if span.start > cursor:
            # Uncovered gap between the previous span(s) and this one.
            parts.append(chunk.text[cursor:span.start])
        colour = COLORS[idx % len(COLORS)]
        parts.append(f"{colour}[{span.span_id}|{chunk.text[span.start:span.end]}]{RESET}")
        # max(): a span nested inside an earlier one must not rewind the cursor,
        # otherwise the tail of the earlier span would be printed again below.
        cursor = max(cursor, span.end)
    if cursor < len(chunk.text):
        parts.append(chunk.text[cursor:])
    print("".join(parts))

# Render every chunk produced by the chunking cell, one banner per chunk.
for chunk in chunks:
    # The leading "\n" leaves a blank line between consecutive chunks.
    print(f"\n=== {chunk.chunk_id} ===")
    visualise_spans(chunk)

## 3. Index construction

In [None]:
# Register every chunk in a ChunkStore so that retrieval hits (returned as
# chunk ids) can be resolved back to their text later on.
store = ChunkStore()
for evidence_chunk in chunks:
    store.add(evidence_chunk)

# Build the BM25 index over the same chunk list.
bm25 = BM25Index()
bm25.build(chunks)

# Report both sizes side by side so a mismatch is easy to spot.
print(
    f"ChunkStore size : {len(store)}",
    f"BM25 corpus size: {bm25.corpus_size}",
    sep="\n",
)

## 4. BM25 retrieval spot-check

In [None]:
# Hand-written questions whose answers all appear in the sample passage.
queries = [
    "How tall is the Eiffel Tower?",
    "Who designed the Eiffel Tower?",
    "When was the tower built?",
]

for q in queries:
    # Run the search before printing the header, then list up to three hits.
    ranked = bm25.search(q, top_k=3)
    print(f"\nQuery: {q}")
    for chunk_id, score in ranked:
        # Resolve the id back to its chunk; a falsy lookup is shown as "<missing>".
        hit = store.get(chunk_id)
        if hit:
            preview = hit.text[:80]
        else:
            preview = "<missing>"
        print(f"  [{score:.3f}] {chunk_id}: {preview}...")

## 5. Schema serialisation round-trip

In [None]:
# Verify that chunks survive JSON serialisation.
# Guard first: with an empty list the loop below would pass vacuously and the
# sample dump at the end would fail with a bare IndexError.
assert chunks, "No chunks to verify — re-run the chunking cell and check its output"

for chunk in chunks:
    serialised = chunk.model_dump_json()
    restored = EvidenceChunk.model_validate_json(serialised)
    # Spot-check the fields the rest of the notebook relies on; each message
    # names the offending chunk so a failure can be located straight away.
    assert restored.chunk_id == chunk.chunk_id, f"chunk_id changed in round-trip: {chunk.chunk_id}"
    assert len(restored.spans) == len(chunk.spans), f"span count changed in round-trip: {chunk.chunk_id}"
    assert restored.text == chunk.text, f"text changed in round-trip: {chunk.chunk_id}"

print("✅ All chunks survived serialisation round-trip")

# Re-indent the first chunk's JSON for readability and show only its first 500 characters.
sample_json = json.dumps(json.loads(chunks[0].model_dump_json()), indent=2)
print(f"\nSample JSON (first chunk):\n{sample_json[:500]}...")