# 🧪 Minimal Agentic Chunking Notebook (Module-by-Module)

This lightweight notebook lets you test each step **individually**:
1) Extract blocks → 2) Build outline → 3) Plan chunks → 4) Synthesize chunks → 5) Embeddings → 6) Visualize bbox.

> Works **with** Azure OpenAI or **without** (falls back to simple heuristics).


## 1) Install dependencies

In [None]:
!pip install -q pymupdf openai>=1.50.0 pydantic matplotlib numpy

## 2) Configure environment

In [None]:
import os
# Seed the Azure OpenAI settings with defaults. setdefault() keeps any value
# that is already present in the environment, so real credentials exported
# before launching the notebook are never overwritten.
for _name, _default in (
    ("AZURE_OPENAI_API_KEY", ""),
    ("AZURE_OPENAI_ENDPOINT", "https://<your-resource>.openai.azure.com"),
    ("AZURE_OPENAI_API_VERSION", "2024-08-01-preview"),
    ("AZURE_OPENAI_CHAT_DEPLOYMENT", "gpt-4o-mini"),
    ("AZURE_OPENAI_EMB_DEPLOYMENT", "text-embedding-3-large"),
):
    os.environ.setdefault(_name, _default)

# Echo the endpoint in effect so a wrong resource is spotted early.
print("Endpoint:", os.getenv("AZURE_OPENAI_ENDPOINT"))

## 3) Imports & basic types

In [None]:
import re, json, hashlib, textwrap
from dataclasses import dataclass, asdict
from typing import Any, Dict, List, Optional, Tuple

import fitz  # PyMuPDF
import numpy as np
import matplotlib.pyplot as plt

try:
    from openai import AzureOpenAI
    _openai_sdk = True
except Exception as e:
    print("OpenAI SDK import error:", e)
    _openai_sdk = False

from pydantic import BaseModel, Field

# Resolve the connection settings once; every later cell reads these names.
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "")
AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION", "2024-08-01-preview")
CHAT_DEPLOYMENT = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT", "gpt-4o-mini")
EMB_DEPLOYMENT = os.environ.get("AZURE_OPENAI_EMB_DEPLOYMENT", "text-embedding-3-large")

# `client` stays None unless the SDK imported AND both key and endpoint are
# non-empty; the later modules check `client is None` to pick their
# heuristic fallbacks.
client = None
if all((_openai_sdk, AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT)):
    try:
        client = AzureOpenAI(
            api_key=AZURE_OPENAI_API_KEY,
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
            api_version=AZURE_OPENAI_API_VERSION,
        )
    except Exception as e:
        # Best effort: report the problem and continue without a client.
        print("Could not init Azure client:", e)
    else:
        print("Azure OpenAI client ready.")

@dataclass
class Block:
    """One text block extracted from a PDF page (produced by `extract_blocks`)."""

    # Page number; extract_blocks stores it 1-based (page index + 1).
    page: int
    # Text of the block, stripped of leading/trailing whitespace.
    text: str
    # Block bounding box (x0, y0, x1, y1) as reported by PyMuPDF
    # (presumably in PDF points, same units as page_size -- TODO confirm).
    bbox_points: Tuple[float, float, float, float]
    # (width, height) of the page the block was found on.
    page_size: Tuple[float, float]
    # Label assigned by classify_kind: "heading", "list" or "para".
    kind: str
    # Largest span font size seen inside the block.
    max_font: float
    # Defaults to False; nothing in this notebook sets it to True.
    is_ocr: bool = False

@dataclass
class Chunk:
    """A retrieval-ready chunk assembled from consecutive blocks (see `synthesize_chunks`)."""

    # Short stable identifier produced by hash_id.
    id: str
    # Heading texts locating the chunk; ["Document"] when none are known.
    section_path: List[str]
    # Role label copied from the plan item (the heuristic planner uses "other").
    role: str
    # Block texts of the chunk joined with newlines.
    text: str
    # Short summary and keyword list from summarize_and_keywords.
    summary: str
    keywords: List[str]
    # Smallest / largest 1-based page number covered by the chunk.
    page_start: int
    page_end: int
    # One [x0, y0, x1, y1] entry per block, each scaled by that block's page size.
    bboxes_norm: List[List[float]]
    # {"width": ..., "height": ...} taken from the chunk's first block.
    page_size: Dict[str, float]
    # Path of the source PDF as passed to synthesize_chunks.
    source_pdf: str
    # Anchor string of the form "page=<page_start>".
    source_anchor: str
    # Extra data; synthesize_chunks stores "title" and "pages" (one page per block).
    metadata: Dict[str, Any]
    # Filled in later by the embeddings cell; None until then.
    embedding: Optional[List[float]] = None

# Single-line heading pattern: optional leading numbering ("2", "1.2)",
# "3.1 -", ...) followed by an uppercase ASCII letter and at most 80 further
# characters on the same line.
HEADING_RX = re.compile(r"^\s*(\d+(\.\d+)*[\).\s-]+)?[A-Z].{0,80}$")

def classify_kind(txt: str, max_font: float) -> str:
    """Label a block as "heading", "list" or "para".

    Args:
        txt: Block text; None or blank text is labelled "para".
        max_font: Largest font size found in the block.

    Returns:
        "heading" for text shorter than 120 characters that either matches
        HEADING_RX or has max_font of at least 14.5; otherwise "list" when
        the text starts with "-", "*" or "•"; otherwise "para".
    """
    stripped = (txt or "").strip()
    if stripped == "":
        return "para"

    # The heading test runs first, so a short bullet line set in a large
    # font is still reported as a heading.
    if len(stripped) < 120:
        if HEADING_RX.match(stripped) or max_font >= 14.5:
            return "heading"

    bullet_markers = ("-", "*", "•")
    if stripped.startswith(bullet_markers):
        return "list"
    return "para"

def norm_bbox(bbox, w, h):
    """Scale a bounding box by the page dimensions.

    Args:
        bbox: Four values (x0, y0, x1, y1).
        w: Page width used to divide the x coordinates.
        h: Page height used to divide the y coordinates.

    Returns:
        A list [x0 / w, y0 / h, x1 / w, y1 / h].
    """
    left, top, right, bottom = bbox
    return [
        left / w,
        top / h,
        right / w,
        bottom / h,
    ]

def hash_id(*parts: str) -> str:
    """Derive a 16-hex-character identifier from the given strings.

    Each part is cut to its first 512 characters, the parts are joined with
    "|", and the first 16 hex digits of the SHA-256 digest are returned.
    """
    clipped = [part[:512] for part in parts]
    payload = "|".join(clipped).encode()
    digest = hashlib.sha256(payload).hexdigest()
    return digest[:16]

## 4) Module A — Extract blocks

In [None]:
PDF_PATH = "./your.pdf"  # 👈 change to your file path

def extract_blocks(pdf_path: str) -> List[Block]:
    """Read every text block of a PDF into a list of `Block` records.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        One `Block` per non-empty text block, in page order and, within a
        page, in the order PyMuPDF reports the blocks. Page numbers are
        1-based. Blocks without a "lines" entry (e.g. image blocks) and
        blocks whose text is blank are skipped.
    """
    out: List[Block] = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for pno, page in enumerate(doc, start=1):
            w, h = page.rect.width, page.rect.height
            # "dict" output is required here: its spans carry a "text" key,
            # whereas "rawdict" spans only carry per-character "chars", which
            # would leave every block empty.
            raw = page.get_text("dict")
            if not raw or not raw.get("blocks"):
                continue
            for b in raw["blocks"]:
                if "lines" not in b:
                    continue
                max_font = 0.0
                line_texts = []
                for line in b["lines"]:
                    spans = line.get("spans", [])
                    # Spans of one line are pieces of the same visual line
                    # (font/style changes), so they are concatenated directly;
                    # only distinct lines are separated by a newline.
                    line_text = "".join(span.get("text", "") for span in spans)
                    if line_text:
                        line_texts.append(line_text)
                    for span in spans:
                        try:
                            max_font = max(max_font, float(span.get("size", 0)))
                        except (TypeError, ValueError):
                            # A missing or non-numeric size just does not
                            # contribute to the maximum.
                            continue
                txt = "\n".join(line_texts).strip()
                if not txt:
                    continue
                out.append(
                    Block(
                        page=pno,
                        text=txt,
                        bbox_points=tuple(b["bbox"]),
                        page_size=(w, h),
                        kind=classify_kind(txt, max_font),
                        max_font=max_font,
                    )
                )
    return out

# Run Module A on the configured PDF; later cells read the global `blocks`.
blocks = extract_blocks(PDF_PATH)
print(f"Extracted blocks: {len(blocks)}")
if blocks:
    # Sanity check: page and kind of the first block plus up to 300 characters of its text.
    print("Sample -> page:", blocks[0].page, "| kind:", blocks[0].kind)
    print(blocks[0].text[:300], "...")

## 5) Module B — Build outline

In [None]:
def build_outline(blocks: List[Block]) -> List[Dict[str, Any]]:
    """Attach a running heading trail to every block.

    Args:
        blocks: Blocks in reading order.

    Returns:
        One dict per input block with keys "path" (the texts of the most
        recent headings, at most six, including the block itself when it is
        a heading) and "block" (the block object).
    """
    entries: List[Dict[str, Any]] = []
    trail: List[str] = []
    for blk in blocks:
        if blk.kind == "heading":
            # Keep only the six most recent headings.
            trail = (trail + [blk.text])[-6:]
        # Store a copy so entries never share one list object.
        entries.append({"path": list(trail), "block": blk})
    return entries

# Run Module B; later cells read the global `outline`.
outline = build_outline(blocks)
print("Outline entries:", len(outline))
if outline:
    # Heading trail of the first entry (empty when the first block is not a heading).
    print("First path:", " > ".join(outline[0]["path"]))

## 6) Module C — Plan chunks (LLM or fallback)

In [None]:
from pydantic import BaseModel, Field

class PlanItem(BaseModel):
    """One planned chunk: an inclusive range of outline indices plus labels."""

    # Index of the first outline entry in the chunk (must be >= 0).
    start_index: int = Field(..., ge=0)
    # Index of the last outline entry in the chunk, inclusive (must be >= 0).
    end_index: int   = Field(..., ge=0)
    # Role label for the chunk; the heuristic planner always uses "other".
    role: str        = Field(...)
    # Short title for the chunk.
    title: str       = Field(...)

def _heuristic_plan(outline: List[Dict[str, Any]], first: int = 0) -> List[PlanItem]:
    """Plan chunks for outline[first:] by cutting at headings once >1500 chars accumulate."""
    items: List[PlanItem] = []
    if first >= len(outline):
        return items
    start, acc_len = first, 0
    title = outline[start]["block"].text[:60]
    for i in range(first, len(outline)):
        block = outline[i]["block"]
        acc_len += len(block.text)
        if block.kind == "heading" and i > start and acc_len > 1500:
            items.append(PlanItem(start_index=start, end_index=i - 1, role="other", title=title))
            # The heading opens the next chunk and counts towards its length.
            start, acc_len = i, len(block.text)
            title = block.text[:60]
    items.append(PlanItem(start_index=start, end_index=len(outline) - 1, role="other", title=title))
    return items


def plan_chunks(outline: List[Dict[str, Any]], max_preview: int = 900) -> List[PlanItem]:
    """Decide chunk boundaries over the outline.

    Without an Azure client the plan is purely heuristic: a new chunk starts
    at a heading once more than 1500 characters have accumulated. With a
    client, the first `max_preview` entries are sent to the chat model, which
    proposes the boundaries; any entries beyond the preview are planned
    heuristically so no part of the document is silently dropped.

    Args:
        outline: Entries as produced by `build_outline` (dicts with a "block").
        max_preview: Maximum number of outline entries shown to the model.

    Returns:
        Plan items whose `start_index`/`end_index` are inclusive outline
        indices. An empty outline yields an empty list.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
        pydantic.ValidationError: If a returned item lacks required fields.
    """
    if not outline:
        return []

    preview = [
        {"i": i, "page": row["block"].page, "kind": row["block"].kind, "text": row["block"].text[:350]}
        for i, row in enumerate(outline[: max(0, max_preview)])
    ]
    if client is None or not preview:
        return _heuristic_plan(outline)

    system = "You are a precise editor. Emit coherent chunk boundaries for the sequence of blocks."
    user = {
        "rules": [
            "Merge short paragraphs",
            "Prefer ~1.5–2.5k chars per chunk",
            # JSON mode always yields an object, so the expected key is
            # spelled out instead of asking for a bare list.
            'Return a JSON object {"chunks": [{start_index,end_index,role,title}, ...]}',
            "start_index and end_index are inclusive values of i from the preview",
        ],
        "preview": preview,
    }
    res = client.chat.completions.create(
        model=CHAT_DEPLOYMENT, temperature=0,
        response_format={"type": "json_object"},
        messages=[{"role": "system", "content": system}, {"role": "user", "content": json.dumps(user)}],
    )
    # The message content can be None (e.g. a filtered reply); treat as empty.
    data = json.loads(res.choices[0].message.content or "{}")
    if isinstance(data, dict):
        items = data.get("chunks") or data.get("results") or []
    elif isinstance(data, list):
        items = data
    else:
        items = []

    last = len(preview) - 1
    plan: List[PlanItem] = []
    for raw_item in items:
        item = PlanItem(**raw_item)
        # Clamp to the previewed range and drop empty or inverted ranges so
        # downstream slicing never produces an empty chunk.
        end = min(item.end_index, last)
        if item.start_index > end:
            continue
        plan.append(PlanItem(start_index=item.start_index, end_index=end, role=item.role, title=item.title))

    if len(outline) > len(preview):
        plan.extend(_heuristic_plan(outline, first=len(preview)))
    return plan

# Run Module C; later cells read the global `plan`.
plan = plan_chunks(outline)
print("Planned chunks:", len(plan))
if plan:
    # model_dump() renders the first plan item as a plain dict.
    print("First plan:", plan[0].model_dump())

## 7) Module D — Synthesize chunks (summary & keywords)

In [None]:
def summarize_and_keywords(text: str):
    """Produce a short summary and a keyword list for a chunk of text.

    Without an Azure client the summary is the text up to the first ". "
    (at most 400 characters) and the keywords are the first six distinct
    lowercase words of 3-15 characters, in order of appearance. With a
    client, the chat model is asked for a 1-2 sentence summary followed by
    a "Keywords:" line, and the reply is parsed.

    Args:
        text: Chunk text; None or blank text yields ("", []).

    Returns:
        A tuple (summary, keywords) where keywords is a list of strings.
    """
    if not text or not text.strip():
        # Nothing to summarise; also avoids a pointless model call.
        return "", []

    if client is None:
        summary = text.strip().split(". ")[0][:400]
        # Punctuation is stripped before the length filter, and dict keys
        # keep first-appearance order, so the result is the same on every
        # run (a set would vary with string hash randomisation).
        tokens = (w.strip(',.;:()[]{}').lower() for w in text.split())
        ordered = dict.fromkeys(tok for tok in tokens if 3 <= len(tok) <= 15)
        return summary, list(ordered)[:6]

    prompt = f"""Summarize in 1–2 sentences. Then a line 'Keywords:' with 3–8 terse keywords.\n\n{textwrap.shorten(text, width=4800, placeholder=' ...')}"""
    r = client.chat.completions.create(model=CHAT_DEPLOYMENT, temperature=0,
        messages=[{"role":"system","content":"Be concise."},{"role":"user","content":prompt}]
    ).choices[0].message.content or ""
    lines = [ln.strip() for ln in r.splitlines() if ln.strip()]

    # The summary is the first line that is not the keywords line, so a
    # reply consisting only of "Keywords: ..." does not leak into it.
    summary = next((ln for ln in lines if not ln.lower().startswith("keywords:")), "")[:500]

    kws = []
    # Scan from the end: the keywords line is requested after the summary.
    for ln in reversed(lines):
        if ln.lower().startswith("keywords:"):
            kws = [k.strip().strip(',.;') for k in ln.split(':', 1)[1].split(',') if k.strip()]
            break
    return summary, kws

def synthesize_chunks(pdf_path: str, outline: List[Dict[str, Any]], plan: List[PlanItem]) -> List[Chunk]:
    """Materialise planned chunks into `Chunk` records.

    Args:
        pdf_path: Source PDF path, recorded on every chunk and mixed into its id.
        outline: Entries from `build_outline` (dicts with "block" and "path").
        plan: Plan items with inclusive `start_index`/`end_index` into `outline`.

    Returns:
        One `Chunk` per plan item that selects at least one outline entry.
        Plan items whose range is empty or out of bounds are skipped, so no
        empty chunk is created and no summary is requested for them.
    """
    chunks: List[Chunk] = []
    for item in plan:
        seg = outline[item.start_index : item.end_index + 1]
        if not seg:
            continue

        seg_blocks = [row["block"] for row in seg]
        pages = [b.page for b in seg_blocks]
        # Each box is scaled by the size of its own page.
        bboxes = [norm_bbox(b.bbox_points, *b.page_size) for b in seg_blocks]
        content = "\n".join(b.text for b in seg_blocks).strip()

        # Use the heading trail recorded in the outline: it contains the
        # headings inside the chunk plus those preceding it, so a chunk
        # without a heading of its own still knows its section. Entries
        # lacking "path" fall back to the headings found in the segment.
        path = seg[-1].get("path") or [b.text for b in seg_blocks if b.kind == "heading"]

        page_start, page_end = min(pages), max(pages)
        ref_w, ref_h = seg_blocks[0].page_size
        summary, keywords = summarize_and_keywords(content)
        cid = hash_id(pdf_path, str(page_start), item.title, content[:512])
        chunks.append(Chunk(
            id=cid, section_path=list(path[-4:]) or ["Document"], role=item.role, text=content,
            summary=summary, keywords=keywords,
            page_start=page_start, page_end=page_end,
            bboxes_norm=bboxes, page_size={"width": ref_w, "height": ref_h},
            source_pdf=pdf_path, source_anchor=f"page={page_start}",
            # "pages" is parallel to bboxes_norm: one page number per box.
            metadata={"title": item.title, "pages": pages}
        ))
    return chunks

# Run Module D; later cells read the global `chunks`.
chunks = synthesize_chunks(PDF_PATH, outline, plan)
print("Synthesized chunks:", len(chunks))
if chunks:
    # Show the summary of the first chunk as a quick check.
    print("First chunk summary:", chunks[0].summary)

## 8) Module E — Embeddings (Azure or fallback)

In [None]:
def embed_text(text: str) -> List[float]:
    """Return an embedding vector for `text`.

    With an Azure client the configured embedding deployment is called and
    the embedding of the first result is returned. Without one, a
    deterministic stand-in is built from the SHA-256 digest of the text:
    the digest bytes are standardised (mean removed, divided by the standard
    deviation plus 1e-6) and returned as floats. The digest has 32 bytes, so
    the 128-element cap is never reached.
    """
    if client is not None:
        response = client.embeddings.create(model=EMB_DEPLOYMENT, input=text)
        return response.data[0].embedding

    digest = hashlib.sha256(text.encode()).digest()
    raw = np.frombuffer(digest, dtype=np.uint8).astype(np.float32)
    standardized = (raw - raw.mean()) / (raw.std() + 1e-6)
    return standardized.tolist()[:128]

# Only the first five chunks are embedded in this demo cell.
for c in chunks[:5]:
    # Embed the first 1500 characters of text together with the keywords and section path.
    comp = f"{c.text[:1500]}\n{' '.join(c.keywords)}\n{' / '.join(c.section_path)}"
    c.embedding = embed_text(comp)

# Prints None when there are no chunks at all.
print('First chunk embedding length:', len(chunks[0].embedding) if chunks else None)

## 9) Module F — Visualize a chunk's bounding boxes on the page

In [None]:
def show_chunk_overlay(pdf_path: str, chunk: Chunk, dpi: int = 150):
    """Render the chunk's first page and outline its blocks on top.

    Args:
        pdf_path: Path of the PDF to render.
        chunk: Chunk whose `page_start` page is shown and whose
            `bboxes_norm` boxes are drawn.
        dpi: Render resolution; the page is scaled by dpi / 72.

    When `chunk.metadata["pages"]` lists one page number per box, only the
    boxes that belong to the displayed page are drawn; otherwise every box
    is drawn.
    """
    page_index = max(0, chunk.page_start - 1)
    # Render inside the context manager so the document is always closed;
    # the pixel data is copied out as bytes before that happens.
    with fitz.open(pdf_path) as doc:
        page = doc[page_index]
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        pix = page.get_pixmap(matrix=mat, alpha=False)
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        img_w, img_h = pix.w, pix.h

    # A chunk can span several pages, but only one page is displayed:
    # boxes from the other pages would land in the wrong place.
    box_pages = (chunk.metadata or {}).get("pages")
    if isinstance(box_pages, (list, tuple)) and len(box_pages) == len(chunk.bboxes_norm):
        shown_page = page_index + 1
        boxes = [bb for bb, p in zip(chunk.bboxes_norm, box_pages) if p == shown_page]
    else:
        boxes = chunk.bboxes_norm

    fig, ax = plt.subplots(figsize=(10, 10 * img_h / img_w))
    ax.imshow(img)
    ax.axis('off')
    for bb in boxes:
        # Boxes are stored as page fractions; scale them to image pixels.
        x0, y0 = bb[0] * img_w, bb[1] * img_h
        x1, y1 = bb[2] * img_w, bb[3] * img_h
        ax.add_patch(plt.Rectangle((x0, y0), x1 - x0, y1 - y0, fill=False, linewidth=2))
    plt.show()

# Visualise the first chunk at 180 dpi when at least one chunk exists.
if chunks:
    show_chunk_overlay(PDF_PATH, chunks[0], dpi=180)
else:
    print("No chunks to visualize.")