In [39]:
print("hi")



hi


In [40]:
import re
import fitz  # PyMuPDF
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional


# ---------------------------------------------------------------------------
# Data models
# ---------------------------------------------------------------------------

@dataclass
class PageContent:
    """Raw content extracted from a single PDF page."""
    page_number: int          # 1-based
    raw_text: str
    headings: list[str] = field(default_factory=list)


@dataclass
class ParsedDocument:
    """Full parsed representation of a PDF document."""
    document_id: str          # e.g. SHA-256 hash or a UUID
    document_name: str        # original filename
    file_path: str
    total_pages: int
    pages: list[PageContent] = field(default_factory=list)


# ---------------------------------------------------------------------------
# Heading detection
# ---------------------------------------------------------------------------

# Patterns that suggest a line is a section heading in legal documents.
_HEADING_PATTERNS = [
    # ALL-CAPS lines (common in contracts: "TERMINATION", "REPRESENTATIONS")
    re.compile(r"^[A-Z][A-Z\s\-&,\.]{4,}$"),
    # Numbered sections: "1.", "1.1", "Section 2", "Article IV"
    re.compile(r"^(?:Section|Article|Clause|Schedule|Exhibit|Annex)?\s*\d+(?:\.\d+)*[\.\)]\s+\S", re.I),
    # Roman numeral headings: "IV. Obligations"
    re.compile(r"^[IVXLCDM]+\.\s+\S", re.I),
    # Title-case lines that are short (≤ 80 chars) and end without a period
    re.compile(r"^[A-Z][A-Za-z\s]{3,79}[^.]$"),
]


def _detect_headings(text: str) -> list[str]:
    """Return a list of lines from *text* that look like section headings."""
    headings = []
    for line in text.splitlines():
        line = line.strip()
        if not line or len(line) > 120:
            continue
        for pattern in _HEADING_PATTERNS:
            if pattern.match(line):
                headings.append(line)
                break
    return headings


# ---------------------------------------------------------------------------
# Core parser
# ---------------------------------------------------------------------------

class DocumentParser:
    """
    Parse a PDF file and return a :class:`ParsedDocument` with per-page
    text and detected headings.

    Usage::

        parser = DocumentParser()
        doc = parser.parse("contracts/Lease_Agreement_2024.pdf")
        for page in doc.pages:
            print(page.page_number, page.headings)
    """

    def __init__(self, detect_headings: bool = True):
        self.detect_headings = detect_headings

    # ------------------------------------------------------------------
    # Public API
    
    # ------------------------------------------------------------------

    def parse(
        self,
        file_path: str | Path,
        document_id: Optional[str] = None,
    ) -> ParsedDocument:
        """
        Parse *file_path* and return a fully-populated :class:`ParsedDocument`.

        Args:
            file_path: Path to the PDF file.
            document_id: Optional stable identifier. Defaults to the stem of
                         the filename if not provided.

        Returns:
            ParsedDocument with one :class:`PageContent` entry per page.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the file is not a valid PDF.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {path}")

        doc_name = path.name
        doc_id = document_id or path.stem

        pages: list[PageContent] = []

        try:
            pdf = fitz.open(str(path))
        except Exception as exc:
            raise ValueError(f"Could not open PDF '{path}': {exc}") from exc

        with pdf:
            total_pages = pdf.page_count
            for page_index in range(total_pages):
                page = pdf[page_index]
                page_number = page_index + 1  # convert to 1-based

                # Extract text – "text" mode gives plain UTF-8 output
                raw_text = page.get_text("text")
                raw_text = self._clean_text(raw_text)

                headings = (
                    _detect_headings(raw_text) if self.detect_headings else []
                )

                pages.append(
                    PageContent(
                        page_number=page_number,
                        raw_text=raw_text,
                        headings=headings,
                    )
                )

        return ParsedDocument(
            document_id=doc_id,
            document_name=doc_name,
            file_path=str(path.resolve()),
            total_pages=total_pages,
            pages=pages,
        )

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _clean_text(text: str) -> str:
        """
        Light normalisation:
          - Remove null bytes and form-feeds.
          - Collapse runs of more than two blank lines.
          - Strip trailing whitespace from each line.
        """
        text = text.replace("\x00", "").replace("\f", "\n")
        text = "\n".join(line.rstrip() for line in text.splitlines())
        # Collapse 3+ consecutive blank lines into 2
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()


# ---------------------------------------------------------------------------
# Convenience function
# ---------------------------------------------------------------------------

def parse_pdf(
    file_path: str | Path,
    document_id: Optional[str] = None,
) -> ParsedDocument:
    """
    Module-level shortcut for one-off parsing.

    Example::

        from services.rag.document_parser import parse_pdf


        doc = parse_pdf("contracts/Lease_Agreement_2024.pdf")
        print(doc.total_pages)
        print(doc.pages[0].headings)
    """
    return DocumentParser().parse(file_path, document_id=document_id)



In [41]:
import sys
from pathlib import Path
import os

PROJECT_ROOT = Path("/home/bota/personal/edge_computing")
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

os.chdir(PROJECT_ROOT)

from gateway.core.config import settings
print("Base Dir:", PROJECT_ROOT)
print("Data Path:", settings.DATA_PATH)
print("Is Debug True?", settings.DEBUG)
print("VLLM API URL loaded?", bool(settings.VLLM_API_URL))


Base Dir: /home/bota/personal/edge_computing
Data Path: /home/bota/personal/edge_computing/gateway/data
Is Debug True? False
VLLM API URL loaded? True


In [42]:
file_path = Path(settings.DATA_PATH) / "docs" / "lease_agreement.pdf"
print("Constructed file path:", file_path)

Constructed file path: /home/bota/personal/edge_computing/gateway/data/docs/lease_agreement.pdf


In [43]:
parse_pdf = parse_pdf(file_path=file_path, document_id="lease_agreement_2024")

In [44]:
print("Parsed document ID:", parse_pdf.document_id)
print("Document name:", parse_pdf.document_name)
print("Total pages:", parse_pdf.total_pages)
print("Headings on first page:", parse_pdf.pages[3].headings)


Parsed document ID: lease_agreement_2024
Document name: lease_agreement.pdf
Total pages: 9
Headings on first page: ['7.  LANDLORD’S OTHER RESPONSIBILITIES', '8.  TENANT’S OTHER RESPONSIBILITIES', 'Initials', 'Initials']


In [45]:
# services/rag/agreement_chunker.py

from __future__ import annotations
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat, ConversionStatus
from docling.chunking import HybridChunker
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from transformers import AutoTokenizer


# -----------------------------
# Configuration (Agreement Only)
# -----------------------------

@dataclass
class AgreementChunkConfig:
    max_tokens: int = 722
    overlap_tokens: int = 100
    min_chunk_words: int = 30
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"


@dataclass
class Chunk:
    chunk_id: str
    text: str
    metadata: dict


# -----------------------------
# Agreement Chunker
# -----------------------------

class AgreementChunker:

    def __init__(self, config: Optional[AgreementChunkConfig] = None):
        self.config = config or AgreementChunkConfig()

        self._converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=PdfPipelineOptions(
                        do_ocr=False,
                        do_table_structure=False,
                        generate_page_images=False,
                    )
                )
            }
        )

        tokenizer = HuggingFaceTokenizer(
            tokenizer=AutoTokenizer.from_pretrained(self.config.embedding_model),
            max_tokens=self.config.max_tokens,
        )

        self._chunker = HybridChunker(
            tokenizer=tokenizer,
            merge_peers=True,
        )

    def chunk_pdf(self, path: str | Path) -> List[Chunk]:
        path = str(path)
        doc_name = Path(path).name
        doc_id = Path(path).stem

        result = self._converter.convert(path)

        if result.status == ConversionStatus.FAILURE:
            raise RuntimeError("PDF conversion failed")

        dl_doc = result.document
        raw_chunks = list(self._chunker.chunk(dl_doc))

        chunks = []
        for idx, raw in enumerate(raw_chunks):
            text = raw.text.strip()
            if not text:
                continue

            # Basic page detection
            page_number = None
            doc_items = getattr(raw.meta, "doc_items", [])
            if doc_items:
                prov = getattr(doc_items[0], "prov", [])
                if prov:
                    page_number = getattr(prov[0], "page_no", None)

            headings = getattr(raw.meta, "headings", []) or []
            section_title = " > ".join(headings) if headings else "Unknown Section"

            if len(text.split()) < self.config.min_chunk_words:
                continue

            chunks.append(
                Chunk(
                    chunk_id=f"{doc_id}_{uuid.uuid4().hex[:8]}",
                    text=text,
                    metadata={
                        "document_id": doc_id,
                        "document_name": doc_name,
                        "page_number": page_number,
                        "section_title": section_title,
                        "chunk_index": idx,
                    },
                )
            )

        return chunks

In [46]:
# services/rag/agreement_indexer.py

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from typing import List



# ----------------------------------
# Load embedding model ONCE
# ----------------------------------

embedding_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2"
)

dimension = embedding_model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(dimension)

metadata_store = []


# ----------------------------------
# Ingest Agreement
# ----------------------------------

def ingest_agreement(pdf_path: str):

    chunker = AgreementChunker()
    chunks = chunker.chunk_pdf(pdf_path)

    texts = [c.text for c in chunks]
    embeddings = embedding_model.encode(texts, convert_to_numpy=True)

    index.add(embeddings)
    metadata_store.extend(chunks)

    print(f"Ingested {len(chunks)} chunks\nembeddings shape: {embeddings.shape}")


# ----------------------------------
# Retrieval
# ----------------------------------

def search(query: str, top_k: int = 5):

    query_embedding = embedding_model.encode([query], convert_to_numpy=True)

    distances, indices = index.search(query_embedding, top_k)

    results = []
    for idx in indices[0]:
        results.append(metadata_store[idx])

    return results

In [47]:
chunker = AgreementChunker()
chunks = chunker.chunk_pdf(file_path)
print(f"Generated {len(chunks)} chunks from the agreement.")
print("Sample chunk metadata:", chunks[0].metadata)


Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors


Generated 20 chunks from the agreement.
Sample chunk metadata: {'document_id': 'lease_agreement', 'document_name': 'lease_agreement.pdf', 'page_number': 1, 'section_title': 'LEASE AGREEMENT - RESIDENTIAL', 'chunk_index': 0}


In [48]:
ingested = ingest_agreement(file_path)


print(f"ingest agreement: {ingest_agreement(file_path)}")


Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors


Ingested 20 chunks
embeddings shape: (20, 384)


Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors


Ingested 20 chunks
embeddings shape: (20, 384)
ingest agreement: None


In [50]:
results = search("What is the doc about?")

for r in results:
    print(r.metadata["section_title"])
    print(r.metadata["page_number"])
    print(r.text[:300])
    print(r)

LEASE AGREEMENT - RESIDENTIAL
1
This is a written contract that sets out the terms and conditions between the Landlord and Tenant of a residential property.
(the address acts as the domicilium citandi et executandi)
(the address acts as the domicilium citandi et executandi)
LEASE AGREEMENT - RESIDENTIAL
1
This is a written contract that sets out the terms and conditions between the Landlord and Tenant of a residential property.
(the address acts as the domicilium citandi et executandi)
(the address acts as the domicilium citandi et executandi)
DISCLAIMER
9
LAW	FOR	ALL	cares	about	the	legal	rights	of	South	Africans	and	have	made	it	our	goal	to	make	the	law	affordable	and	accessible	to	all.	This	contract template	has	been	designed	with	you	and	protection	of	your	rights	in	mind.	Although	we	have	taken	every	care	to	ensure	that	this	document	is	accurate	a
DISCLAIMER
9
LAW	FOR	ALL	cares	about	the	legal	rights	of	South	Africans	and	have	made	it	our	goal	to	make	the	law	affordable	and	accessi