# Hybrid MD&A Extraction (Indian Annual Reports)

This notebook extracts the **Management Discussion & Analysis (MD&A)** section from Indian Annual Report PDFs using a **hybrid parsing pipeline**:

- **ToC/Index pages**: parsed with `pdfplumber` (layout-aware) to correctly bind section titles ↔ page numbers.
- **Body pages**: extracted with `PyMuPDF` (`fitz`) for speed.

Output: a CSV with columns `Filename, Company, Year, MD&A_Text`.

In [2]:
import re
import logging
import pathlib
from dataclasses import dataclass
from typing import Iterable, Optional

import fitz  # PyMuPDF
import pdfplumber
import pandas as pd
from tqdm import tqdm

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger('mdna')

# Notebook-safe project root resolution
CWD = pathlib.Path.cwd().resolve()
PROJECT_ROOT = CWD.parent if CWD.name.lower() == 'notebooks' else CWD

PDF_ROOT = PROJECT_ROOT / 'data' / 'pdfs'
OUTPUT_DIR = PROJECT_ROOT / 'output'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Use a different output name so this notebook doesn't overwrite older runs
CSV_PATH = OUTPUT_DIR / 'mdna_extracted_hybrid.csv'

# Tune these if needed
TOC_SCAN_PAGES = 10
HEADER_SCAN_PAGES_FALLBACK = None  # None = scan full document
HEADER_REGION_MAX_Y = 120  # points (roughly top-of-page header)

WHITESPACE_RE = re.compile(r'\s+')

In [2]:
def _clean_text(s: str) -> str:
    s = (s or '').replace('\u00a0', ' ').replace('\t', ' ')
    s = WHITESPACE_RE.sub(' ', s).strip()
    return s


def _normalize_title(s: str) -> str:
    s = _clean_text(s)
    # normalize apostrophes / ampersands for matching
    s = s.replace('’', "'")
    s = s.replace('&', ' & ')
    s = WHITESPACE_RE.sub(' ', s).strip()
    return s


def _extract_page_text_fitz_blocks(page: fitz.Page) -> str:
    """More stable than raw `get_text()` for multi-column pages."""
    blocks = page.get_text('blocks') or []
    # block tuple: (x0, y0, x1, y1, text, block_no, block_type)
    blocks_sorted = sorted(blocks, key=lambda b: (round(b[1], 1), round(b[0], 1)))
    parts = []
    for b in blocks_sorted:
        txt = (b[4] or '').strip()
        if txt:
            parts.append(txt)
    return _clean_text('\n'.join(parts))


def _header_text_fitz(page: fitz.Page, max_y: float = HEADER_REGION_MAX_Y) -> str:
    blocks = page.get_text('blocks') or []
    header_parts = []
    for b in blocks:
        y0 = b[1]
        if y0 <= max_y:
            t = (b[4] or '').strip()
            if t:
                header_parts.append(t)
    return _clean_text(' '.join(header_parts))


def _cluster_words_into_lines(words: list[dict], y_tolerance: float = 3.0) -> list[str]:
    """Group pdfplumber words into visual lines by `top` coordinate."""
    if not words:
        return []

    # Sort first so clustering is deterministic
    words_sorted = sorted(words, key=lambda w: (w.get('top', 0.0), w.get('x0', 0.0)))
    lines: list[list[dict]] = []
    current: list[dict] = []
    current_top: float | None = None

    for w in words_sorted:
        top = float(w.get('top', 0.0))
        if current_top is None:
            current_top = top
            current = [w]
            continue
        if abs(top - current_top) <= y_tolerance:
            current.append(w)
        else:
            lines.append(current)
            current = [w]
            current_top = top

    if current:
        lines.append(current)

    out_lines: list[str] = []
    for line_words in lines:
        line_words_sorted = sorted(line_words, key=lambda w: float(w.get('x0', 0.0)))
        txt = ' '.join((w.get('text') or '').strip() for w in line_words_sorted)
        txt = _clean_text(txt)
        if txt:
            out_lines.append(txt)
    return out_lines

In [3]:
TOC_HEADER_RE = re.compile(r"\b(contents|index)\b", re.IGNORECASE)

# Matches: (Section Name)......(Page Number)  OR  (Section Name)    (Page Number)
TOC_ENTRY_RE = re.compile(
    r"^(?P<title>.+?)\s*(?:\.{2,}|\s{2,}|\u00b7{2,}|\-\-\-+)\s*(?P<page>\d{1,4})\s*$"
    r"|^(?P<title2>.+?)\s+(?P<page2>\d{1,4})\s*$",
    re.IGNORECASE,
)

NUM_ONLY_RE = re.compile(r"^\s*(\d{1,4})\s*$")

MDNA_TITLE_RE = re.compile(
    r"management\s+discussion(?:s)?\s*(?:and|&)\s*analysis(?:\s+report)?",
    re.IGNORECASE,
)

TERMINATOR_TITLE_RE = re.compile(
    r"\b(corporate\s+governance|auditors?\s*[’']?\s*report|independent\s+auditor|financial\s+statements?|balance\s+sheet|standalone\s+financial|consolidated\s+financial)\b",
    re.IGNORECASE,
)


def detect_toc_pages_hybrid(pdf_path: pathlib.Path, scan_pages: int = TOC_SCAN_PAGES) -> list[int]:
    """Hybrid switching logic: scan first N pages with fitz; if header contains CONTENTS/INDEX, mark as ToC page for pdfplumber parsing."""
    toc_pages: list[int] = []
    with fitz.open(pdf_path) as doc:
        n = min(scan_pages, doc.page_count)
        for i in range(n):
            page = doc.load_page(i)
            # spec: header is first 1000 chars of extracted text (fast check)
            head_sample = (page.get_text('text') or '')[:1000]
            if TOC_HEADER_RE.search(head_sample):
                toc_pages.append(i + 1)
    return toc_pages


def extract_toc_lines_pdfplumber(pdf_path: pathlib.Path, toc_pages: list[int]) -> list[str]:
    """Extract ToC lines (layout-aware) using pdfplumber for specified 1-based pages."""
    lines: list[str] = []
    if not toc_pages:
        return lines

    with pdfplumber.open(str(pdf_path)) as pdf:
        for pno in toc_pages:
            if pno < 1 or pno > len(pdf.pages):
                continue
            page = pdf.pages[pno - 1]
            # words-based line clustering tends to preserve title↔page alignment better than plain extract_text()
            words = page.extract_words(use_text_flow=True, keep_blank_chars=False) or []
            page_lines = _cluster_words_into_lines(words, y_tolerance=3.0)
            for ln in page_lines:
                s = (ln or '').strip()
                if s:
                    lines.append(s)
    return lines


def parse_toc_entries(lines: Iterable[str], max_page: int) -> list[dict]:
    """Parse ToC lines into ordered entries with multiline-title merge (critical).

    Rules (per spec):
      - If a line has text but no page number, append it to the next line that has a page number.
      - Also handle the variant where the next line is *only* a page number.
    """
    entries: list[dict] = []
    pending: list[str] = []

    def _flush_pending_keep_last(max_keep: int = 3):
        nonlocal pending
        if len(pending) > max_keep:
            pending = pending[-max_keep:]

    for raw in lines:
        s = _clean_text(raw)
        if not s:
            continue

        # Skip ToC headings / boilerplate
        if TOC_HEADER_RE.fullmatch(s) or re.fullmatch(r"page\s*(no\.?|number)?", s, re.IGNORECASE):
            continue

        m_num = NUM_ONLY_RE.match(s)
        if m_num:
            if pending:
                page = int(m_num.group(1))
                if 1 <= page <= max_page:
                    title = _normalize_title(' '.join(pending))
                    entries.append({"title": title, "page": page, "raw": f"{title} -> {page}"})
                pending = []
            continue

        m = TOC_ENTRY_RE.match(s)
        if m:
            title = m.group('title') or m.group('title2') or ''
            page_s = m.group('page') or m.group('page2') or ''
            try:
                page = int(page_s)
            except ValueError:
                page = None

            if page is not None and (1 <= page <= max_page) and re.search(r"[A-Za-z]", title or ''):
                title = _normalize_title(title)
                if pending:
                    title = _normalize_title(' '.join(pending + [title]))
                    pending = []
                entries.append({"title": title, "page": page, "raw": s})
                continue

        # No page number: buffer it to merge into the next ToC entry
        if re.search(r"[A-Za-z]", s) and not TOC_HEADER_RE.search(s) and not re.search(r"\bpage\b", s, re.IGNORECASE):
            pending.append(_normalize_title(s))
            _flush_pending_keep_last()
        else:
            # non-title noise resets pending
            pending = []

    return entries


def _is_mdna_title(title: str, company_folder: str | None = None) -> bool:
    t = _normalize_title(title).lower()

    # Core MD&A
    if MDNA_TITLE_RE.search(t):
        # Alchemist: do NOT treat Directors' Report as MD&A
        if company_folder and company_folder.lower() in {'alcheimist', 'alchemist'}:
            if 'director' in t and 'report' in t:
                return False
        return True

    # Amit Spinning edge: 'Board’s Report Including Management Discussions & Analysis Report'
    if 'including' in t and 'board' in t and ('management' in t) and (('discussion' in t) or ('discussions' in t)) and ('analysis' in t):
        return True

    return False


def find_mdna_range_from_toc(pdf_path: pathlib.Path, company_folder: str | None = None) -> tuple[Optional[int], Optional[int], dict]:
    """Try ToC-driven MD&A start/end. Returns (start, end, debug)."""
    debug = {"toc_pages": [], "toc_entries": [], "mdna_entry": None}
    with fitz.open(pdf_path) as doc:
        max_page = doc.page_count

    toc_pages = detect_toc_pages_hybrid(pdf_path, scan_pages=TOC_SCAN_PAGES)
    debug["toc_pages"] = toc_pages
    if not toc_pages:
        return None, None, debug

    toc_lines = extract_toc_lines_pdfplumber(pdf_path, toc_pages)
    toc_entries = parse_toc_entries(toc_lines, max_page=max_page)
    debug["toc_entries"] = toc_entries
    if not toc_entries:
        return None, None, debug

    mdna_idx = None
    for i, ent in enumerate(toc_entries):
        if _is_mdna_title(ent.get('title', ''), company_folder=company_folder):
            mdna_idx = i
            debug["mdna_entry"] = ent
            break

    if mdna_idx is None:
        return None, None, debug

    start_page = int(toc_entries[mdna_idx]['page'])

    # End page: next section in ToC (first entry after MD&A with higher page number)
    next_page = None
    for ent in toc_entries[mdna_idx + 1 :]:
        p = ent.get('page')
        if isinstance(p, int) and p > start_page:
            next_page = p
            break

    end_page = (next_page - 1) if next_page else max_page
    if end_page < start_page:
        return None, None, debug

    return start_page, end_page, debug

In [4]:
def find_mdna_range_by_header_fallback(pdf_path: pathlib.Path) -> tuple[Optional[int], Optional[int], dict]:
    """Fallback: search page headers for MD&A; derive end by next major section header."""
    debug = {"start_hit_header": None, "end_hit_header": None}

    with fitz.open(pdf_path) as doc:
        page_count = doc.page_count
        scan_upto = page_count if HEADER_SCAN_PAGES_FALLBACK is None else min(page_count, int(HEADER_SCAN_PAGES_FALLBACK))

        start = None
        for i in range(scan_upto):
            page = doc.load_page(i)
            header = _header_text_fitz(page)
            if MDNA_TITLE_RE.search(header):
                start = i + 1
                debug['start_hit_header'] = {'page': start, 'header': header}
                break

        if start is None:
            return None, None, debug

        end = page_count
        for j in range(start, page_count):
            page = doc.load_page(j)
            header = _header_text_fitz(page)
            if TERMINATOR_TITLE_RE.search(header):
                end = j  # page before terminator (since j is 0-based, j==page_no-1)
                debug['end_hit_header'] = {'page': j + 1, 'header': header}
                break

        if end < start:
            return None, None, debug

        return start, end, debug


def extract_mdna_text(pdf_path: pathlib.Path, start_page: int, end_page: int) -> str:
    with fitz.open(pdf_path) as doc:
        start_i = max(0, start_page - 1)
        end_i = min(doc.page_count - 1, end_page - 1)
        parts = []
        for i in range(start_i, end_i + 1):
            page = doc.load_page(i)
            parts.append(_extract_page_text_fitz_blocks(page))
    return _clean_text('\n'.join(p for p in parts if p))

In [30]:
def extract_company_name_from_folder(pdf_path: pathlib.Path) -> str:
    return pdf_path.parent.name.replace('_', ' ').strip()


def extract_financial_year_fitz(pdf_path: pathlib.Path, max_pages: int = 5) -> Optional[str]:
    """Best-effort year extraction from first pages."""
    year_patterns = [
        re.compile(r"\b\d{1,3}(?:st|nd|rd|th)\s+Annual\s+Report\s+(\d{4})\s*[-–]\s*(\d{2,4})\b", re.IGNORECASE),
        re.compile(r"\bAnnual\s+Report\s+(\d{4})\s*[-–]\s*(\d{2,4})\b", re.IGNORECASE),
        re.compile(r"\bYear\s+ended\s+\w+\s+\d{1,2},\s+(\d{4})\b", re.IGNORECASE),
    ]

    with fitz.open(pdf_path) as doc:
        n = min(max_pages, doc.page_count)
        for i in range(n):
            page = doc.load_page(i)
            txt = page.get_text('text') or ''
            for pat in year_patterns:
                m = pat.search(txt)
                if not m:
                    continue
                groups = m.groups()
                if len(groups) == 1:
                    year = groups[0]
                    prev = str(int(year) - 1)
                    return f"{prev}-{year[-2:]}"
                y1, y2 = groups
                return f"{y1}-{y2[-2:]}"
    return None


def process_pdf(pdf_path: pathlib.Path) -> dict:
    company_folder = pdf_path.parent.name
    company = extract_company_name_from_folder(pdf_path)
    year = extract_financial_year_fitz(pdf_path)

    start, end, toc_debug = find_mdna_range_from_toc(pdf_path, company_folder=company_folder)
    method = 'toc' if (start and end) else None

    if start is None or end is None:
        start, end, hdr_debug = find_mdna_range_by_header_fallback(pdf_path)
        if start and end:
            method = 'header_fallback'
        else:
            hdr_debug = {}

    if start is None or end is None:
        return {
            'Filename': pdf_path.name,
            'Company': company,
            'Year': year,
            'MD&A_Text': '',
            'StartPage': None,
            'EndPage': None,
            'Method': method or 'failed',
        }

    mdna_text = extract_mdna_text(pdf_path, start, end)

    # Light guard for Alchemist: avoid Directors' Report bleed if ToC was wrong
    if company_folder.lower() in {'alcheimist', 'alchemist'}:
        if re.search(r"\bdirectors\s*[’']?\s*report\b", mdna_text[:2000], re.IGNORECASE):
            logger.warning('Alchemist: extracted text seems to include Directors\' Report; trying header fallback range')
            s2, e2, _ = find_mdna_range_by_header_fallback(pdf_path)
            if s2 and e2:
                mdna_text = extract_mdna_text(pdf_path, s2, e2)
                start, end = s2, e2
                method = 'header_fallback'

    return {
        'Filename': pdf_path.name,
        'Company': company,
        'Year': year,
        'MD&A_Text': mdna_text,
        'StartPage': start,
        'EndPage': end,
        'Method': method,
    }

In [4]:
pdf_paths = sorted(PDF_ROOT.rglob('*.pdf'))
logger.info('Found %d PDFs under %s', len(pdf_paths), PDF_ROOT)

out_cols = ['Filename', 'Company', 'Year', 'MD&A_Text']

if not pdf_paths:
    df = pd.DataFrame(columns=out_cols)
    df.to_csv(CSV_PATH, index=False)
    logger.warning('No PDFs found. Wrote empty CSV: %s', CSV_PATH)
else:
    results = []
    for pdf_path in tqdm(pdf_paths, desc='Extracting MD&A'):
        try:
            results.append(process_pdf(pdf_path))
        except Exception as e:
            logger.exception('Failed on %s: %s', pdf_path, e)
            results.append({
                'Filename': pdf_path.name,
                'Company': extract_company_name_from_folder(pdf_path),
                'Year': extract_financial_year_fitz(pdf_path),
                'MD&A_Text': '',
                'StartPage': None,
                'EndPage': None,
                'Method': 'exception',
            })

    df = pd.DataFrame(results)
    df[out_cols].to_csv(CSV_PATH, index=False)
    logger.info('Wrote %s', CSV_PATH)

df[out_cols].head(10)

2026-01-01 01:23:15,347 - INFO - Found 14 PDFs under C:\Users\LOQ\Desktop\SPJIMR\mdna_extraction_project\data\pdfs
Extracting MD&A:   0%|          | 0/14 [00:00<?, ?it/s]2026-01-01 01:23:15,465 - ERROR - Failed on C:\Users\LOQ\Desktop\SPJIMR\mdna_extraction_project\data\pdfs\Alcheimist\5267070319.pdf: name 'process_pdf' is not defined
Traceback (most recent call last):
  File "C:\Users\LOQ\AppData\Local\Temp\ipykernel_31440\3964556588.py", line 14, in <module>
    results.append(process_pdf(pdf_path))
                   ^^^^^^^^^^^
NameError: name 'process_pdf' is not defined
Extracting MD&A:   0%|          | 0/14 [00:00<?, ?it/s]


2026-01-01 01:23:15,347 - INFO - Found 14 PDFs under C:\Users\LOQ\Desktop\SPJIMR\mdna_extraction_project\data\pdfs
Extracting MD&A:   0%|          | 0/14 [00:00<?, ?it/s]2026-01-01 01:23:15,465 - ERROR - Failed on C:\Users\LOQ\Desktop\SPJIMR\mdna_extraction_project\data\pdfs\Alcheimist\5267070319.pdf: name 'process_pdf' is not defined
Traceback (most recent call last):
  File "C:\Users\LOQ\AppData\Local\Temp\ipykernel_31440\3964556588.py", line 14, in <module>
    results.append(process_pdf(pdf_path))
                   ^^^^^^^^^^^
NameError: name 'process_pdf' is not defined
Extracting MD&A:   0%|          | 0/14 [00:00<?, ?it/s]


NameError: name 'extract_company_name_from_folder' is not defined