In [None]:
# Cell 1: Imports and Path Setup
import json
import pandas as pd
from pathlib import Path
import os

def resolve_output_stem(pdf_name, default="structured_outline"):
    """Return the file stem to use for the structured-outline JSON.

    Args:
        pdf_name: Name or path of the source PDF (here: the value of the
            ``PDF_NAME`` environment variable). May be ``None`` or blank.
        default: Stem to fall back to when ``pdf_name`` yields no usable stem.

    Returns:
        ``Path(pdf_name).stem`` with surrounding whitespace removed, or
        ``default`` when that would be an empty string.
    """
    stem = Path((pdf_name or "").strip()).stem.strip()
    return stem or default


# Resolve the project root: one level above this file when run as a script,
# otherwise one level above the current working directory.
if "__file__" in globals():
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
else:
    PROJECT_ROOT = Path.cwd().resolve().parent

# Input: merged heading candidates written by the previous pipeline step.
MERGED_PATH = PROJECT_ROOT / "output" / "03_merged_headings.json"

# Output: JSON named after the input PDF (PDF_NAME environment variable).
# An unset, empty or blank PDF_NAME falls back to "structured_outline"
# instead of producing a file literally named ".json".
pdf_name = os.environ.get("PDF_NAME") or "structured_outline"
pdf_stem = resolve_output_stem(pdf_name)
OUTPUT_PATH = PROJECT_ROOT / "output" / f"{pdf_stem}.json"

# Ensure output directory exists
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# Debug info (optional)
print("📁 Project Root:", PROJECT_ROOT)
print("📥 Merged Headings Path:", MERGED_PATH)
print("📤 Structured Outline Path:", OUTPUT_PATH)


📁 Project Root: C:\Users\Adi Awaskar\Documents\GitHub\Adobe-Hackathon
📥 Merged Headings Path: C:\Users\Adi Awaskar\Documents\GitHub\Adobe-Hackathon\output\03_merged_headings.json
📤 Structured Outline Path: C:\Users\Adi Awaskar\Documents\GitHub\Adobe-Hackathon\output\structured_outline.json


In [43]:
# Parse the merged heading candidates (UTF-8 JSON) into plain Python objects.
merged = json.loads(MERGED_PATH.read_text(encoding="utf-8"))

# Tabular view of the same records, used for a quick sanity check.
df = pd.DataFrame(data=merged)
print("✅ Loaded", len(df), "merged heading candidates")
df.head(n=2)


✅ Loaded 179 merged heading candidates


Unnamed: 0,page,text,font_size_avg,is_bold,is_all_caps,y_position_norm,word_count,char_count,starts_with_number,ends_with_colon,is_centered,is_heading
0,1,Multilingual PDF Heading Detection: Architectu...,18.0,True,False,0.115,5,48,False,False,False,True
1,1,Automated heading detection in PDFs is ...,9.0,True,False,0.18,12,105,False,False,False,False


In [None]:
def is_title(text_block):
    """Tell whether a text block qualifies as the document title.

    A block qualifies when it is on page 1, its average font size is at
    least 20 and it has at most 10 words. The keys ``page``,
    ``font_size_avg`` and ``word_count`` are read with direct indexing, so a
    missing key raises ``KeyError`` once evaluation reaches it.

    NOTE: a later cell in this notebook defines ``is_title`` again with
    different criteria; when the cells run top to bottom, that later
    definition replaces this one.
    """
    # Checks run in the same order as a short-circuiting ``and`` chain.
    if not (text_block["page"] == 1):
        return False
    if not (text_block["font_size_avg"] >= 20):
        return False
    return text_block["word_count"] <= 10

def get_level(block, h1_min_size=18, h2_min_size=14):
    """Map a text block's average font size to an outline level.

    Args:
        block: Mapping describing one text block; only ``font_size_avg`` is
            read.
        h1_min_size: Smallest font size that is labelled ``"H1"``.
        h2_min_size: Smallest font size that is labelled ``"H2"``.

    Returns:
        ``"H1"``, ``"H2"`` or ``"H3"``. A block whose font size is missing,
        ``None`` or not convertible to a number is labelled ``"H3"``.
    """
    try:
        size = float(block.get("font_size_avg", 0))
    except (TypeError, ValueError):
        # A JSON null (None) or malformed value would otherwise raise
        # TypeError in the comparisons below; treat it as "no size".
        size = 0.0

    if size >= h1_min_size:
        return "H1"
    if size >= h2_min_size:
        return "H2"
    return "H3"


In [None]:
def is_title(block):
    """Tell whether a text block qualifies as the document title.

    A block qualifies when all of the following hold:
      * it is on page 1,
      * its ``level`` is ``"H1"`` (so levels must be assigned before calling),
      * its average font size is at least 18,
      * it has at most 12 words,
      * its text, ignoring surrounding whitespace, is longer than 5 characters.

    Missing or ``None`` values never raise; the block is simply judged on the
    fallback value (0 for numbers, empty string for text).

    Args:
        block: Mapping describing one text block.

    Returns:
        ``True`` if the block looks like the title, otherwise ``False``.
    """
    if block.get("page") != 1 or block.get("level") != "H1":
        return False

    # ``or`` fallbacks cover both a missing key and an explicit None (JSON
    # null), which would otherwise raise TypeError in the comparisons.
    font_size = block.get("font_size_avg") or 0
    word_count = block.get("word_count") or 0
    # Strip so that whitespace-only text cannot pass the length check.
    text = str(block.get("text") or "").strip()

    return font_size >= 18 and word_count <= 12 and len(text) > 5



📘 Detected Title: Untitled


In [None]:
# Levels have to be attached first: the title check below reads block["level"].
for block in merged:
    block.update(level=get_level(block))

# The first candidate accepted by is_title() provides the title text;
# without any match the document is reported as "Untitled".
title_block = next(filter(is_title, merged), None)
if title_block:
    title = title_block["text"]
else:
    title = "Untitled"
print("📘 Detected Title:", title)


[1] H1 | Page 1: Multilingual PDF Heading Detection: Architecture and Solution
[2] H3 | Page 1: Automated   heading  detection  in  PDFs  is  critical  for  document  understanding,  accessibility  and downstream tasks. A robust solution must identify section titles with high precision/recall (score focus
[3] H3 | Page 1: ) while running efficiently in a containerized environment and handling multiple languages (e.g. Japanese).
[4] H3 | Page 1: We propose a pipeline that ingests PDFs, extracts layout and text features, classifies heading lines, and
[5] H3 | Page 1: outputs a structured outline. This modular architecture (parse  →  features  →  classify  →  output) follows industry best practices. For example, enterprise workflows separate “ingestion” and “retrieval” stages for


In [47]:
# Each outline entry keeps only these fields, in this order.
# NOTE(review): every candidate in `merged` is emitted, including rows whose
# `is_heading` flag is False in the loaded data — confirm whether the outline
# should be restricted to detected headings.
outline_fields = ("level", "text", "page")
structured = {
    "title": title,
    "outline": [
        {field: block[field] for field in outline_fields}
        for block in merged
    ],
}

# Persist as pretty-printed UTF-8 JSON, keeping non-ASCII characters as-is.
with OUTPUT_PATH.open("w", encoding="utf-8") as out_file:
    json.dump(structured, out_file, indent=2, ensure_ascii=False)

print(f"✅ Structured JSON saved to:\n{OUTPUT_PATH}")


✅ Structured JSON saved to:
C:\Users\Adi Awaskar\Documents\GitHub\Adobe-Hackathon\output\structured_outline.json


In [48]:
# Show only the leading 1000 characters of the pretty-printed outline.
preview_text = json.dumps(structured, ensure_ascii=False, indent=2)
print(preview_text[:1000])


{
  "title": "Untitled",
  "outline": [
    {
      "level": "H1",
      "text": "Multilingual PDF Heading Detection: Architecture and Solution",
      "page": 1
    },
    {
      "level": "H3",
      "text": "Automated   heading  detection  in  PDFs  is  critical  for  document  understanding,  accessibility  and downstream tasks. A robust solution must identify section titles with high precision/recall (score focus",
      "page": 1
    },
    {
      "level": "H3",
      "text": ") while running efficiently in a containerized environment and handling multiple languages (e.g. Japanese).",
      "page": 1
    },
    {
      "level": "H3",
      "text": "We propose a pipeline that ingests PDFs, extracts layout and text features, classifies heading lines, and",
      "page": 1
    },
    {
      "level": "H3",
      "text": "outputs a structured outline. This modular architecture (parse  →  features  →  classify  →  output) follows industry best practices. For example, enterprise workf