In [24]:
import json
from pathlib import Path
import pandas as pd
import os

# Resolve the project root: two levels above this file when executed as a
# script, otherwise (e.g. in a notebook, where __file__ is normally undefined)
# the current working directory.
PROJECT_ROOT = (
    Path(__file__).resolve().parent.parent
    if "__file__" in globals()
    else Path().resolve()
)

# Directory layout, all relative to the project root
DATA_DIR = PROJECT_ROOT / "data" / "processed"
OUTPUT_DIR = PROJECT_ROOT / "output"
INPUT_PATH = DATA_DIR / "pdf_blocks_features.json"
OUTPUT_PATH = OUTPUT_DIR / "03_merged_headings.json"

# Read the feature-enriched blocks and wrap them in a DataFrame
with INPUT_PATH.open(encoding="utf-8") as f:
    blocks = json.load(f)
df = pd.DataFrame(blocks)

print(f"Loaded {len(df)} blocks")
df.head()


Loaded 141 blocks


Unnamed: 0,page,text,font_size_avg,is_bold,is_all_caps,y_position_norm,word_count,char_count,starts_with_number,ends_with_colon,is_centered
0,1,"Generated by Superset | 16 Jul 2025, 12:23 P...",11.0,False,False,0.068,10,51,False,False,False
1,1,Chevron Engineering,24.0,False,False,0.068,2,19,False,False,False
2,1,GET / PGET,16.0,False,True,0.068,3,10,False,False,False
3,1,Job Profile Details,12.0,False,False,0.167,3,19,False,False,False
4,1,Placement Cycle,11.0,False,False,0.208,2,15,False,False,False


In [None]:
def is_heading(block):
    """Heuristically decide whether a text block looks like a heading.

    A block counts as a heading when all of the following hold:
    its ``norm_font`` is at least 0.9, it has at most 12 words, it is bold,
    and its text (ignoring surrounding whitespace) is non-empty, does not end
    with a period and does not start with a lowercase letter.

    NOTE(review): the preview of the loaded features frame shows no
    ``norm_font`` column. If that column really is absent, the default of 0
    makes this function return False for every row -- confirm that the
    upstream feature step produces ``norm_font``.

    Args:
        block: Mapping-like row (dict or pandas Series) with the keys
            ``word_count``, ``is_bold``, ``text`` and optionally ``norm_font``.

    Returns:
        bool: True if the block passes every heading check, else False.

    Raises:
        KeyError: If a required key is missing and the earlier checks passed
            (the checks short-circuit in the order listed above).
    """
    # `not (x >= t)` rather than `x < t` so that NaN is rejected as well
    if not block.get("norm_font", 0) >= 0.9:
        return False
    if not block["word_count"] <= 12:
        return False
    if not block["is_bold"]:
        return False

    text = block["text"]
    if not isinstance(text, str):
        # None / NaN text cannot be a heading
        return False

    # Strip first so stray whitespace cannot hide a trailing "." or a
    # lowercase first letter; empty text would otherwise raise IndexError.
    text = text.strip()
    if not text:
        return False

    return not text.endswith(".") and not text[0].islower()


In [26]:
# Run the is_heading heuristic on every row; this adds an "is_heading"
# column to df in place, so later cells see the extra column.
df["is_heading"] = df.apply(is_heading, axis=1)
# Preview the flag next to the text for the first ten blocks
df[["text", "is_heading"]].head(10)


Unnamed: 0,text,is_heading
0,"Generated by Superset | 16 Jul 2025, 12:23 P...",False
1,Chevron Engineering,False
2,GET / PGET,False
3,Job Profile Details,False
4,Placement Cycle,False
5,VJTI Placements 2026 Batch,False
6,Job Location,False
7,Bengaluru,False
8,Date of Visit,False
9,"Jul 28, 2025",False


In [None]:
def merge_headings_flex(df, y_tol=0.03, font_tol=1, indent_tol=20):
    """Merge runs of consecutive blocks that appear to be one split-up text.

    Blocks are scanned in order. Each unmerged block becomes an anchor, and
    the blocks that follow are appended to it (texts joined by one space)
    for as long as every condition holds relative to the *anchor*:

    * same ``page`` value (blocks never merge across a page boundary),
    * ``y_position_norm`` within ``y_tol`` of the anchor,
    * ``font_size_avg`` within ``font_tol`` of the anchor,
    * the candidate text starts with a lowercase letter, or its left edge
      (``bbox[0]``) is within ``indent_tol`` of the anchor's.

    The merged block keeps every field of the anchor; only ``text`` changes.
    No heading filter is applied here: every input block ends up in exactly
    one output block.

    Args:
        df: pandas DataFrame of blocks, or any iterable of dict-like blocks.
        y_tol: Maximum absolute difference in ``y_position_norm``.
        font_tol: Maximum absolute difference in ``font_size_avg``.
        indent_tol: Maximum absolute difference in ``bbox[0]``.

    Returns:
        list[dict]: One dict per merged group, in input order.
    """

    def _left_edge(block):
        # A missing, null or malformed bbox is treated as x0 = 0, the same
        # value the default [0, 0, 0, 0] box yields.
        bbox = block.get("bbox")
        try:
            return float(bbox[0])
        except (TypeError, IndexError, KeyError, ValueError):
            return 0.0

    # Convert once instead of rebuilding a dict from df.iloc[...] per access
    records = df.to_dict("records") if hasattr(df, "to_dict") else list(df)

    merged_blocks = []
    i = 0
    while i < len(records):
        current = dict(records[i])
        parts = [current["text"]]

        # Anchor features are loop-invariant for the inner scan
        anchor_page = current.get("page")
        anchor_y = current.get("y_position_norm", 0)
        anchor_font = current.get("font_size_avg", 0)
        anchor_x0 = _left_edge(current)

        j = i + 1
        while j < len(records):
            nxt = records[j]
            nxt_text = nxt.get("text", "")

            # Normalised y restarts on every page, so similar y values on
            # different pages must not be treated as adjacent.
            same_page = nxt.get("page") == anchor_page
            close_y = abs(nxt.get("y_position_norm", 0) - anchor_y) < y_tol
            same_font = abs(nxt.get("font_size_avg", 0) - anchor_font) < font_tol
            # [:1] is safe on empty text; islower() also covers non-ASCII letters
            starts_lower = nxt_text.strip()[:1].islower()
            same_indent = abs(_left_edge(nxt) - anchor_x0) < indent_tol

            if not (same_page and close_y and same_font and (starts_lower or same_indent)):
                break

            parts.append(nxt_text)
            j += 1

        # Leave the text untouched when nothing was merged
        if len(parts) > 1:
            current["text"] = " ".join(parts)
        merged_blocks.append(current)
        i = j

    return merged_blocks


In [28]:
# Merge over every block in df; rows are NOT filtered by the is_heading
# flag first, so the count printed below is the number of merged blocks.
merged_headings = merge_headings_flex(df)  # Use all blocks
print(f"Detected and merged {len(merged_headings)} headings")


Detected and merged 131 headings


In [29]:
# Make sure the destination folder exists before writing
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# Persist the merged blocks as indented, non-ASCII-escaped UTF-8 JSON
with OUTPUT_PATH.open("w", encoding="utf-8") as f:
    json.dump(merged_headings, f, indent=2, ensure_ascii=False)

# Show the first ten merged entries, numbered from 1
for i, h in enumerate(merged_headings[:10], start=1):
    print(f"[{i}] Page {h['page']}: {h['text']}")


[1] Page 1: Generated by  Superset  | 16 Jul 2025, 12:23 PM IST
[2] Page 1: Chevron Engineering
[3] Page 1: GET / PGET
[4] Page 1: Job Profile Details
[5] Page 1: Placement Cycle
[6] Page 1: VJTI Placements 2026 Batch
[7] Page 1: Job Location
[8] Page 1: Bengaluru
[9] Page 1: Date of Visit
[10] Page 1: Jul 28, 2025


In [30]:
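# NOTE: legacy save cell, superseded by the cell above that writes to OUTPUT_PATH.
# Kept for reference only. Unlike the active code, it resolves the root as the
# PARENT of the current working directory.
# import os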
# import json

# # Go to root directory no matter where notebook is running from
# ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
# OUTPUT_DIR = os.path.join(ROOT_DIR, "output")
# os.makedirs(OUTPUT_DIR, exist_ok=True)

# OUTFILE = os.path.join(OUTPUT_DIR, "03_merged_headings.json")

# with open(OUTFILE, "w", encoding="utf-8") as f:
#     json.dump(merged_headings, f, indent=2, ensure_ascii=False)

# print("✅ Saved to:", OUTFILE)

