# In [2]:
import fitz  # PyMuPDF
from PIL import Image
import io
import os
import re
import json

def hex_color(value):
    """Format an integer as a ``#``-prefixed lowercase hexadecimal string.

    The hex digits are zero-padded to a minimum width of six, so
    ``0x00AEEF`` becomes ``"#00aeef"``.
    """
    return f"#{value:06x}"

def merge_uppercase_spans(spans):
    """Combine the ``"text"`` of a sequence of span dicts into one string.

    Consecutive spans whose text is all upper-case, or is a single
    alphabetic character, are glued together with no separator. Every other
    span is kept as its own piece. The pieces are then joined with single
    spaces and the result is stripped of leading/trailing whitespace.
    """
    pieces = []
    pending = []

    def flush_pending():
        # Emit the run of upper-case / single-letter fragments collected so far.
        if pending:
            pieces.append("".join(pending))
            pending.clear()

    for item in spans:
        fragment = item["text"]
        is_single_letter = len(fragment) == 1 and fragment.isalpha()
        if fragment.isupper() or is_single_letter:
            pending.append(fragment)
            continue
        flush_pending()
        pieces.append(fragment)

    flush_pending()
    return " ".join(pieces).strip()

# Font size (compared after rounding the span size to one decimal) and hex
# colour of the first span that mark a text line as a main heading.
MAIN_FONT_SIZE = 24.0
MAIN_COLOR = "#00aeef"
# Font size and hex colour of the first span that mark a line as a sub-heading.
SUB_FONT_SIZE = 14.0
SUB_COLOR = "#00aeef"
# Maximum vertical distance between the origins of consecutive sub-heading
# lines for them to be merged into one multi-line sub-heading.
Y_THRESHOLD = 20



# Matches figure labels such as "Fig. 1.2" or "Fig 3.4a" in caption text.
_FIG_LABEL_RE = re.compile(r"Fig\.?\s*[\d\.]+[a-zA-Z]?")
# How far below an image's bottom edge a text block may start and still be
# searched for a figure label (same units as the page coordinates).
_CAPTION_SEARCH_DEPTH = 100
# Images whose placed width AND height both exceed this are skipped.
_MAX_IMAGE_SIDE = 250


def _new_chunk(main_heading, sub_heading, page_number):
    """Return an empty chunk record for a sub-section starting on page_number."""
    return {
        "main_heading": main_heading,
        "sub_heading": sub_heading,
        "enrichment": None,
        "page_start": page_number,
        "page_end": None,
        "content": "",
        "images": [],
    }


def _close_chunk(chunk, page_number, out):
    """Stamp page_end on chunk and append it to out; no-op without a sub-heading."""
    if chunk.get("sub_heading"):
        chunk["page_end"] = page_number
        out.append(chunk)


def _find_figure_label(bbox, text_blocks):
    """Return the figure label from the first text block just below bbox, or None.

    text_blocks must already be sorted by their "y0" value.
    """
    for block in text_blocks:
        if bbox.y1 <= block["y0"] <= bbox.y1 + _CAPTION_SEARCH_DEPTH:
            match = _FIG_LABEL_RE.search(block["text"])
            if match:
                return match.group(0).replace(" ", "")
    return None


def _save_page_images(doc, page, page_number, image_dir):
    """Write the page's qualifying images as PNG files and return their records."""
    # Sort once per page instead of once per image.
    text_blocks = sorted(
        (
            {"x0": b[0], "y0": b[1], "x1": b[2], "y1": b[3], "text": b[4]}
            for b in page.get_text("blocks")
            if isinstance(b[4], str)
        ),
        key=lambda b: b["y0"],
    )

    records = []
    # The index counts every image on the page, including skipped ones, so
    # file names stay stable regardless of which images are filtered out.
    for index, img in enumerate(page.get_images(full=True), start=1):
        xref = img[0]
        rects = page.get_image_rects(xref)
        if not rects:
            continue
        bbox = rects[0]
        if bbox.width > _MAX_IMAGE_SIDE and bbox.height > _MAX_IMAGE_SIDE:
            continue

        fig_name = _find_figure_label(bbox, text_blocks)

        # Decode the pixmap only after the cheap skip checks have passed.
        pix = fitz.Pixmap(doc, xref)
        if pix.n - pix.alpha != 3 or pix.alpha:
            # Convert to RGB if not already RGB or has alpha
            pix = fitz.Pixmap(fitz.csRGB, pix)

        file_path = os.path.join(image_dir, f"pagenumber_{page_number}_image{index}.png")
        with open(file_path, "wb") as f:
            f.write(pix.tobytes("png"))

        records.append({
            "caption": fig_name if fig_name else "",
            "page": page_number,
            "path": file_path,
        })
    return records


def extract_structured_data_with_images(pdf_path, start_page=13, end_page=253, image_dir="../datasets/education/images"):
    """Split a PDF into sub-heading chunks and save the images found on each page.

    Text lines are classified by the rounded size and colour of their first
    span: MAIN_FONT_SIZE/MAIN_COLOR lines become the current main heading,
    SUB_FONT_SIZE/SUB_COLOR lines start (or, when within Y_THRESHOLD of the
    previous sub-heading line, extend) a sub-heading, and everything else is
    appended to the content of the chunk that is currently open.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        start_page: Zero-based index of the first page to process.
        end_page: Zero-based index one past the last page to process. Values
            beyond the end of the document are clamped to the page count.
        image_dir: Directory (created if missing) that receives the PNG files.

    Returns:
        A list of chunk dicts with the keys "main_heading", "sub_heading",
        "enrichment", "page_start", "page_end", "content" and "images".
        Page numbers in the result are one-based.

    Notes:
        * Images are handled before the text of the same page, so they are
          attached to the chunk that was open when the page started.
        * A main heading closes the open chunk; text between that heading and
          the next sub-heading is not attached to any chunk.
    """
    os.makedirs(image_dir, exist_ok=True)
    doc = fitz.open(pdf_path)
    structured_data = []
    try:
        # Never index past the real end of the document.
        last_page = min(end_page, len(doc))

        current_main_heading = None
        current_chunk = {}
        sub_buffer = []
        last_y = None

        for page_num in range(start_page, last_page):
            page = doc[page_num]
            page_number = page_num + 1
            blocks = page.get_text("dict")["blocks"]

            # 🖼️ IMAGE PROCESSING
            image_records = _save_page_images(doc, page, page_number, image_dir)
            if current_chunk.get("sub_heading"):
                current_chunk["images"].extend(image_records)

            # 📝 TEXT PROCESSING
            for block in blocks:
                for line in block.get("lines", []):
                    spans = line.get("spans", [])
                    if not spans:
                        continue

                    merged_text = merge_uppercase_spans(spans)
                    font_size = round(spans[0]["size"], 1)
                    font_color = hex_color(spans[0]["color"])
                    y_coord = spans[0]["origin"][1]

                    if font_size == MAIN_FONT_SIZE and font_color == MAIN_COLOR:
                        _close_chunk(current_chunk, page_number, structured_data)
                        # Drop the reference so the closed chunk is neither
                        # appended a second time nor mutated by later lines.
                        current_chunk = {}
                        current_main_heading = merged_text
                        continue

                    is_sub_heading = font_size == SUB_FONT_SIZE and font_color == SUB_COLOR

                    # Continuation line of a multi-line sub-heading.
                    if is_sub_heading and sub_buffer and abs(y_coord - last_y) <= Y_THRESHOLD:
                        sub_buffer.append(merged_text)
                        last_y = y_coord
                        continue

                    # Any other line ends a pending sub-heading: it becomes
                    # the title of a freshly opened chunk.
                    if sub_buffer:
                        _close_chunk(current_chunk, page_number, structured_data)
                        current_chunk = _new_chunk(
                            current_main_heading, " ".join(sub_buffer), page_number
                        )
                        sub_buffer = []

                    if is_sub_heading:
                        sub_buffer.append(merged_text)
                        last_y = y_coord
                        continue

                    if current_chunk.get("sub_heading"):
                        current_chunk["content"] += " " + merged_text
                        # Caption-like lines are also recorded next to the
                        # saved images (without a file path).
                        if merged_text.lower().startswith("fig"):
                            current_chunk["images"].append({
                                "caption": merged_text,
                                "page": page_number
                            })

        _close_chunk(current_chunk, last_page, structured_data)
    finally:
        # Release the document even when extraction fails part-way.
        doc.close()
    return structured_data

# 🧪 Example usage: run the extractor on the PDF at pdf_path with the default
# page range and image directory. This executes as soon as the cell/module runs.
pdf_path = "../datasets/education/science_text_book_class8_india.pdf"
structured_data = extract_structured_data_with_images(pdf_path)

# In [5]:
# Persist the extracted chunks as indented UTF-8 JSON; ensure_ascii=False writes
# non-ASCII characters as-is instead of \u-escaping them.
with open("../datasets/education/education_structured_data_extract.json", 'w', encoding='utf-8') as f:
    json.dump(structured_data, f, ensure_ascii=False, indent=2)