## Understand structure

In [3]:
import fitz  # PyMuPDF

In [9]:
def inspect_font_sizes(pdf_path: str, page_number: int):
    """
    Print font size and text spans from a specific page in the PDF.

    Args:
        pdf_path (str): Path to the PDF file.
        page_number (int): 0-based index of the page to inspect.
    """
    doc = fitz.open(pdf_path)
    
    if page_number < 0 or page_number >= len(doc):
        print("Invalid page number.")
        return

    print(f"\n--- Font Info from Page {page_number + 1} ---")
    page = doc[page_number]
    blocks = page.get_text("dict")["blocks"]

    for block in blocks:
        if "lines" in block:
            for line in block["lines"]:
                for span in line["spans"]:
                    size = round(span["size"], 2)
                    font = span["font"]
                    text = span["text"].strip()
                    x, y = round(span["bbox"][0], 2), round(span["bbox"][1], 2)
                    if text:
                        print(f"[x={x}, y={y}] Font Size: {size:>5} | Font: {font:<20} | Text: {text}")

In [26]:
pdf_path = "../datasets/ayurveda/ayurveda_home_remedies.pdf"
inspect_font_sizes(pdf_path, page_number=12)  # Change page as needed


--- Font Info from Page 13 ---
[x=175.7, y=66.17] Font Size:  10.5 | Font: BookAntiqua-Bold     | Text: DHANIA
[x=99.9, y=84.03] Font Size:  10.5 | Font: BookAntiqua          | Text: (
[x=106.3, y=84.05] Font Size:  10.5 | Font: BookAntiqua-Italic   | Text: Conundrum sativum
[x=205.7, y=84.03] Font Size:  10.5 | Font: BookAntiqua          | Text: Linn., Dhanyaka)
[x=57.1, y=109.79] Font Size:  14.0 | Font: TimesNewRomanPSMT    | Text: Cold/ Cough
[x=175.7, y=113.83] Font Size:  10.5 | Font: BookAntiqua          | Text: 20 ml. decoction prepared from
[x=175.4, y=128.23] Font Size:  10.5 | Font: BookAntiqua          | Text: 5 gm.coarse powder with sugar
[x=175.4, y=142.63] Font Size:  10.5 | Font: BookAntiqua          | Text: and turmeric powder thrice a
[x=175.7, y=156.73] Font Size:  10.5 | Font: BookAntiqua          | Text: day.
[x=175.7, y=171.63] Font Size:  10.5 | Font: BookAntiqua          | Text: or use of dhania powder as herbal
[x=175.9, y=186.03] Font Size:  10.5 | Font: Book

In [17]:
import fitz

def extract_ailments_from_page(pdf_path, page_number):
    doc = fitz.open(pdf_path)
    page = doc[page_number]
    spans = []

    for block in page.get_text("dict")["blocks"]:
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                if span["bbox"][0] < 80:  # x < 80
                    spans.append({
                        "y": span["bbox"][1],
                        "text": span["text"].strip()
                    })

    # Sort by y
    spans.sort(key=lambda x: x["y"])

    # Group into ailments based on y-gap
    ailments = []
    current_ailment = []
    prev_y = None

    for span in spans:
        if prev_y is None or (span["y"] - prev_y) < 20:
            current_ailment.append(span["text"])
        else:
            if current_ailment:
                ailments.append(" ".join(current_ailment))
            current_ailment = [span["text"]]
        prev_y = span["y"]

    # Add last group
    if current_ailment:
        ailments.append(" ".join(current_ailment))

    return ailments

In [20]:
extract_ailments_from_page(pdf_path, page_number=12)

['Cold/ Cough',
 'Intestinal worms',
 'Sunstroke/ Dehydration',
 'Indigestion',
 'Fever']

In [45]:
import fitz  # PyMuPDF

def extract_ailments_and_remedies(pdf_path, page_number, source="Ayurveda"):
    doc = fitz.open(pdf_path)
    page = doc[page_number]
    text_data = []

    for block in page.get_text("dict")["blocks"]:
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text_data.append({
                    "x": span["bbox"][0],
                    "y": span["bbox"][1],
                    "text": span["text"].strip()
                })

    # Step 1: Extract Ailments (x < 80)
    left_texts = [t for t in text_data if t["x"] < 80]
    left_texts.sort(key=lambda t: t["y"])

    ailments = []
    current = []
    prev_y = None
    for span in left_texts:
        if prev_y is None or (span["y"] - prev_y) < 20:
            current.append(span)
        else:
            if current:
                ailment_text = " ".join([s["text"] for s in current])
                ailments.append({
                    "name": ailment_text,
                    "y_start": current[0]["y"]
                })
            current = [span]
        prev_y = span["y"]
    if current:
        ailment_text = " ".join([s["text"] for s in current])
        ailments.append({
            "name": ailment_text,
            "y_start": current[0]["y"]
        })

    # Step 2: Remedies (x > 100)
    remedies_texts = [t for t in text_data if t["x"] > 100]

    # Add y_end and extract remedies
    for i in range(len(ailments)):
        start_y = ailments[i]["y_start"] - 5
        end_y = (
            ailments[i + 1]["y_start"] - 15
            if i + 1 < len(ailments)
            else float("inf")
        )
        remedy_lines = [
            t["text"] for t in remedies_texts
            if start_y <= t["y"] <= end_y
        ]
        remedies_concat = " ".join(remedy_lines).strip()
        ailments[i] = {
            "name": ailments[i]["name"],
            "remedies": remedies_concat,
            "page": page_number + 1,  # human-readable page number
            "source": source
        }
    return ailments

In [None]:
extract_ailments_and_remedies(pdf_path, page_number=13)

In [27]:
import fitz  # PyMuPDF

def extract_herb_data_from_page(pdf_path, page_number):
    doc = fitz.open(pdf_path)
    page = doc[page_number]

    # Step 1: Get all text with position info
    text_data = []
    for block in page.get_text("dict")["blocks"]:
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text_data.append({
                    "x": span["bbox"][0],
                    "y": span["bbox"][1],
                    "font_size": span["size"],
                    "font": span["font"],
                    "text": span["text"].strip()
                })

    # Step 2: Extract herb name (Bold) and scientific name (Italic, left side)
    herb_name = ""
    scientific_name = ""
    for t in text_data:
        if "Bold" in t["font"] and t["text"].isupper():
            herb_name = t["text"]
        elif "Italic" in t["font"] and t["x"] < 150:
            scientific_name += t["text"] + " "

    scientific_name = scientific_name.strip()

    # Step 3: Extract ailment lines (x < 80)
    ailment_lines = [t for t in text_data if t["x"] < 80]
    ailment_lines.sort(key=lambda t: t["y"])

    # Group ailments using y-gap ≥ 20
    ailments = []
    current = {"name": "", "y": None}
    for i, t in enumerate(ailment_lines):
        if i == 0 or t["y"] - ailment_lines[i-1]["y"] >= 20:
            if current["name"]:
                ailments.append(current.copy())
            current = {"name": t["text"], "y": t["y"]}
        else:
            current["name"] += " " + t["text"]

    # Add last ailment
    ailments.append(current.copy())

    # Step 4: Get remedies between ailment blocks
    for i, ailment in enumerate(ailments):
        y_start = ailment["y"] - 5
        y_end = ailments[i + 1]["y"] - 15 if i + 1 < len(ailments) else 10000
        remedy_lines = [
            t["text"] for t in text_data
            if t["x"] > 100 and y_start <= t["y"] <= y_end
        ]

        # Remove digit-only page number from end if present
        if remedy_lines and remedy_lines[-1].isdigit():
            remedy_lines = remedy_lines[:-1]

        # Combine into a single string
        remedy_text = " ".join(remedy_lines)
        ailment["remedies"] = remedy_text

        # Clean internal data
        del ailment["y"]

    return {
        "herb": herb_name,
        "scientific_name": scientific_name,
        "ailments": ailments
    }

In [44]:
extract_herb_data_from_page(pdf_path, page_number=22)

{'herb': 'LAUNG',
 'scientific_name': 'Syzygium aromaticum',
 'ailments': [{'name': '( Syzygium aromaticum',
   'remedies': '(L.) Merr & Perry, Lavanga)'},
  {'name': 'Cough',
   'remedies': 'To be chewed frequently or 1 gm. powder with honey 2-3 times in divided doses. 20 ml warm deoction prepared by puting 1 gm. clove 3-4 times daily. It is usefull both in dry and productive cough.'},
  {'name': 'Cold / Hiccough',
   'remedies': '1-2 gm. of powder with honey in three divided doses.'},
  {'name': 'Indigestion', 'remedies': '1-2 gm. powder with warm water.'},
  {'name': 'Tooth ache',
   'remedies': 'Cr >hed clove should be kept in carious tooth.'},
  {'name': 'Bad breath',
   'remedies': 'Small piece should be chewed frequently.'},
  {'name': 'Ear pain',
   'remedies': 'Warm coconut oil prepared by boiling the pow der of Laung  should be filled in ear twice daily (Do not use w hen there is discharge).'}]}

In [None]:
with open("../datasets/education/education_structured_data_extract.json", 'w', encoding='utf-8') as f:
    json.dump(structured_data, f, ensure_ascii=False, indent=2)