In [1]:
import fitz
def inspect_font_sizes(pdf_path: str, page_number: int):
    """
    Print font size and text spans from a specific page in the PDF.

    Args:
        pdf_path (str): Path to the PDF file.
        page_number (int): 0-based index of the page to inspect.
    """
    doc = fitz.open(pdf_path)
    
    if page_number < 0 or page_number >= len(doc):
        print("Invalid page number.")
        return

    print(f"\n--- Font Info from Page {page_number + 1} ---")
    page = doc[page_number]
    blocks = page.get_text("dict")["blocks"]

    for block in blocks:
        if "lines" in block:
            for line in block["lines"]:
                for span in line["spans"]:
                    size = round(span["size"], 2)
                    font = span["font"]
                    text = span["text"].strip()
                    color = span["color"]
                    x, y = round(span["bbox"][0], 2), round(span["bbox"][1], 2)
                    if text:
                        print(f"[x={x}, y={y}] Font Size: {size:>5} | Font: {font:<20} | color:{color} Text: {text}")

In [4]:
pdf_path = "../datasets/depression/nimh_Depression.pdf"

In [11]:
inspect_font_sizes(pdf_path, page_number=8)  # Change page as needed


--- Font Info from Page 9 ---
[x=410.76, y=604.15] Font Size:   7.0 | Font: GillSansStd          | color:16777215 Text: 
[x=54.0, y=70.55] Font Size:  11.5 | Font: GillSansStd-Italic   | color:16022304 Text: How do women experience depression?
[x=54.0, y=88.62] Font Size:  10.5 | Font: MinionPro-Regular    | color:5394214 Text: Depression is more common among women than among men.
[x=54.0, y=100.62] Font Size:  10.5 | Font: MinionPro-Regular    | color:5394214 Text: Biological, life cycle, hormonal and psychosocial factors unique
[x=54.0, y=112.62] Font Size:  10.5 | Font: MinionPro-Regular    | color:5394214 Text: to women may be linked to women’s higher depression rate.
[x=54.0, y=124.62] Font Size:  10.5 | Font: MinionPro-Regular    | color:5394214 Text: Researchers have shown that hormones directly affect brain chem-
[x=54.0, y=136.62] Font Size:  10.5 | Font: MinionPro-Regular    | color:5394214 Text: istry that controls emotions and mood. For example, women are
[x=54.0, y=148.6

In [15]:
import fitz
import os

def extract_depression_guide(pdf_path):
    doc = fitz.open(pdf_path)
    
    # State tracking
    current_main_topic = ""
    current_sub_topic = ""
    current_content = []
    current_pages = []
    extracted_records = []
    
    def save_current_record():
        """Save current topic record if we have content"""
        if current_content and current_main_topic:
            content_text = " ".join(current_content).strip()
            if content_text:
                record = {
                    'main_topic': current_main_topic,
                    'sub_topic': current_sub_topic,
                    'content': content_text,
                    'pages': list(set(current_pages)),  # Remove duplicates
                    'source': 'NIMH Depression Guide'
                }
                extracted_records.append(record)
    
    # Process each page
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")
        
        for block in blocks["blocks"]:
            if "lines" not in block:
                continue
                
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    font = span["font"]
                    size = span["size"]
                    color = span.get("color", 0)
                    
                    if not text:
                        continue
                    
                    # MAIN TOPIC (Font size 13.0, color 16022304)
                    if (size == 13.0 and color == 16022304 and "Italic" not in font):
                        # Save previous topic record
                        save_current_record()
                        
                        # Start new main topic
                        current_main_topic = text
                        current_sub_topic = ""  # Reset sub topic
                        current_content = []
                        current_pages = []
                        
                        print(f"Found main topic: {current_main_topic}")
                    
                    # SUB TOPIC (Font size 11.5, Italic, color 16022304)
                    elif (size == 11.5 and 
                          "Italic" in font and 
                          color == 16022304):
                        # Save previous content before starting new sub topic
                        save_current_record()
                        
                        # Start new sub topic
                        current_sub_topic = text
                        current_content = []
                        current_pages = []
                        
                        print(f"Found sub topic: {current_sub_topic}")
                    
                    # CONTENT (Everything else)
                    else:
                        # Skip very short text, numbers, and highlighted key points
                        if (len(text) > 3 and 
                            not text.isdigit() and
                            color != 16777215 and  # Skip key points (white/highlighted)
                            current_main_topic):
                            
                            current_content.append(text)
                            current_pages.append(page_num + 1)
    
    # Save final topic record
    save_current_record()
    
    doc.close()
    return extracted_records

In [16]:
depression_data = extract_depression_guide(pdf_path)

Found main topic: Contents
Found main topic: What Is Depression?
Found main topic: What are the different forms of depression?
Found main topic: What are the symptoms of depression?
Found main topic: What illnesses often co-exist with depression?
Found main topic: What causes depression?
Found sub topic: How do women experience depression?
Found sub topic: How do men experience depression?
Found sub topic: How do older adults experience depression?
Found sub topic: How do children and adolescents experience depression?
Found main topic: How is depression detected and treated?
Found sub topic: Medication
Found sub topic: What are the side effects of antidepressants?
Found sub topic: The most common side effects associated with
Found sub topic: SSRIs and SNRIs include:
Found sub topic: Tricyclic antidepressants also can cause side effects
Found sub topic: including:
Found sub topic: What about St. John’s wort?
Found sub topic: Psychotherapy
Found sub topic: Electroconvulsive Therapy
Foun

In [None]:
depression_data

In [19]:
import json
with open("../datasets/depression/depression_structured_data_extract.json", 'w', encoding='utf-8') as f:
    json.dump(depression_data, f, ensure_ascii=False, indent=2)