In [29]:
import fitz
def inspect_font_sizes(pdf_path: str, page_number: int):
    """
    Print position, font size, font name, colour and text of every non-empty
    text span on one page of a PDF.

    This is a diagnostic helper: its output is meant to be read by a human to
    find the styling (size / font / colour) that distinguishes headings from
    body text.

    Args:
        pdf_path (str): Path to the PDF file.
        page_number (int): 0-based index of the page to inspect.

    Returns:
        None. Prints "Invalid page number." and returns early when
        ``page_number`` is outside ``[0, len(doc))``.
    """
    doc = fitz.open(pdf_path)
    try:
        if page_number < 0 or page_number >= len(doc):
            print("Invalid page number.")
            return

        # The banner shows the 1-based page number for readability.
        print(f"\n--- Font Info from Page {page_number + 1} ---")
        page = doc[page_number]

        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" key carry no text spans and are skipped.
            for line in block.get("lines", []):
                for span in line["spans"]:
                    size = round(span["size"], 2)
                    font = span["font"]
                    text = span["text"].strip()
                    color = span["color"]
                    # Top-left corner of the span's bounding box.
                    x, y = round(span["bbox"][0], 2), round(span["bbox"][1], 2)
                    if text:
                        print(f"[x={x}, y={y}] Font Size: {size:>5} | Font: {font:<20} | color:{color} Text: {text}")
    finally:
        # Release the file handle on every path, including the early return
        # above and any exception raised while reading the page.
        doc.close()

In [30]:
# Source PDF used by the cells below; the path is relative to the notebook's working directory.
pdf_path = "../datasets/rice_diseases/india_rice_diseases.pdf"

In [33]:
# Dump span-level styling for one page; page_number is 0-based, so 1 selects the second page.
inspect_font_sizes(pdf_path, page_number=1)  # Change page as needed


--- Font Info from Page 2 ---
[x=95.25, y=77.59] Font Size:  10.5 | Font: ArialMT              | color:0 Text: ●​
[x=113.25, y=77.59] Font Size:  10.5 | Font: ArialMT              | color:0 Text: The size and shape of the spots vary on different rice varieties.
[x=218.66, y=120.65] Font Size:  10.5 | Font: Arial-BoldMT         | color:16711680 Text: Leaf Blast
[x=95.25, y=145.54] Font Size:  10.5 | Font: ArialMT              | color:0 Text: ●​
[x=113.25, y=145.54] Font Size:  10.5 | Font: ArialMT              | color:0 Text: Severe cases of infection - entire crop give a blasted or burnt
[x=113.25, y=167.1] Font Size:  10.5 | Font: ArialMT              | color:0 Text: appearance- hence the name "BLAST"
[x=95.25, y=188.66] Font Size:  10.5 | Font: ArialMT              | color:0 Text: ●​
[x=113.25, y=188.66] Font Size:  10.5 | Font: ArialMT              | color:0 Text: Severe cases - lodging of crop (after ear emergence)
[x=243.75, y=221.22] Font Size:  10.5 | Font: ArialMT             

In [19]:
def extract_hierarchical_combinations(pdf_path):
    """
    List every (disease, causal organism, main heading, sub heading)
    combination found in the PDF, in reading order.

    Spans are classified purely by their styling:

    * colour 16750848, size 21, bold non-italic font -> disease title
      (also clears organism, main heading and sub heading)
    * colour 16750848, size 21, "BoldItalic" font    -> causal organism
    * colour 16776991, size 12                       -> main heading
      (also clears the sub heading)
    * colour 16711680                                -> sub heading

    A record is emitted each time a sub heading is seen while both a disease
    and a main heading are active.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        list[dict]: One dict per sub heading with the keys ``disease``,
        ``causal_organism``, ``main_heading``, ``sub_heading`` and ``page``
        (1-based page number).
    """
    # Span colours are integers; the hex forms are given for readability.
    title_color = 16750848         # 0xFF9900
    main_heading_color = 16776991  # 0xFFFF1F
    sub_heading_color = 16711680   # 0xFF0000

    # State tracking
    current_disease = ""
    current_organism = ""
    current_main_heading = ""
    current_sub_heading = ""

    combinations = []

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" key carry no text spans.
                for line in block.get("lines", []):
                    for span in line["spans"]:
                        text = span["text"].strip()
                        # Lone parentheses (around the organism name) are noise.
                        if not text or text in ("(", ")"):
                            continue

                        font = span["font"]
                        # Sizes are floats and may carry extraction noise
                        # (e.g. 21.000001); round before comparing to the
                        # nominal sizes so such spans are not silently missed.
                        size = round(span["size"], 2)
                        color = span.get("color", 0)
                        is_title = color == title_color and size == 21.0

                        if is_title and "Bold" in font and "Italic" not in font:
                            current_disease = text
                            current_organism = ""
                            current_main_heading = ""
                            current_sub_heading = ""

                        elif is_title and "BoldItalic" in font:
                            current_organism = text

                        elif color == main_heading_color and size == 12.0:
                            current_main_heading = text
                            current_sub_heading = ""  # Reset sub heading

                        elif color == sub_heading_color:
                            current_sub_heading = text

                            # `text` is non-empty here, so only the two outer
                            # levels need checking before emitting a record.
                            if current_disease and current_main_heading:
                                combinations.append({
                                    'disease': current_disease,
                                    'causal_organism': current_organism,
                                    'main_heading': current_main_heading,
                                    'sub_heading': current_sub_heading,
                                    'page': page_num + 1
                                })
    finally:
        # Always release the file handle, even if a page fails to parse.
        doc.close()

    return combinations

In [None]:
# Show the heading combinations detected in the PDF; as the cell's last expression the returned list is displayed.
extract_hierarchical_combinations(pdf_path)

In [40]:
import fitz
import os
import re

def extract_rice_diseases_with_images(pdf_path, output_dir):
    """
    Extract heading-scoped text records from a PDF and save its embedded images.

    Spans are classified purely by their styling:

    * colour 16750848, size 21, bold non-italic font -> disease title
    * colour 16750848, size 21, "BoldItalic" font    -> causal organism
    * colour 16776991, size 12                       -> main heading
    * colour 16711680                                -> sub heading
    * colour 255                                     -> image caption (never content)
    * anything else                                  -> body content

    Body content is accumulated until the next disease / main heading /
    sub heading, at which point it is flushed as one record. Content seen
    before a disease and a main heading are active is ignored.

    Every image on every page is written to ``output_dir`` as
    ``{page}_{caption}_{index}.png``. Captions are paired with images by
    position only: the i-th image listed for the page gets the i-th caption
    in top-to-bottom order. Each sub heading is given the next not-yet-assigned
    image of the page it appears on, if any.

    Args:
        pdf_path: Path to the PDF file.
        output_dir: Directory for the extracted PNG files; created if missing.

    Returns:
        list[dict]: Records with the keys ``disease``, ``causal_organism``,
        ``main_heading``, ``sub_heading``, ``content``, ``pages`` (sorted
        1-based page numbers), ``images`` (file names relative to
        ``output_dir``) and ``source``.
    """
    # Span colours are integers; the hex forms are given for readability.
    title_color = 16750848         # 0xFF9900
    main_heading_color = 16776991  # 0xFFFF1F
    sub_heading_color = 16711680   # 0xFF0000
    caption_color = 255            # 0x0000FF

    doc = fitz.open(pdf_path)

    # State tracking
    current_disease = ""
    current_organism = ""
    current_main_heading = ""
    current_sub_heading = ""

    # Content and image accumulation
    current_content = []
    current_pages = []
    current_images = []
    extracted_records = []

    def clean_filename(text):
        """Clean text for use as filename"""
        # Remove special characters and replace spaces with underscores
        cleaned = re.sub(r'[^\w\s-]', '', text)
        cleaned = re.sub(r'\s+', '_', cleaned)
        return cleaned[:50]  # Limit length

    def extract_and_save_images(page, page_num, text_dict):
        """Save the page's images as PNG and return the saved file names.

        ``text_dict`` is the page's ``get_text("dict")`` result, passed in so
        the page text is only extracted once per page.
        """
        images = page.get_images(full=True)
        image_paths = []

        if not images:
            return image_paths

        # Collect caption-coloured spans that are long enough to be meaningful.
        captions = []
        for block in text_dict["blocks"]:
            for line in block.get("lines", []):
                for span in line["spans"]:
                    if span.get("color", 0) != caption_color:
                        continue
                    caption_text = span["text"].strip()
                    if len(caption_text) > 3:
                        captions.append({
                            'text': caption_text,
                            'y': span["bbox"][1]
                        })

        # Top-to-bottom order; pairing with images is positional only.
        captions.sort(key=lambda c: c['y'])

        for i, img in enumerate(images):
            try:
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)

                # PNG output needs gray or RGB. Normalise everything that is
                # not already RGB: grayscale (as before) and also CMYK, which
                # previously reached save() unconverted and was dropped by the
                # except clause below.
                if pix.n - pix.alpha != 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                caption = f"image_{i+1}"  # Default caption
                if i < len(captions):
                    # Fall back to the default when cleaning leaves nothing
                    # (caption made only of punctuation).
                    caption = clean_filename(captions[i]['text']) or caption

                filename = f"{page_num}_{caption}_{i+1}.png"
                pix.save(os.path.join(output_dir, filename))
                image_paths.append(filename)

                pix = None  # Drop the reference so the pixmap can be freed

            except Exception as e:
                # Best effort: one unreadable image must not abort the run.
                print(f"Error extracting image {i} from page {page_num}: {e}")
                continue

        return image_paths

    def save_current_record():
        """Save current state as a record if we have content"""
        if not (current_content and current_disease and current_main_heading):
            return
        content_text = " ".join(current_content).strip()
        if content_text:
            extracted_records.append({
                'disease': current_disease,
                'causal_organism': current_organism,
                'main_heading': current_main_heading,
                'sub_heading': current_sub_heading,
                'content': content_text,
                # sorted() makes the page order deterministic in the output.
                'pages': sorted(set(current_pages)),
                'images': current_images.copy(),
                'source': 'Rice Diseases'
            })

    try:
        os.makedirs(output_dir, exist_ok=True)

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text_dict = page.get_text("dict")

            page_images = extract_and_save_images(page, page_num + 1, text_dict)
            page_image_index = 0  # Next unassigned image on this page

            for block in text_dict["blocks"]:
                # Blocks without a "lines" key carry no text spans.
                for line in block.get("lines", []):
                    for span in line["spans"]:
                        text = span["text"].strip()
                        # Lone parentheses (around the organism name) are noise.
                        if not text or text in ("(", ")"):
                            continue

                        font = span["font"]
                        # Sizes are floats and may carry extraction noise;
                        # round before comparing to the nominal sizes.
                        size = round(span["size"], 2)
                        color = span.get("color", 0)
                        is_title = color == title_color and size == 21.0

                        # DISEASE (Chapter level) - RESETS everything
                        if is_title and "Bold" in font and "Italic" not in font:
                            save_current_record()

                            current_disease = text
                            current_organism = ""
                            current_main_heading = ""
                            current_sub_heading = ""
                            current_content = []
                            current_pages = []
                            current_images = []

                        # CAUSAL ORGANISM (Chapter level)
                        elif is_title and "BoldItalic" in font:
                            current_organism = text

                        # MAIN HEADING (Unit level)
                        elif color == main_heading_color and size == 12.0:
                            save_current_record()

                            current_main_heading = text
                            current_sub_heading = ""
                            current_content = []
                            current_pages = []
                            current_images = []
                            # NOTE(review): restarting at 0 means an image
                            # already given to a sub heading earlier on this
                            # page can be assigned again - confirm intended.
                            page_image_index = 0

                        # SUB HEADING (Sub-unit level)
                        elif color == sub_heading_color:
                            save_current_record()

                            current_sub_heading = text
                            current_content = []
                            current_pages = []
                            current_images = []

                            # Assign the next available image to this sub heading
                            if page_image_index < len(page_images):
                                current_images.append(page_images[page_image_index])
                                page_image_index += 1

                        # CONTENT (Regular text)
                        elif (len(text) > 3 and
                              not text.isdigit() and
                              color != caption_color and  # Skip image captions
                              current_disease and
                              current_main_heading):
                            current_content.append(text)
                            current_pages.append(page_num + 1)

        # Flush whatever was accumulated after the last heading.
        save_current_record()
    finally:
        # Always release the file handle, even if extraction fails midway.
        doc.close()

    return extracted_records



In [43]:
# Extracted images go next to the source PDF. The path is relative to the
# notebook's working directory, like pdf_path and the JSON export path, so the
# notebook does not depend on one user's home-directory layout.
output_dir = "../datasets/rice_diseases/images"
extracted_records = extract_rice_diseases_with_images(pdf_path, output_dir)

In [46]:
import json
# Persist the extracted records as indented UTF-8 JSON; ensure_ascii=False writes non-ASCII characters unescaped.
with open("../datasets/rice_diseases/rice_diseases_structured_data_extract.json", 'w', encoding='utf-8') as f:
    json.dump(extracted_records, f, ensure_ascii=False, indent=2)