In [1]:
import fitz
import os

def extract_disaster_management(pdf_path, heading_size=23.5, size_tolerance=0.01):
    """Extract per-disaster guidance text from a PDF, segmented by heading font size.

    The PDF is walked span by span. A span whose font size matches
    ``heading_size`` starts a new record and its text becomes the record's
    ``disaster_type``. Every following span (until the next heading) that is
    longer than three characters and is not purely digits is appended to that
    record's content. Text appearing before the first heading is ignored.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.
        heading_size: Font size that identifies a disaster-type heading.
        size_tolerance: Absolute tolerance used when comparing a span's size
            with ``heading_size``, so that float rounding in the reported
            size does not cause a heading to be missed.

    Returns:
        A list of dicts, one per disaster section that has content, with keys
        ``'disaster_type'``, ``'content'`` (span texts joined by single
        spaces), ``'pages'`` (sorted, de-duplicated 1-based page numbers the
        content was found on) and ``'source'``.
    """
    extracted_records = []

    # State for the record currently being assembled
    current_disaster_type = ""
    current_content = []
    current_pages = []

    def save_current_record():
        """Append the in-progress record if it has both a heading and text."""
        if not (current_content and current_disaster_type):
            return
        content_text = " ".join(current_content).strip()
        if content_text:
            extracted_records.append({
                'disaster_type': current_disaster_type,
                'content': content_text,
                # sorted() gives a deterministic, ascending page list;
                # plain list(set(...)) yields arbitrary order.
                'pages': sorted(set(current_pages)),
                'source': 'Disaster Management India'
            })

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")

            for block in blocks["blocks"]:
                # Blocks without a "lines" entry carry no text spans; skip them
                if "lines" not in block:
                    continue

                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"].strip()
                        if not text:
                            continue

                        # DISASTER TYPE: span rendered at the heading font size
                        if abs(span["size"] - heading_size) <= size_tolerance:
                            # Flush the previous section before starting a new one
                            save_current_record()

                            current_disaster_type = text
                            current_content = []
                            current_pages = []

                            print(f"Found disaster type: {current_disaster_type}")

                        # CONTENT: skip very short text, bare numbers, and
                        # anything that precedes the first heading
                        elif (len(text) > 3 and
                              not text.isdigit() and
                              current_disaster_type):
                            current_content.append(text)
                            current_pages.append(page_num + 1)

        # Save final disaster record
        save_current_record()
    finally:
        # Release the document handle even if a page fails to parse
        doc.close()

    return extracted_records



In [2]:
# Run the extractor over the natural-hazards PDF; the result is a list of per-disaster records
disaster_data = extract_disaster_management(
    "../datasets/disaster_management/natural_hazards_disaster_management_india.pdf"
)


Found disaster type: Cyclone: Do's & Dont's
Found disaster type: Tsunami
Found disaster type: Heat wave: Do's & Dont's
Found disaster type: Landslide: Do's & Dont's
Found disaster type: Urban Floods: Do’s & Don’ts
Found disaster type: Floods: Do’s & Don’ts
Found disaster type: Earthquakes: Do’s & Don’ts
Found disaster type: Emergency Response Support System


In [5]:
import json
# Persist the extracted records as indented UTF-8 JSON, keeping non-ASCII characters verbatim
with open(
    "../datasets/disaster_management/disaster_management_structured_data_extract.json",
    mode='w',
    encoding='utf-8',
) as f:
    json.dump(obj=disaster_data, fp=f, indent=2, ensure_ascii=False)