In [2]:
import re
import nltk
from typing import Dict, List, Optional
from transformers import pipeline
import spacy

class MedicalDocumentParser:
    """Class for parsing different types of medical documents"""
    
    def __init__(self):
        # Load NLP models
        self.nlp = spacy.load("en_core_web_md")
        # Initialize regex patterns for different document sections
        self.section_patterns = {
            "patient_info": re.compile(r"(?:Patient Information|Patient Details|Personal Details)(?:\s*:)?(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)", re.DOTALL),
            "diagnosis": re.compile(r"(?:Diagnosis|Clinical Impression|Assessment)(?:\s*:)?(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)", re.DOTALL),
            # Add more patterns for other sections
        }
    
    def parse_document(self, document_text: str, document_type: str = "discharge") -> Dict:
        """
        Parse the document text into structured sections
        
        Args:
            document_text: Raw text of the medical document
            document_type: Type of document (discharge, claim, etc.)
            
        Returns:
            Dictionary containing structured document sections
        """
        # Process document with NLP
        doc = self.nlp(document_text)
        
        # Extract sections based on document type
        if document_type == "discharge":
            return self._parse_discharge_summary(document_text, doc)
        elif document_type == "claim":
            return self._parse_claim_document(document_text, doc)
        else:
            return self._parse_generic_medical_document(document_text, doc)
    
    def _parse_discharge_summary(self, text: str, doc) -> Dict:
        """Parse discharge summary into structured sections"""
        sections = {}
        
        # Extract patient information
        patient_match = self.section_patterns["patient_info"].search(text)
        if patient_match:
            sections["patient_info"] = self._extract_patient_details(patient_match.group(1))
        
        # Extract diagnosis
        diagnosis_match = self.section_patterns["diagnosis"].search(text)
        if diagnosis_match:
            sections["diagnosis"] = diagnosis_match.group(1).strip()
        
        # Extract medications using NER
        medications = []
        for ent in doc.ents:
            if ent.label_ == "MEDICATION" or ent.label_ == "CHEMICAL":
                medications.append(ent.text)
        sections["medications"] = list(set(medications))  # Remove duplicates
        
        # Extract other relevant sections...
        
        return sections
    
    def _parse_claim_document(self, text: str, doc) -> Dict:
        """Parse insurance claim document into structured sections"""
        # Implementation for claim documents
        sections = {}
        
        # Extract policy details
        policy_info = self._extract_policy_details(text)
        sections["policy_info"] = policy_info
        
        # Extract claim amount details
        sections["claim_details"] = self._extract_claim_details(text)
        
        # Extract dates using NER
        dates = {}
        date_labels = ["DATE_ADMISSION", "DATE_DISCHARGE", "DATE_CLAIM"]
        for ent in doc.ents:
            if ent.label_ == "DATE":
                # Try to classify the type of date
                context = text[max(0, ent.start_char-30):min(len(text), ent.end_char+30)]
                for label in date_labels:
                    if label.lower().replace("date_", "") in context.lower():
                        dates[label.lower().replace("date_", "")] = ent.text
        sections["dates"] = dates
        
        return sections
    
    def _extract_patient_details(self, text: str) -> Dict:
        """Extract structured patient information"""
        details = {}
        
        # Extract patient name
        name_match = re.search(r"(?:Name|Patient Name)(?:\s*:)?\s*([A-Za-z\s.]+)", text)
        if name_match:
            details["name"] = name_match.group(1).strip()
        
        # Extract age
        age_match = re.search(r"(?:Age|Years)(?:\s*:)?\s*(\d+)\s*(?:years|yrs)?", text)
        if age_match:
            details["age"] = int(age_match.group(1))
        
        # Extract gender
        gender_match = re.search(r"(?:Gender|Sex)(?:\s*:)?\s*([A-Za-z]+)", text)
        if gender_match:
            details["gender"] = gender_match.group(1).strip()
        
        # Extract contact number
        contact_match = re.search(r"(?:Contact|Phone|Mobile)(?:\s*:)?\s*(\+?\d[\d\s-]{8,})", text)
        if contact_match:
            details["contact"] = contact_match.group(1).strip()
        
        # Extract Aadhaar number (with proper format validation)
        aadhaar_match = re.search(r"(?:Aadhaar|ID|Identity Number)(?:\s*:)?\s*(\d{4}\s*\d{4}\s*\d{4})", text)
        if aadhaar_match:
            details["aadhaar"] = aadhaar_match.group(1).replace(" ", "")
        
        return details
    
    def _extract_policy_details(self, text: str) -> Dict:
        """Extract insurance policy details from text"""
        details = {}
        
        # Extract policy number
        policy_match = re.search(r"(?:Policy\sNumber|Policy\sNo)(?:\s*:)?\s*([A-Z0-9-/]+)", text)
        if policy_match:
            details["policy_number"] = policy_match.group(1).strip()
        
        # Extract insurer name
        insurer_match = re.search(r"(?:Insurer|Insurance\sCompany)(?:\s*:)?\s*([A-Za-z\s.]+)", text)
        if insurer_match:
            details["insurer"] = insurer_match.group(1).strip()
        
        # Extract sum insured
        sum_match = re.search(r"(?:Sum\sInsured|Coverage\sAmount)(?:\s*:)?\s*(?:Rs\.?|INR)?\s*([\d,]+)", text)
        if sum_match:
            # Remove commas and convert to integer
            details["sum_insured"] = int(sum_match.group(1).replace(",", ""))
        
        return details
    
    def _extract_claim_details(self, text: str) -> Dict:
        """Extract claim amount and details"""
        details = {}
        
        # Extract claimed amount
        claimed_match = re.search(r"(?:Claimed\sAmount|Total\sClaim)(?:\s*:)?\s*(?:Rs\.?|INR)?\s*([\d,]+)", text)
        if claimed_match:
            details["claimed_amount"] = int(claimed_match.group(1).replace(",", ""))
        
        # Extract approved amount if available
        approved_match = re.search(r"(?:Approved\sAmount|Sanctioned\sAmount)(?:\s*:)?\s*(?:Rs\.?|INR)?\s*([\d,]+)", text)
        if approved_match:
            details["approved_amount"] = int(approved_match.group(1).replace(",", ""))
        
        # Extract claim status
        status_match = re.search(r"(?:Status|Claim\sStatus)(?:\s*:)?\s*([A-Za-z\s]+)", text)
        if status_match:
            details["status"] = status_match.group(1).strip()
        
        return details


class MedicalDocumentSummarizer:
    """Class for summarizing medical documents"""
    
    def __init__(self):
        # Initialize the document parser
        self.parser = MedicalDocumentParser()
        
        # Initialize NLP summarization pipeline
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        
        # Initialize different summary templates
        self.templates = {
            "discharge": self._discharge_summary_template,
            "claim": self._claim_summary_template,
            "generic": self._generic_summary_template
        }
    
    def summarize(self, document_text: str, document_type: str = "discharge", 
                  max_length: int = 150, format_type: str = "plain") -> str:
        """
        Generate a concise summary of the medical document
        
        Args:
            document_text: Raw text of the medical document
            document_type: Type of document (discharge, claim, etc.)
            max_length: Maximum length of the detailed summary in tokens
            format_type: Output format (plain, html, json)
            
        Returns:
            Formatted summary of the document
        """
        # Parse the document
        parsed_doc = self.parser.parse_document(document_text, document_type)
        
        # Generate template-based summary
        if document_type in self.templates:
            template_fn = self.templates[document_type]
            summary = template_fn(parsed_doc)
        else:
            summary = self.templates["generic"](parsed_doc)
        
        # For longer documents, augment with NLP-based summary
        if len(document_text) > 1000:
            # Extract main body for summarization (excluding headers, patient info)
            main_body = self._extract_main_body(document_text)
            if main_body:
                nlp_summary = self.summarizer(main_body, max_length=max_length)[0]['summary_text']
                summary += f"\n\nKey Points:\n{nlp_summary}"
        
        # Format the summary according to requested format
        return self._format_summary(summary, format_type)
    
    def _discharge_summary_template(self, parsed_doc: Dict) -> str:
        """Create summary from parsed discharge document"""
        summary = "DISCHARGE SUMMARY\n\n"
        
        # Add patient information if available
        if "patient_info" in parsed_doc:
            patient = parsed_doc["patient_info"]
            summary += f"Patient: {patient.get('name', 'N/A')}"
            if 'age' in patient and 'gender' in patient:
                summary += f" ({patient.get('age')}Y, {patient.get('gender')})\n"
            else:
                summary += "\n"
        
        # Add diagnosis if available
        if "diagnosis" in parsed_doc:
            summary += f"Diagnosis: {parsed_doc['diagnosis']}\n"
        
        # Add medications if available
        if "medications" in parsed_doc and parsed_doc["medications"]:
            summary += f"Medications: {', '.join(parsed_doc['medications'][:5])}"
            if len(parsed_doc["medications"]) > 5:
                summary += f" and {len(parsed_doc['medications']) - 5} more"
            summary += "\n"
        
        # Add other important information
        if "treatment" in parsed_doc:
            summary += f"Treatment: {parsed_doc['treatment']}\n"
        
        if "follow_up" in parsed_doc:
            summary += f"Follow-up: {parsed_doc['follow_up']}\n"
        
        return summary
    
    def _claim_summary_template(self, parsed_doc: Dict) -> str:
        """Create summary from parsed claim document"""
        summary = "INSURANCE CLAIM SUMMARY\n\n"
        
        # Add patient information
        if "patient_info" in parsed_doc:
            patient = parsed_doc["patient_info"]
            summary += f"Patient: {patient.get('name', 'N/A')}\n"
        
        # Add policy information
        if "policy_info" in parsed_doc:
            policy = parsed_doc["policy_info"]
            summary += f"Policy: {policy.get('policy_number', 'N/A')}"
            if "insurer" in policy:
                summary += f" ({policy['insurer']})"
            summary += "\n"
            
            if "sum_insured" in policy:
                summary += f"Coverage: ₹{policy['sum_insured']:,}\n"
        
        # Add claim details
        if "claim_details" in parsed_doc:
            claim = parsed_doc["claim_details"]
            if "claimed_amount" in claim:
                summary += f"Claimed: ₹{claim['claimed_amount']:,}\n"
            
            if "approved_amount" in claim:
                summary += f"Approved: ₹{claim['approved_amount']:,}\n"
                
            if "status" in claim:
                summary += f"Status: {claim['status']}\n"
        
        # Add dates
        if "dates" in parsed_doc:
            dates = parsed_doc["dates"]
            if "admission" in dates:
                summary += f"Admission: {dates['admission']}\n"
            if "discharge" in dates:
                summary += f"Discharge: {dates['discharge']}\n"
        
        return summary
    
    def _generic_summary_template(self, parsed_doc: Dict) -> str:
        """Create a generic summary when document type is unknown"""
        summary = "MEDICAL DOCUMENT SUMMARY\n\n"
        
        # Try to extract any available information
        for section, content in parsed_doc.items():
            if isinstance(content, dict):
                summary += f"{section.replace('_', ' ').title()}:\n"
                for key, value in content.items():
                    if value:  # Only include non-empty values
                        summary += f"  - {key.replace('_', ' ').title()}: {value}\n"
            elif isinstance(content, list):
                summary += f"{section.replace('_', ' ').title()}: "
                summary += ", ".join(content[:5])
                if len(content) > 5:
                    summary += f" and {len(content) - 5} more"
                summary += "\n"
            else:
                summary += f"{section.replace('_', ' ').title()}: {content}\n"
        
        return summary
    
    def _extract_main_body(self, document_text: str) -> str:
        """Extract the main body of the document, excluding headers and patient info"""
        # Remove common headers
        text = re.sub(r"DISCHARGE SUMMARY|CLAIM FORM|PATIENT INFORMATION", "", document_text)
        
        # Try to find the beginning of the main content
        # This is usually after patient information section
        matches = re.search(r"(?:HISTORY|DIAGNOSIS|CLINICAL DETAILS|TREATMENT DETAILS)", text)
        if matches:
            start_pos = matches.start()
            return text[start_pos:]
        
        return text
    
    def _format_summary(self, summary: str, format_type: str) -> str:
        """Format the summary in different output formats"""
        if format_type == "plain":
            return summary
        
        elif format_type == "html":
            # Convert to HTML format
            html = "<div class='medical-summary'>\n"
            
            # Convert section headers to h2
            html += re.sub(r"^([A-Z\s]+):?$", r"<h2>\1</h2>", summary, flags=re.MULTILINE)
            
            # Convert lines with key-value pairs to formatted paragraphs
            html = re.sub(r"^([\w\s]+): (.+)$", r"<p><strong>\1:</strong> \2</p>", html, flags=re.MULTILINE)
            
            # Convert any remaining paragraphs
            html = re.sub(r"^([^<\n].+)$", r"<p>\1</p>", html, flags=re.MULTILINE)
            
            html += "</div>"
            return html
        
        elif format_type == "json":
            # Parse summary into JSON structure
            json_data = {}
            
            # Extract section headers
            current_section = "general"
            json_data[current_section] = {}
            
            for line in summary.split("\n"):
                # Check if this is a section header
                if re.match(r"^[A-Z\s]+:?$", line):
                    current_section = line.strip().lower().replace(" ", "_").replace(":", "")
                    json_data[current_section] = {}
                # Check if this is a key-value pair
                elif ":" in line:
                    parts = line.split(":", 1)
                    key = parts[0].strip().lower().replace(" ", "_")
                    value = parts[1].strip()
                    json_data[current_section][key] = value
                # Skip empty lines
                elif line.strip():
                    # This is just text belonging to the current section
                    if "text" not in json_data[current_section]:
                        json_data[current_section]["text"] = []
                    json_data[current_section]["text"].append(line.strip())
            
            import json
            return json.dumps(json_data, indent=2)
        
        else:
            raise ValueError(f"Unsupported format type: {format_type}")


# Example usage
if __name__ == "__main__":
    document_text = """
    DISCHARGE SUMMARY
    
    Patient Information:
    Name: John Doe
    Age: 45 years
    Gender: Male
    Contact: +91 9876543210
    Aadhaar: 1234 5678 9012
    
    Diagnosis:
    Type 2 Diabetes Mellitus with Hypertension
    
    Treatment Details:
    Patient underwent diabetic management and blood pressure control protocol.
    Medications prescribed include Metformin 500mg BD, Glimepiride 1mg OD, 
    and Telmisartan 40mg OD.
    
    Follow-up:
    Review after 2 weeks with fasting blood glucose report and BP monitoring chart.
    """
    
    summarizer = MedicalDocumentSummarizer()
    
    # Generate plain text summary
    plain_summary = summarizer.summarize(document_text, document_type="discharge")
    print("PLAIN TEXT SUMMARY:")
    print(plain_summary)
    print("\n" + "-"*50 + "\n")
    
    # Generate HTML summary
    html_summary = summarizer.summarize(document_text, document_type="discharge", format_type="html")
    print("HTML SUMMARY:")
    print(html_summary)

OSError: [E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory.

Correction:

In [4]:
import re
import nltk
from typing import Dict, List, Optional
from transformers import pipeline
import traceback

# Try to import spacy and the required model
try:
    import spacy
    nlp = spacy.load("en_core_web_md")
except (ImportError, OSError) as e:
    print("NLP model loading error: ", str(e))
    print("\nTo fix this issue:")
    print("1. Make sure spaCy is installed: pip install spacy")
    print("2. Download the required model: python -m spacy download en_core_web_md")
    
    # Define a fallback for when spaCy isn't available
    class DummyNLP:
        def __call__(self, text):
            class DummyDoc:
                def __init__(self):
                    self.ents = []
            return DummyDoc()
    nlp = DummyNLP()


class MedicalDocumentParser:
    """Class for parsing different types of medical documents"""
    
    def __init__(self):
        # Initialize regex patterns for different document sections
        self.section_patterns = {
            "patient_info": re.compile(r"(?:Patient Information|Patient Details|Personal Details)(?:\s*:)?(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)", re.DOTALL),
            "diagnosis": re.compile(r"(?:Diagnosis|Clinical Impression|Assessment)(?:\s*:)?(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)", re.DOTALL),
            # Add more patterns for other sections
        }
    
    def parse_document(self, document_text: str, document_type: str = "discharge") -> Dict:
        """
        Parse the document text into structured sections
        
        Args:
            document_text: Raw text of the medical document
            document_type: Type of document (discharge, claim, etc.)
            
        Returns:
            Dictionary containing structured document sections
        """
        # Process document with NLP
        try:
            doc = nlp(document_text)
        except Exception as e:
            print(f"Warning: NLP processing failed: {str(e)}")
            doc = None
        
        # Extract sections based on document type
        if document_type == "discharge":
            return self._parse_discharge_summary(document_text, doc)
        elif document_type == "claim":
            return self._parse_claim_document(document_text, doc)
        else:
            return self._parse_generic_medical_document(document_text, doc)
    
    def _parse_discharge_summary(self, text: str, doc) -> Dict:
        """Parse discharge summary into structured sections"""
        sections = {}
        
        # Extract patient information
        patient_match = self.section_patterns["patient_info"].search(text)
        if patient_match:
            sections["patient_info"] = self._extract_patient_details(patient_match.group(1))
        
        # Extract diagnosis
        diagnosis_match = self.section_patterns["diagnosis"].search(text)
        if diagnosis_match:
            sections["diagnosis"] = diagnosis_match.group(1).strip()
        
        # Extract medications using NER if doc is available
        if doc is not None:
            medications = []
            for ent in doc.ents:
                if hasattr(ent, 'label_') and (ent.label_ == "MEDICATION" or ent.label_ == "CHEMICAL"):
                    medications.append(ent.text)
            if medications:
                sections["medications"] = list(set(medications))  # Remove duplicates
        
        # Fallback extraction for medications using regex
        if "medications" not in sections:
            med_match = re.search(r"(?:Medications|Medicine|Drugs)(?:\s*:)?(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)", text, re.DOTALL)
            if med_match:
                # Extract medicine names (basic approach)
                med_text = med_match.group(1)
                med_list = re.findall(r"([A-Z][a-z]+(?:\s[A-Za-z]+)?)\s+\d+(?:mg|mcg|ml|g)", med_text)
                if med_list:
                    sections["medications"] = med_list
        
        # Extract treatment information
        treatment_match = re.search(r"(?:Treatment|Procedure|Management)(?:\s*:)?(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)", text, re.DOTALL)
        if treatment_match:
            sections["treatment"] = treatment_match.group(1).strip()
        
        # Extract follow-up information
        followup_match = re.search(r"(?:Follow[- ]up|Review|Next Visit)(?:\s*:)?(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)", text, re.DOTALL)
        if followup_match:
            sections["follow_up"] = followup_match.group(1).strip()
        
        return sections
    
    def _parse_claim_document(self, text: str, doc) -> Dict:
        """Parse insurance claim document into structured sections"""
        sections = {}
        
        # Extract policy details
        policy_info = self._extract_policy_details(text)
        sections["policy_info"] = policy_info
        
        # Extract claim amount details
        sections["claim_details"] = self._extract_claim_details(text)
        
        # Extract dates using regex
        dates = {}
        
        # Admission date
        admission_match = re.search(r"(?:Admission Date|Date of Admission)(?:\s*:)?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{1,2}\s+[A-Za-z]+\s+\d{2,4})", text)
        if admission_match:
            dates["admission"] = admission_match.group(1)
        
        # Discharge date
        discharge_match = re.search(r"(?:Discharge Date|Date of Discharge)(?:\s*:)?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{1,2}\s+[A-Za-z]+\s+\d{2,4})", text)
        if discharge_match:
            dates["discharge"] = discharge_match.group(1)
        
        # Claim submission date
        claim_date_match = re.search(r"(?:Claim Date|Date of Claim|Submission Date)(?:\s*:)?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{1,2}\s+[A-Za-z]+\s+\d{2,4})", text)
        if claim_date_match:
            dates["claim_submission"] = claim_date_match.group(1)
        
        sections["dates"] = dates
        
        return sections
    
    def _parse_generic_medical_document(self, text: str, doc) -> Dict:
        """Parse any medical document when type is unknown"""
        sections = {}
        
        # Try to identify the document type first
        if re.search(r"DISCHARGE|SUMMARY|CLINICAL SUMMARY", text, re.IGNORECASE):
            return self._parse_discharge_summary(text, doc)
        elif re.search(r"CLAIM|INSURANCE|REIMBURSEMENT", text, re.IGNORECASE):
            return self._parse_claim_document(text, doc)
        
        # Generic extraction for common elements
        
        # Extract patient details
        patient_info_patterns = [
            r"(?:PATIENT|PERSONAL) (?:DETAILS|INFORMATION)(?:\s*:)?(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)",
            r"(?:Name|Patient)(?:\s*:)?\s*([A-Za-z\s.]+)"
        ]
        
        for pattern in patient_info_patterns:
            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
            if match:
                patient_text = match.group(1) if len(match.groups()) > 0 else ""
                sections["patient_info"] = self._extract_patient_details(patient_text or text)
                break
        
        # Try to extract medical information
        medical_sections = [
            ("diagnosis", r"(?:DIAGNOSIS|IMPRESSION|ASSESSMENT)(?:\s*:)?(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)"),
            ("treatment", r"(?:TREATMENT|PROCEDURE|MANAGEMENT)(?:\s*:)?(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)"),
            ("medications", r"(?:MEDICATIONS|MEDICINES|DRUGS)(?:\s*:)?(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)")
        ]
        
        for section_name, pattern in medical_sections:
            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
            if match:
                sections[section_name] = match.group(1).strip()
        
        return sections
    
    def _extract_patient_details(self, text: str) -> Dict:
        """Extract structured patient information"""
        details = {}
        
        # Extract patient name
        name_match = re.search(r"(?:Name|Patient Name)(?:\s*:)?\s*([A-Za-z\s.]+)", text, re.IGNORECASE)
        if name_match:
            details["name"] = name_match.group(1).strip()
        
        # Extract age
        age_match = re.search(r"(?:Age|Years)(?:\s*:)?\s*(\d+)\s*(?:years|yrs)?", text, re.IGNORECASE)
        if age_match:
            details["age"] = int(age_match.group(1))
        
        # Extract gender
        gender_match = re.search(r"(?:Gender|Sex)(?:\s*:)?\s*([A-Za-z]+)", text, re.IGNORECASE)
        if gender_match:
            details["gender"] = gender_match.group(1).strip()
        
        # Extract contact number
        contact_match = re.search(r"(?:Contact|Phone|Mobile)(?:\s*:)?\s*(\+?\d[\d\s-]{8,})", text, re.IGNORECASE)
        if contact_match:
            details["contact"] = contact_match.group(1).strip()
        
        # Extract Aadhaar number (with proper format validation)
        aadhaar_match = re.search(r"(?:Aadhaar|ID|Identity Number)(?:\s*:)?\s*(\d{4}\s*\d{4}\s*\d{4})", text, re.IGNORECASE)
        if aadhaar_match:
            details["aadhaar"] = aadhaar_match.group(1).replace(" ", "")
        
        return details
    
    def _extract_policy_details(self, text: str) -> Dict:
        """Extract insurance policy details from text"""
        details = {}
        
        # Extract policy number
        policy_match = re.search(r"(?:Policy\sNumber|Policy\sNo)(?:\s*:)?\s*([A-Z0-9-/]+)", text, re.IGNORECASE)
        if policy_match:
            details["policy_number"] = policy_match.group(1).strip()
        
        # Extract insurer name
        insurer_match = re.search(r"(?:Insurer|Insurance\sCompany)(?:\s*:)?\s*([A-Za-z\s.]+)", text, re.IGNORECASE)
        if insurer_match:
            details["insurer"] = insurer_match.group(1).strip()
        
        # Extract sum insured
        sum_match = re.search(r"(?:Sum\sInsured|Coverage\sAmount)(?:\s*:)?\s*(?:Rs\.?|INR)?\s*([\d,]+)", text, re.IGNORECASE)
        if sum_match:
            # Remove commas and convert to integer
            try:
                details["sum_insured"] = int(sum_match.group(1).replace(",", ""))
            except ValueError:
                details["sum_insured"] = sum_match.group(1).replace(",", "")
        
        return details
    
    def _extract_claim_details(self, text: str) -> Dict:
        """Extract claim amount and details"""
        details = {}
        
        # Extract claimed amount
        claimed_match = re.search(r"(?:Claimed\sAmount|Total\sClaim)(?:\s*:)?\s*(?:Rs\.?|INR)?\s*([\d,]+)", text, re.IGNORECASE)
        if claimed_match:
            try:
                details["claimed_amount"] = int(claimed_match.group(1).replace(",", ""))
            except ValueError:
                details["claimed_amount"] = claimed_match.group(1).replace(",", "")
        
        # Extract approved amount if available
        approved_match = re.search(r"(?:Approved\sAmount|Sanctioned\sAmount)(?:\s*:)?\s*(?:Rs\.?|INR)?\s*([\d,]+)", text, re.IGNORECASE)
        if approved_match:
            try:
                details["approved_amount"] = int(approved_match.group(1).replace(",", ""))
            except ValueError:
                details["approved_amount"] = approved_match.group(1).replace(",", "")
        
        # Extract claim status
        status_match = re.search(r"(?:Status|Claim\sStatus)(?:\s*:)?\s*([A-Za-z\s]+)", text, re.IGNORECASE)
        if status_match:
            details["status"] = status_match.group(1).strip()
        
        return details


class MedicalDocumentSummarizer:
    """Class for summarizing medical documents"""
    
    def __init__(self):
        # Initialize the document parser
        self.parser = MedicalDocumentParser()
        
        # Try to initialize NLP summarization pipeline
        try:
            self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
            self.nlp_available = True
        except Exception as e:
            print(f"Warning: Could not initialize NLP summarization: {str(e)}")
            print("Install required packages with: pip install transformers torch")
            self.nlp_available = False
        
        # Initialize different summary templates
        self.templates = {
            "discharge": self._discharge_summary_template,
            "claim": self._claim_summary_template,
            "generic": self._generic_summary_template
        }
    
    def summarize(self, document_text: str, document_type: str = "discharge", 
                  max_length: int = 150, format_type: str = "plain") -> str:
        """
        Generate a concise summary of the medical document
        
        Args:
            document_text: Raw text of the medical document
            document_type: Type of document (discharge, claim, etc.)
            max_length: Maximum length of the detailed summary in tokens
            format_type: Output format (plain, html, json)
            
        Returns:
            Formatted summary of the document
        """
        try:
            # Parse the document
            parsed_doc = self.parser.parse_document(document_text, document_type)
            
            # Generate template-based summary
            if document_type in self.templates:
                template_fn = self.templates[document_type]
                summary = template_fn(parsed_doc)
            else:
                summary = self.templates["generic"](parsed_doc)
            
            # For longer documents, augment with NLP-based summary if available
            if len(document_text) > 1000 and self.nlp_available:
                try:
                    # Extract main body for summarization (excluding headers, patient info)
                    main_body = self._extract_main_body(document_text)
                    if main_body:
                        nlp_summary = self.summarizer(main_body, max_length=max_length)[0]['summary_text']
                        summary += f"\n\nKey Points:\n{nlp_summary}"
                except Exception as e:
                    print(f"NLP summarization failed: {str(e)}")
            
            # Format the summary according to requested format
            return self._format_summary(summary, format_type)
            
        except Exception as e:
            error_msg = f"Error during document summarization: {str(e)}\n"
            error_msg += traceback.format_exc()
            print(error_msg)
            return f"Error: Could not summarize document. {str(e)}"
    
    def _discharge_summary_template(self, parsed_doc: Dict) -> str:
        """Create summary from parsed discharge document"""
        summary = "DISCHARGE SUMMARY\n\n"
        
        # Add patient information if available
        if "patient_info" in parsed_doc:
            patient = parsed_doc["patient_info"]
            summary += f"Patient: {patient.get('name', 'N/A')}"
            if 'age' in patient and 'gender' in patient:
                summary += f" ({patient.get('age')}Y, {patient.get('gender')})\n"
            else:
                summary += "\n"
        
        # Add diagnosis if available
        if "diagnosis" in parsed_doc:
            summary += f"Diagnosis: {parsed_doc['diagnosis']}\n"
        
        # Add medications if available
        if "medications" in parsed_doc:
            if isinstance(parsed_doc["medications"], list):
                medications = parsed_doc["medications"]
                summary += f"Medications: {', '.join(medications[:5])}"
                if len(medications) > 5:
                    summary += f" and {len(medications) - 5} more"
                summary += "\n"
            else:
                summary += f"Medications: {parsed_doc['medications']}\n"
        
        # Add other important information
        if "treatment" in parsed_doc:
            summary += f"Treatment: {parsed_doc['treatment']}\n"
        
        if "follow_up" in parsed_doc:
            summary += f"Follow-up: {parsed_doc['follow_up']}\n"
        
        return summary

    # [Rest of the methods remain unchanged...]
    
    def _claim_summary_template(self, parsed_doc: Dict) -> str:
        """Create summary from parsed claim document"""
        summary = "INSURANCE CLAIM SUMMARY\n\n"
        
        # Add patient information
        if "patient_info" in parsed_doc:
            patient = parsed_doc["patient_info"]
            summary += f"Patient: {patient.get('name', 'N/A')}\n"
        
        # Add policy information
        if "policy_info" in parsed_doc:
            policy = parsed_doc["policy_info"]
            summary += f"Policy: {policy.get('policy_number', 'N/A')}"
            if "insurer" in policy:
                summary += f" ({policy['insurer']})"
            summary += "\n"
            
            if "sum_insured" in policy:
                summary += f"Coverage: ₹{policy['sum_insured']:,}\n"
        
        # Add claim details
        if "claim_details" in parsed_doc:
            claim = parsed_doc["claim_details"]
            if "claimed_amount" in claim:
                summary += f"Claimed: ₹{claim['claimed_amount']:,}\n"
            
            if "approved_amount" in claim:
                summary += f"Approved: ₹{claim['approved_amount']:,}\n"
                
            if "status" in claim:
                summary += f"Status: {claim['status']}\n"
        
        # Add dates
        if "dates" in parsed_doc:
            dates = parsed_doc["dates"]
            if "admission" in dates:
                summary += f"Admission: {dates['admission']}\n"
            if "discharge" in dates:
                summary += f"Discharge: {dates['discharge']}\n"
        
        return summary
    
    def _generic_summary_template(self, parsed_doc: Dict) -> str:
        """Create a generic summary when document type is unknown"""
        summary = "MEDICAL DOCUMENT SUMMARY\n\n"
        
        # Try to extract any available information
        for section, content in parsed_doc.items():
            if isinstance(content, dict):
                summary += f"{section.replace('_', ' ').title()}:\n"
                for key, value in content.items():
                    if value:  # Only include non-empty values
                        summary += f"  - {key.replace('_', ' ').title()}: {value}\n"
            elif isinstance(content, list):
                summary += f"{section.replace('_', ' ').title()}: "
                summary += ", ".join(content[:5])
                if len(content) > 5:
                    summary += f" and {len(content) - 5} more"
                summary += "\n"
            else:
                summary += f"{section.replace('_', ' ').title()}: {content}\n"
        
        return summary
    
    def _extract_main_body(self, document_text: str) -> str:
        """Extract the main body of the document, excluding headers and patient info"""
        # Remove common headers
        text = re.sub(r"DISCHARGE SUMMARY|CLAIM FORM|PATIENT INFORMATION", "", document_text)
        
        # Try to find the beginning of the main content
        # This is usually after patient information section
        matches = re.search(r"(?:HISTORY|DIAGNOSIS|CLINICAL DETAILS|TREATMENT DETAILS)", text)
        if matches:
            start_pos = matches.start()
            return text[start_pos:]
        
        return text
    
    def _format_summary(self, summary: str, format_type: str) -> str:
        """Format the summary in different output formats"""
        if format_type == "plain":
            return summary
        
        elif format_type == "html":
            # Convert to HTML format
            html = "<div class='medical-summary'>\n"
            
            # Convert section headers to h2
            html += re.sub(r"^([A-Z\s]+):?$", r"<h2>\1</h2>", summary, flags=re.MULTILINE)
            
            # Convert lines with key-value pairs to formatted paragraphs
            html = re.sub(r"^([\w\s]+): (.+)$", r"<p><strong>\1:</strong> \2</p>", html, flags=re.MULTILINE)
            
            # Convert any remaining paragraphs
            html = re.sub(r"^([^<\n].+)$", r"<p>\1</p>", html, flags=re.MULTILINE)
            
            html += "</div>"
            return html
        
        elif format_type == "json":
            # Parse summary into JSON structure
            json_data = {}
            
            # Extract section headers
            current_section = "general"
            json_data[current_section] = {}
            
            for line in summary.split("\n"):
                # Check if this is a section header
                if re.match(r"^[A-Z\s]+:?$", line):
                    current_section = line.strip().lower().replace(" ", "_").replace(":", "")
                    json_data[current_section] = {}
                # Check if this is a key-value pair
                elif ":" in line:
                    parts = line.split(":", 1)
                    key = parts[0].strip().lower().replace(" ", "_")
                    value = parts[1].strip()
                    json_data[current_section][key] = value
                # Skip empty lines
                elif line.strip():
                    # This is just text belonging to the current section
                    if "text" not in json_data[current_section]:
                        json_data[current_section]["text"] = []
                    json_data[current_section]["text"].append(line.strip())
            
            import json
            return json.dumps(json_data, indent=2)
        
        else:
            raise ValueError(f"Unsupported format type: {format_type}")


# Example usage
if __name__ == "__main__":
    document_text = """
    DISCHARGE SUMMARY
    
    Patient Information:
    Name: John Doe
    Age: 45 years
    Gender: Male
    Contact: +91 9876543210
    Aadhaar: 1234 5678 9012
    
    Diagnosis:
    Type 2 Diabetes Mellitus with Hypertension
    
    Treatment Details:
    Patient underwent diabetic management and blood pressure control protocol.
    Medications prescribed include Metformin 500mg BD, Glimepiride 1mg OD, 
    and Telmisartan 40mg OD.
    
    Follow-up:
    Review after 2 weeks with fasting blood glucose report and BP monitoring chart.
    """
    
    try:
        summarizer = MedicalDocumentSummarizer()
        
        # Generate plain text summary
        plain_summary = summarizer.summarize(document_text, document_type="discharge")
        print("PLAIN TEXT SUMMARY:")
        print(plain_summary)
        print("\n" + "-"*50 + "\n")
        
        # Generate HTML summary
        html_summary = summarizer.summarize(document_text, document_type="discharge", format_type="html")
        print("HTML SUMMARY:")
        print(html_summary)
    except Exception as e:
        print(f"Error in example usage: {str(e)}")
        print(traceback.format_exc())

Device set to use mps:0


PLAIN TEXT SUMMARY:
DISCHARGE SUMMARY

Patient: N/A
Diagnosis: 
Medications: Metformin, Glimepiride, Telmisartan
Treatment: Details:
Follow-up: 


--------------------------------------------------

HTML SUMMARY:
<div class='medical-summary'>
<h2>DISCHARGE SUMMARY
</h2>
<p><strong>Patient:</strong> N/A</p>
<p>Diagnosis: </p>
<p><strong>Medications:</strong> Metformin, Glimepiride, Telmisartan</p>
<p><strong>Treatment:</strong> Details:</p>
<p>Follow-up: </p>
</div>
