In [14]:
import PyPDF2
import re
import pandas as pd
from typing import Dict, List, Optional, Tuple
import fitz  # PyMuPDF
import json

class RobustMedicalExtractor:
    """
    Robust extractor that properly parses tabular medical data
    """
    
    def __init__(self):
        self.debug_mode = True
        
        # Normal ranges based on your reference data
        self.normal_ranges = {
            'Button Press Accuracy': {'min': 90, 'max': 100, 'unit': '%'},
            'False Alarms': {'min': 0, 'max': 5, 'unit': '%'},
            'Median Reaction Time': {'min': 400, 'max': 500, 'unit': 'ms'},
            'P50 Amplitude': {'min': 2.5, 'max': 3.0, 'unit': 'μV'},
            'P3b Amplitude': {'min': 5.5, 'max': 6.5, 'unit': 'μV'},
            'P3b Latency': {'min': 380, 'max': 420, 'unit': 'ms'},
            'Peak Alpha Frequency': {'min': 8.0, 'max': 12.0, 'unit': 'Hz'}
        }
    
    def debug_print(self, message: str, data: any = None):
        """Print debug information"""
        if self.debug_mode:
            print(f"DEBUG: {message}")
            if data is not None:
                print(f"       {data}")
    
    def extract_text_pymupdf(self, pdf_path: str) -> str:
        """Extract text using PyMuPDF with better formatting preservation"""
        try:
            doc = fitz.open(pdf_path)
            text = ""
            
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                # Use different extraction method to preserve table structure
                page_text = page.get_text("text")
                text += f"\n=== PAGE {page_num + 1} ===\n{page_text}\n"
            
            doc.close()
            return text
        except Exception as e:
            self.debug_print(f"PyMuPDF extraction failed: {e}")
            return ""
    
    def extract_task_performance_table(self, text: str) -> Dict:
        """Extract values from TASK PERFORMANCE table"""
        self.debug_print("Extracting TASK PERFORMANCE values...")
        
        values = {}
        
        # Look for the TASK PERFORMANCE section
        task_section_pattern = r'TASK PERFORMANCE.*?(?=ERP FEATURES|EEG FEATURES|$)'
        task_match = re.search(task_section_pattern, text, re.DOTALL | re.IGNORECASE)
        
        if task_match:
            task_section = task_match.group(0)
            self.debug_print("Found TASK PERFORMANCE section", task_section[:200] + "...")
            
            # Extract Button Press Accuracy - look for percentage value
            accuracy_patterns = [
                r'Button Press Accuracy[^\d]*(\d+\.?\d*)',
                r'Accuracy[^\d]*(\d+\.?\d*)',
                r'Button.*?Accuracy.*?(\d+\.?\d*)'
            ]
            
            for pattern in accuracy_patterns:
                match = re.search(pattern, task_section, re.IGNORECASE)
                if match:
                    values['Button Press Accuracy'] = float(match.group(1))
                    self.debug_print(f"Found Button Press Accuracy: {match.group(1)}")
                    break
            
            # Extract False Alarms
            false_alarm_patterns = [
                r'False Alarms[^\d]*(\d+\.?\d*)',
                r'False.*?Alarms.*?(\d+\.?\d*)'
            ]
            
            for pattern in false_alarm_patterns:
                match = re.search(pattern, task_section, re.IGNORECASE)
                if match:
                    values['False Alarms'] = float(match.group(1))
                    self.debug_print(f"Found False Alarms: {match.group(1)}")
                    break
            
            # Extract Median Reaction Time - look for larger numbers (reaction times are usually 400-800ms)
            reaction_patterns = [
                r'Median Reaction Time[^\d]*(\d{3,4}\.?\d*)',  # 3-4 digits for reaction time
                r'Reaction Time[^\d]*(\d{3,4}\.?\d*)',
                r'Reaction.*?Time.*?(\d{3,4}\.?\d*)'
            ]
            
            for pattern in reaction_patterns:
                match = re.search(pattern, task_section, re.IGNORECASE)
                if match:
                    values['Median Reaction Time'] = float(match.group(1))
                    self.debug_print(f"Found Median Reaction Time: {match.group(1)}")
                    break
        
        else:
            self.debug_print("TASK PERFORMANCE section not found")
        
        return values
    
    def extract_erp_table(self, text: str) -> Dict:
        """Extract values from ERP FEATURES table"""
        self.debug_print("Extracting ERP FEATURES values...")
        
        try:
            values = {}
            
            # Look for ERP FEATURES section
            erp_section_pattern = r'ERP FEATURES.*?(?=EEG FEATURES|Test Name|$)'
            self.debug_print(f"ERP regex pattern: {erp_section_pattern}")
            
            erp_match = re.search(erp_section_pattern, text, re.DOTALL | re.IGNORECASE)
            self.debug_print(f"ERP match found: {erp_match is not None}")
            
            if erp_match:
                erp_section = erp_match.group(0)
                self.debug_print("Found ERP FEATURES section", erp_section[:300] + "...")
                
                # Show the exact lines for debugging
                lines = erp_section.split('\n')
                self.debug_print("ERP section lines:")
                for i, line in enumerate(lines):
                    self.debug_print(f"  Line {i}: '{line}'")
                
                # Extract P50 amplitude - more flexible pattern for table format
                self.debug_print("Trying P50 extraction...")
                
                # Simple line-by-line approach first
                for line_num, line in enumerate(lines):
                    self.debug_print(f"Checking line {line_num}: '{line}'")
                    if 'P50' in line and 'Standard' in line:
                        self.debug_print(f"Found P50 Standard line: '{line}'")
                        # Extract numbers from the line
                        numbers = re.findall(r'([+-]?\d+\.?\d*)', line)
                        self.debug_print(f"Numbers found: {numbers}")
                        if numbers:
                            values['P50 Amplitude'] = float(numbers[0])
                            self.debug_print(f"Found P50 Amplitude (line method): {numbers[0]}")
                            break
                
                # Extract P3b amplitude and latency
                self.debug_print("Trying P3b extraction...")
                for line_num, line in enumerate(lines):
                    self.debug_print(f"Checking P3b line {line_num}: '{line}'")
                    if 'P3b' in line and 'Target' in line:
                        self.debug_print(f"Found P3b Target line: '{line}'")
                        # Extract numbers from the line
                        numbers = re.findall(r'([+-]?\d+\.?\d*)', line)
                        self.debug_print(f"Numbers found: {numbers}")
                        if len(numbers) >= 2:
                            values['P3b Amplitude'] = float(numbers[0])
                            values['P3b Latency'] = float(numbers[1])
                            self.debug_print(f"Found P3b Amplitude (line method): {numbers[0]}")
                            self.debug_print(f"Found P3b Latency (line method): {numbers[1]}")
                            break
            else:
                self.debug_print("ERP FEATURES section not found")
            
            self.debug_print(f"ERP extraction completed. Found values: {values}")
            return values
            
        except Exception as e:
            self.debug_print(f"ERROR in ERP extraction: {e}")
            import traceback
            self.debug_print(f"Traceback: {traceback.format_exc()}")
            return {}
    
    def extract_eeg_table(self, text: str) -> Dict:
        """Extract values from EEG FEATURES table"""
        self.debug_print("Extracting EEG FEATURES values...")
        
        values = {}
        
        # Look for EEG FEATURES section
        eeg_section_pattern = r'EEG FEATURES.*?(?=EEG POWER SPECTRUM|Test Name|$)'
        eeg_match = re.search(eeg_section_pattern, text, re.DOTALL | re.IGNORECASE)
        
        if eeg_match:
            eeg_section = eeg_match.group(0)
            self.debug_print("Found EEG FEATURES section", eeg_section[:200] + "...")
            
            # Extract Peak Alpha Frequency - look for Peak Alpha row
            # Pattern: Peak Alpha [frequency] [power]
            alpha_pattern = r'Peak Alpha\s+(\d+\.?\d*)'
            alpha_match = re.search(alpha_pattern, eeg_section, re.IGNORECASE)
            if alpha_match:
                values['Peak Alpha Frequency'] = float(alpha_match.group(1))
                self.debug_print(f"Found Peak Alpha Frequency: {alpha_match.group(1)}")
        else:
            self.debug_print("EEG FEATURES section not found")
        
        return values
    
    def extract_erp_table(self, text: str) -> Dict:
        """Extract values from ERP FEATURES table"""
        self.debug_print("Extracting ERP FEATURES values...")
        
        values = {}
        
        # Look for ERP FEATURES section
        erp_section_pattern = r'ERP FEATURES.*?(?=EEG FEATURES|Test Name|$)'
        erp_match = re.search(erp_section_pattern, text, re.DOTALL | re.IGNORECASE)
        
        if erp_match:
            erp_section = erp_match.group(0)
            self.debug_print("Found ERP FEATURES section", erp_section[:300] + "...")
            
    
    def extract_study_sections(self, text: str) -> Dict:
        """Extract Study Findings and Study Discussion sections"""
        sections = {}
        
        # Extract Study Findings
        findings_pattern = r'Study Findings?:?\s*(.*?)(?=Study Discussion|$)'
        findings_match = re.search(findings_pattern, text, re.DOTALL | re.IGNORECASE)
        if findings_match:
            sections['study_findings'] = findings_match.group(1).strip()
        
        # Extract Study Discussion
        discussion_pattern = r'Study Discussion:?\s*(.*?)(?=Study Protocol|Test Name|Physician|$)'
        discussion_match = re.search(discussion_pattern, text, re.DOTALL | re.IGNORECASE)
        if discussion_match:
            sections['study_discussion'] = discussion_match.group(1).strip()
        
        return sections
    
    def extract_discussion_interpretations(self, discussion_text: str) -> Dict:
        """Extract interpretations from Study Discussion section"""
        interpretations = {}
        
        patterns = {
            'Button Press Accuracy': r'Button Press Accuracy[:\s]*(Low|Normal|High)',
            'Median Reaction Time': r'Median Reaction Time[:\s]*(Delayed|Normal|Fast)',
            'P50 Amplitude': r'P50 Amplitude[:\s]*(Low|Normal|High)',
            'P3b Amplitude': r'P3b Amplitude[:\s]*(Low|Normal|High)',
            'P3b Latency': r'P3b Latency[:\s]*(Delayed|Normal|Fast)',
            'Peak Alpha Frequency': r'Peak Alpha Frequency[:\s]*(Low|Normal|High)'
        }
        
        for metric, pattern in patterns.items():
            match = re.search(pattern, discussion_text, re.IGNORECASE)
            if match:
                interpretations[metric] = match.group(1)
                self.debug_print(f"Found {metric} interpretation: {match.group(1)}")
        
        return interpretations
    
    def interpret_values(self, values: Dict) -> Dict:
        """Calculate interpretations based on normal ranges"""
        interpretations = {}
        
        for metric, value in values.items():
            if metric in self.normal_ranges:
                range_info = self.normal_ranges[metric]
                min_val = range_info['min']
                max_val = range_info['max']
                
                if metric in ['Median Reaction Time', 'P3b Latency']:
                    # For timing metrics, higher values are "Delayed"
                    if value > max_val:
                        interpretations[metric] = 'Delayed'
                    elif value < min_val:
                        interpretations[metric] = 'Fast'
                    else:
                        interpretations[metric] = 'Normal'
                else:
                    # For other metrics, use Low/Normal/High
                    if value < min_val:
                        interpretations[metric] = 'Low'
                    elif value > max_val:
                        interpretations[metric] = 'High'
                    else:
                        interpretations[metric] = 'Normal'
        
        return interpretations
    
    def process_pdf(self, pdf_path: str) -> Dict:
        """Main processing function"""
        self.debug_print(f"Processing PDF: {pdf_path}")
        
        # Extract text
        text = self.extract_text_pymupdf(pdf_path)
        if not text:
            return {"error": "Could not extract text from PDF"}
        
        # Extract sections
        sections = self.extract_study_sections(text)
        
        # Extract values from tables
        self.debug_print("About to extract task performance...")
        task_values = self.extract_task_performance_table(text)
        self.debug_print(f"Task values: {task_values}")
        
        self.debug_print("About to extract ERP features...")
        erp_values = self.extract_erp_table(text)
        self.debug_print(f"ERP values: {erp_values}")
        
        self.debug_print("About to extract EEG features...")
        eeg_values = self.extract_eeg_table(text)
        self.debug_print(f"EEG values: {eeg_values}")
        
        # Combine all values - ensure all are dictionaries
        all_values = {}
        if task_values:
            all_values.update(task_values)
        if erp_values:
            all_values.update(erp_values)
        if eeg_values:
            all_values.update(eeg_values)
        
        # Get interpretations
        discussion_interpretations = {}
        if 'study_discussion' in sections:
            discussion_interpretations = self.extract_discussion_interpretations(sections['study_discussion'])
        
        calculated_interpretations = self.interpret_values(all_values)
        
        return {
            'study_findings': sections.get('study_findings', ''),
            'study_discussion': sections.get('study_discussion', ''),
            'extracted_values': all_values,
            'discussion_interpretations': discussion_interpretations,
            'calculated_interpretations': calculated_interpretations,
            'full_text': text
        }

# Clean output function
def print_clean_results(results: Dict):
    """Print results in a clean, organized format"""
    print("\n" + "="*80)
    print("                         MEDICAL REPORT EXTRACTION RESULTS")
    print("="*80)
    
    # Study Findings
    if results.get('study_findings'):
        print("\n📋 STUDY FINDINGS:")
        print("-" * 50)
        print(results['study_findings'])
    
    # Study Discussion  
    if results.get('study_discussion'):
        print("\n💭 STUDY DISCUSSION:")
        print("-" * 50)
        print(results['study_discussion'])
    
    # Extracted Values
    extracted_values = results.get('extracted_values', {})
    if extracted_values:
        print("\n📊 EXTRACTED VALUES:")
        print("-" * 50)
        
        # Group by category
        task_metrics = ['Button Press Accuracy', 'False Alarms', 'Median Reaction Time']
        erp_metrics = ['P50 Amplitude', 'P3b Amplitude', 'P3b Latency'] 
        eeg_metrics = ['Peak Alpha Frequency']
        
        def print_category(title, metrics, values):
            category_values = {k: v for k, v in values.items() if k in metrics}
            if category_values:
                print(f"\n  {title}:")
                for metric, value in category_values.items():
                    print(f"    • {metric:<25}: {value}")
        
        print_category("Task Performance", task_metrics, extracted_values)
        print_category("ERP Features", erp_metrics, extracted_values)
        print_category("EEG Features", eeg_metrics, extracted_values)
    
    # Discussion Interpretations
    discussion_interp = results.get('discussion_interpretations', {})
    if discussion_interp:
        print("\n🔍 CLINICAL INTERPRETATIONS:")
        print("-" * 50)
        for metric, interpretation in discussion_interp.items():
            status_emoji = "🔴" if interpretation.lower() in ['low', 'delayed'] else "🟢" if interpretation.lower() == 'normal' else "🟡"
            print(f"  {status_emoji} {metric:<25}: {interpretation}")
    
    # Calculated Interpretations
    calc_interp = results.get('calculated_interpretations', {})
    if calc_interp:
        print("\n🧮 CALCULATED INTERPRETATIONS (Based on Normal Ranges):")
        print("-" * 50)
        for metric, interpretation in calc_interp.items():
            status_emoji = "🔴" if interpretation.lower() in ['low', 'delayed'] else "🟢" if interpretation.lower() == 'normal' else "🟡"
            print(f"  {status_emoji} {metric:<25}: {interpretation}")
    
    print("\n" + "="*80)

# Usage example with clean output
def main():
    extractor = RobustMedicalExtractor()
    extractor.debug_mode = True  # Turn ON debug to see what's happening
    
    # Process PDF
    pdf_path = "Patient_30627.pdf"  # Replace with your PDF path
    
    try:
        results = extractor.process_pdf(pdf_path)
        
        # Print clean results
        print_clean_results(results)
        
        # Create summary DataFrame
        summary_data = []
        extracted_values = results.get('extracted_values', {})
        discussion_interp = results.get('discussion_interpretations', {})
        calc_interp = results.get('calculated_interpretations', {})
        
        all_metrics = set(extracted_values.keys()) | set(discussion_interp.keys()) | set(calc_interp.keys())
        
        for metric in sorted(all_metrics):
            summary_data.append({
                'Metric': metric,
                'Value': extracted_values.get(metric, 'Not found'),
                'Clinical_Interpretation': discussion_interp.get(metric, 'Not found'),
                'Calculated_Interpretation': calc_interp.get(metric, 'Not found')
            })
        
        summary_df = pd.DataFrame(summary_data)
        
        # Save to CSV
        summary_df.to_csv('medical_report_summary.csv', index=False)
        print(f"\n💾 Summary saved to 'medical_report_summary.csv'")
        
        return results
        
    except Exception as e:
        print(f"❌ Error processing PDF: {e}")
        import traceback
        traceback.print_exc()
        return None

if __name__ == "__main__":
    main()

DEBUG: Processing PDF: Patient_30627.pdf
DEBUG: About to extract task performance...
DEBUG: Extracting TASK PERFORMANCE values...
DEBUG: Found TASK PERFORMANCE section
       TASK PERFORMANCE
Feature
Value
Button Press Accuracy (%)
75.0
False Alarms (%)
0.6
Median Reaction Time (ms)
576.0
...
DEBUG: Found Button Press Accuracy: 75.0
DEBUG: Found False Alarms: 0.6
DEBUG: Found Median Reaction Time: 576.0
DEBUG: Task values: {'Button Press Accuracy': 75.0, 'False Alarms': 0.6, 'Median Reaction Time': 576.0}
DEBUG: About to extract ERP features...
DEBUG: Extracting ERP FEATURES values...
DEBUG: Found ERP FEATURES section
       ERP FEATURES
Feature
Stimulus
Amplitude (μV)
Latency (ms)
Avg Amplitude (μV)
P50
Standard
-0.13
50.3
-1.05
N100
Standard
-5.81
102.7
-4.05
P200
Standard
2.82
202.3
1.74
N200
Target
-2.06
272.0
0.61
P3b
Target
5.31
442.0
2.92
SW
Target
0.70
588.6
2.17
P3a
Distractor
3.64
390.9
1.12
...
DEBUG: ERP values: None
DEBUG: About to extract EEG features...
DEBUG: Extracting

In [18]:
import fitz  # PyMuPDF
import re
import pandas as pd
from typing import Dict

class SimpleMedicalExtractor:
    """
    Simplified medical report extractor that actually works!
    """
    
    def __init__(self):
        # Normal ranges for interpretation
        self.normal_ranges = {
            'Button Press Accuracy': {'min': 90, 'max': 100},
            'False Alarms': {'min': 0, 'max': 5},
            'Median Reaction Time': {'min': 400, 'max': 500},
            'P50 Amplitude': {'min': 2.5, 'max': 3.0},
            'P3b Amplitude': {'min': 5.5, 'max': 6.5},
            'P3b Latency': {'min': 380, 'max': 420},
            'Peak Alpha Frequency': {'min': 8.0, 'max': 12.0}
        }
    
    def extract_pdf_text(self, pdf_path: str) -> str:
        """Extract text from PDF"""
        try:
            doc = fitz.open(pdf_path)
            text = ""
            for page in doc:
                text += page.get_text() + "\n"
            doc.close()
            return text
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return ""
    
    def generate_study_findings(self, values: Dict, interpretations: Dict) -> str:
        """Generate Study Findings based on extracted values and interpretations"""
        
        # Count abnormal findings
        abnormal_findings = []
        abnormal_count = 0
        
        for metric, interpretation in interpretations.items():
            if interpretation.lower() in ['low', 'delayed', 'high']:
                abnormal_findings.append(f"{interpretation.lower()} {metric.lower()}")
                abnormal_count += 1
        
        # Generate findings text
        if abnormal_count == 0:
            findings = "This is a normal study with all measured parameters within expected ranges."
        else:
            if abnormal_count == 1:
                findings = f"This is an abnormal study due to {abnormal_findings[0]}."
            elif abnormal_count == 2:
                findings = f"This is an abnormal study due to {abnormal_findings[0]} and {abnormal_findings[1]}."
            else:
                findings = f"This is an abnormal study due to {', '.join(abnormal_findings[:-1])}, and {abnormal_findings[-1]}."
            
            # Add clinical implications based on specific findings
            implications = []
            
            # Check for cognitive/attention issues
            cognitive_issues = any(metric in ['Button Press Accuracy', 'Median Reaction Time', 'P3b Latency'] 
                                 and interpretation.lower() in ['low', 'delayed'] 
                                 for metric, interpretation in interpretations.items())
            
            if cognitive_issues:
                implications.append("reduced stimulus processing (including evaluation and classification speed)")
                implications.append("reduced attentional resources and executive function")
            
            # Check for sensory processing issues
            sensory_issues = any(metric in ['P50 Amplitude'] 
                               and interpretation.lower() in ['low', 'high'] 
                               for metric, interpretation in interpretations.items())
            
            if sensory_issues:
                implications.append("altered sensory gating and filtering")
            
            # Check for alpha rhythm issues
            alpha_issues = any(metric in ['Peak Alpha Frequency'] 
                             and interpretation.lower() in ['low', 'high'] 
                             for metric, interpretation in interpretations.items())
            
            if alpha_issues:
                implications.append("altered cortical arousal and attention networks")
            
            if implications:
                findings += f" Collectively, study findings suggest {', and '.join(implications)}."
                
                # Add risk assessment
                if cognitive_issues:
                    findings += " These findings suggest increased risk of cognitive dysfunction and may warrant clinical correlation."
        
        return findings
    
    def generate_study_discussion(self, values: Dict, interpretations: Dict) -> str:
        """Generate Study Discussion based on extracted values and interpretations"""
        
        discussion_lines = []
        
        # Add each metric with its interpretation
        metric_order = [
            'Button Press Accuracy',
            'Median Reaction Time', 
            'P50 Amplitude',
            'P3b Amplitude',
            'P3b Latency',
            'Peak Alpha Frequency'
        ]
        
        for metric in metric_order:
            if metric in interpretations:
                interpretation = interpretations[metric]
                discussion_lines.append(f"{metric}: {interpretation}")
        
        # Add any additional metrics not in the ordered list
        for metric, interpretation in interpretations.items():
            if metric not in metric_order:
                discussion_lines.append(f"{metric}: {interpretation}")
        
        # Add detailed explanations for abnormal findings
        explanations = []
        
        for metric, interpretation in interpretations.items():
            if interpretation.lower() == 'low':
                if metric == 'Button Press Accuracy':
                    explanations.append("Decreased button press accuracy reflects subjects' reduced ability to pay attention to test stimuli, directly correlated with executive function deficits.")
                elif metric == 'P50 Amplitude':
                    explanations.append("Decreased P50 amplitude may reflect impaired sensory gating and pre-attentive processing.")
                elif metric == 'P3b Amplitude':
                    explanations.append("Decreased P3b amplitude reflects reduced attentional resources allocated to target stimulus processing.")
            
            elif interpretation.lower() == 'delayed':
                if metric == 'Median Reaction Time':
                    explanations.append("Prolonged reaction time reflects slower cognitive processing and may indicate executive dysfunction.")
                elif metric == 'P3b Latency':
                    explanations.append("Delayed P3b latency indicates slower stimulus evaluation and classification processes, often associated with cognitive slowing.")
            
            elif interpretation.lower() == 'high':
                if metric == 'Peak Alpha Frequency':
                    explanations.append("Elevated alpha frequency may reflect heightened cortical arousal or compensatory mechanisms.")
                elif metric == 'P50 Amplitude':
                    explanations.append("Increased P50 amplitude may indicate sensory hypersensitivity or reduced inhibitory control.")
        
        # Combine discussion and explanations
        discussion = '\n'.join(discussion_lines)
        if explanations:
            discussion += '\n\nDetailed Analysis:\n' + '\n'.join(explanations)
        
        return discussion
    
    def extract_all_values(self, text: str) -> Dict:
        """Extract all medical values using the working line-by-line approach"""
        
        values = {}
        lines = [line.strip() for line in text.split('\n')]
        
        # Extract Task Performance values
        for i, line in enumerate(lines):
            # Button Press Accuracy - look for the pattern
            if 'Button Press Accuracy' in line and i+1 < len(lines):
                try:
                    # Next line should have the value
                    value = float(lines[i+1])
                    values['Button Press Accuracy'] = value
                except ValueError:
                    pass
            
            # False Alarms
            if 'False Alarms' in line and i+1 < len(lines):
                try:
                    value = float(lines[i+1])
                    values['False Alarms'] = value
                except ValueError:
                    pass
            
            # Median Reaction Time
            if 'Median Reaction Time' in line and i+1 < len(lines):
                try:
                    value = float(lines[i+1])
                    values['Median Reaction Time'] = value
                except ValueError:
                    pass
        
        # Extract ERP values using the working approach
        for i in range(len(lines) - 5):
            # P50 Amplitude
            if lines[i] == 'P50' and i+1 < len(lines) and lines[i+1] == 'Standard':
                if i+2 < len(lines):
                    try:
                        values['P50 Amplitude'] = float(lines[i+2])
                    except ValueError:
                        pass
            
            # P3b Amplitude and Latency
            if lines[i] == 'P3b' and i+1 < len(lines) and lines[i+1] == 'Target':
                if i+2 < len(lines) and i+3 < len(lines):
                    try:
                        values['P3b Amplitude'] = float(lines[i+2])
                        values['P3b Latency'] = float(lines[i+3])
                    except ValueError:
                        pass
            
            # Peak Alpha Frequency
            if lines[i] == 'Peak Alpha' and i+1 < len(lines):
                try:
                    values['Peak Alpha Frequency'] = float(lines[i+1])
                except ValueError:
                    pass
        
        return values
    
    def extract_discussion_interpretations(self, discussion_text: str) -> Dict:
        """Extract interpretations from Study Discussion (if needed for comparison)"""
        interpretations = {}
        
        patterns = {
            'Button Press Accuracy': r'Button Press Accuracy[:\s]*(Low|Normal|High)',
            'Median Reaction Time': r'Median Reaction Time[:\s]*(Delayed|Normal|Fast)',
            'P50 Amplitude': r'P50 Amplitude[:\s]*(Low|Normal|High)',
            'P3b Amplitude': r'P3b Amplitude[:\s]*(Low|Normal|High)',
            'P3b Latency': r'P3b Latency[:\s]*(Delayed|Normal|Fast)',
            'Peak Alpha Frequency': r'Peak Alpha Frequency[:\s]*(Low|Normal|High)'
        }
        
        for metric, pattern in patterns.items():
            match = re.search(pattern, discussion_text, re.IGNORECASE)
            if match:
                interpretations[metric] = match.group(1)
        
        return interpretations
    
    def calculate_interpretations(self, values: Dict) -> Dict:
        """Calculate interpretations based on normal ranges"""
        interpretations = {}
        
        for metric, value in values.items():
            if metric in self.normal_ranges:
                range_info = self.normal_ranges[metric]
                min_val = range_info['min']
                max_val = range_info['max']
                
                if metric in ['Median Reaction Time', 'P3b Latency']:
                    # For timing metrics, higher = delayed
                    if value > max_val:
                        interpretations[metric] = 'Delayed'
                    elif value < min_val:
                        interpretations[metric] = 'Fast'
                    else:
                        interpretations[metric] = 'Normal'
                else:
                    # For other metrics
                    if value < min_val:
                        interpretations[metric] = 'Low'
                    elif value > max_val:
                        interpretations[metric] = 'High'
                    else:
                        interpretations[metric] = 'Normal'
        
        return interpretations
    
    def process_pdf(self, pdf_path: str) -> Dict:
        """Main processing function"""
        print(f"Processing: {pdf_path}")
        
        # Extract text
        text = self.extract_pdf_text(pdf_path)
        if not text:
            return {"error": "Could not extract text from PDF"}
        
        # Extract values and calculate interpretations
        values = self.extract_all_values(text)
        calculated_interpretations = self.calculate_interpretations(values)
        
        # Generate our own study findings and discussion based on the data
        generated_findings = self.generate_study_findings(values, calculated_interpretations)
        generated_discussion = self.generate_study_discussion(values, calculated_interpretations)
        
        # Optional: Extract original interpretations for comparison (but don't use as primary)
        original_discussion_text = ""
        discussion_pattern = r'Study Discussion:?\s*(.*?)(?=Study Protocol|Test Name|Physician|$)'
        discussion_match = re.search(discussion_pattern, text, re.DOTALL | re.IGNORECASE)
        if discussion_match:
            original_discussion_text = discussion_match.group(1).strip()
        
        original_interpretations = self.extract_discussion_interpretations(original_discussion_text)
        
        return {
            'generated_study_findings': generated_findings,
            'generated_study_discussion': generated_discussion,
            'extracted_values': values,
            'our_interpretations': calculated_interpretations,
            'original_interpretations': original_interpretations,  # For comparison only
        }
    
    def print_results(self, results: Dict):
        """Print results in a clean format"""
        print("\n" + "="*80)
        print("                    MEDICAL REPORT ANALYSIS RESULTS")
        print("="*80)
        
        # Generated Study Findings (our analysis)
        if results.get('generated_study_findings'):
            print("\n📋 GENERATED STUDY FINDINGS (Based on Data Analysis):")
            print("-" * 60)
            print(results['generated_study_findings'])
        
        # Generated Study Discussion (our analysis)
        if results.get('generated_study_discussion'):
            print("\n💭 GENERATED STUDY DISCUSSION (Based on Data Analysis):")
            print("-" * 60)
            print(results['generated_study_discussion'])
        
        # Extracted Values
        values = results.get('extracted_values', {})
        if values:
            print("\n📊 EXTRACTED VALUES:")
            print("-" * 50)
            
            # Group by category
            task_metrics = ['Button Press Accuracy', 'False Alarms', 'Median Reaction Time']
            erp_metrics = ['P50 Amplitude', 'P3b Amplitude', 'P3b Latency']
            eeg_metrics = ['Peak Alpha Frequency']
            
            def print_category(title, metrics):
                category_values = {k: v for k, v in values.items() if k in metrics}
                if category_values:
                    print(f"\n  {title}:")
                    for metric, value in category_values.items():
                        # Show value with interpretation
                        interpretation = results.get('our_interpretations', {}).get(metric, 'Unknown')
                        emoji = "🔴" if interpretation.lower() in ['low', 'delayed'] else "🟢" if interpretation.lower() == 'normal' else "🟡"
                        print(f"    {emoji} {metric:<25}: {value} ({interpretation})")
            
            print_category("Task Performance", task_metrics)
            print_category("ERP Features", erp_metrics)
            print_category("EEG Features", eeg_metrics)
        
        # Our Interpretations
        our_interp = results.get('our_interpretations', {})
        if our_interp:
            print("\n🧮 OUR INTERPRETATIONS (Based on Normal Ranges):")
            print("-" * 55)
            for metric, interpretation in our_interp.items():
                emoji = "🔴" if interpretation.lower() in ['low', 'delayed'] else "🟢" if interpretation.lower() == 'normal' else "🟡"
                print(f"  {emoji} {metric:<25}: {interpretation}")
        
        # Original interpretations (for comparison only)
        original_interp = results.get('original_interpretations', {})
        if original_interp:
            print("\n📄 ORIGINAL REPORT INTERPRETATIONS (For Comparison):")
            print("-" * 55)
            for metric, interpretation in original_interp.items():
                emoji = "🔴" if interpretation.lower() in ['low', 'delayed'] else "🟢" if interpretation.lower() == 'normal' else "🟡"
                print(f"  {emoji} {metric:<25}: {interpretation}")
        
        print("\n" + "="*80)
    
    def save_to_csv(self, results: Dict, output_file: str = 'medical_report_analysis.csv'):
        """Save results to CSV"""
        data = []
        
        values = results.get('extracted_values', {})
        our_interp = results.get('our_interpretations', {})
        original_interp = results.get('original_interpretations', {})
        
        all_metrics = set(values.keys()) | set(our_interp.keys()) | set(original_interp.keys())
        
        for metric in sorted(all_metrics):
            data.append({
                'Metric': metric,
                'Value': values.get(metric, 'Not found'),
                'Our_Interpretation': our_interp.get(metric, 'Not found'),
                'Original_Interpretation': original_interp.get(metric, 'Not found'),
                'Match': 'Yes' if our_interp.get(metric) == original_interp.get(metric) else 'No'
            })
        
        if results.get('generated_study_findings'):
            data.append({
                'Metric': 'GENERATED_STUDY_FINDINGS',
                'Value': results['generated_study_findings'],
                'Our_Interpretation': 'Generated from data',
                'Original_Interpretation': 'N/A',
                'Match': 'N/A'
            })
        
        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False)
        print(f"💾 Results saved to {output_file}")


# Usage examples
def process_single_pdf(pdf_path: str):
    """Process a single PDF file"""
    extractor = SimpleMedicalExtractor()
    
    try:
        results = extractor.process_pdf(pdf_path)
        extractor.print_results(results)
        extractor.save_to_csv(results)
        return results
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return None

def process_multiple_pdfs(pdf_paths: list):
    """Process multiple PDF files"""
    extractor = SimpleMedicalExtractor()
    all_results = {}
    
    for pdf_path in pdf_paths:
        try:
            print(f"\n{'='*60}")
            print(f"Processing: {pdf_path}")
            print('='*60)
            
            results = extractor.process_pdf(pdf_path)
            extractor.print_results(results)
            
            # Save individual CSV
            csv_name = pdf_path.replace('.pdf', '_summary.csv')
            extractor.save_to_csv(results, csv_name)
            
            all_results[pdf_path] = results
            
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
            all_results[pdf_path] = {"error": str(e)}
    
    return all_results

if __name__ == "__main__":
    # Process single PDF
    pdf_file = "Patient_30627.pdf"  # Replace with your PDF path
    
    print("🔬 Medical PDF Extractor - Simplified Version")
    print("=" * 60)
    
    results = process_single_pdf(pdf_file)
    
    # Uncomment below to process multiple PDFs
    # pdf_files = ["Patient_30627.pdf", "Patient_12345.pdf", "Patient_67890.pdf"]
    # all_results = process_multiple_pdfs(pdf_files)

🔬 Medical PDF Extractor - Simplified Version
Processing: Patient_30627.pdf

                    MEDICAL REPORT ANALYSIS RESULTS

📋 GENERATED STUDY FINDINGS (Based on Data Analysis):
------------------------------------------------------------
This is an abnormal study due to low button press accuracy, delayed median reaction time, low p50 amplitude, low p3b amplitude, and delayed p3b latency. Collectively, study findings suggest reduced stimulus processing (including evaluation and classification speed), and reduced attentional resources and executive function, and altered sensory gating and filtering. These findings suggest increased risk of cognitive dysfunction and may warrant clinical correlation.

💭 GENERATED STUDY DISCUSSION (Based on Data Analysis):
------------------------------------------------------------
Button Press Accuracy: Low
Median Reaction Time: Delayed
P50 Amplitude: Low
P3b Amplitude: Low
P3b Latency: Delayed
Peak Alpha Frequency: Normal
False Alarms: Normal

Detai