# Final Analysis Framework v2.0 - Focused Results Analysis

This analysis framework focuses on understanding the experimental results from the construction invoice processing study, incorporating controlled experimental design considerations and practical system improvement insights.


## Setup


### Requirements Installation and Verification


In [1]:
import subprocess
import sys
import os
from pathlib import Path
import time

def find_requirements_file():
    """Locate requirements_analysis.txt among the known candidate locations.

    The candidates are relative paths, so they are resolved against the
    current working directory. They are probed in the order listed and the
    first one that exists is returned.

    Returns:
        The first existing candidate as a ``Path``, or ``None`` when none of
        the candidates exists.
    """
    candidates = (
        Path('./requirements/requirements_analysis.txt'),
        Path('../requirements/requirements_analysis.txt'),
        Path('../../requirements/requirements_analysis.txt'),
        Path('./Deliverables-Code/requirements/requirements_analysis.txt'),
    )
    return next((candidate for candidate in candidates if candidate.exists()), None)

def read_requirements_file(requirements_file):
    """Read a pip requirements file and return its requirement lines.

    Blank lines and comments are dropped. A comment is a ``#`` at the start
    of a line or preceded by whitespace (the rule pip itself applies), so a
    trailing ``# note`` is removed while a URL fragment such as ``#egg=name``
    is kept. Each surviving line is later handed to ``pip install`` as one
    argument, which is why trailing comments must not remain attached.

    Args:
        requirements_file: Path (``str`` or ``Path``) of the file to read.

    Returns:
        The requirement strings in file order. An empty list is returned when
        the file cannot be read or decoded; the error is printed, not raised.
    """
    import re  # local import: `re` is not imported before this point in the file

    comment_pattern = re.compile(r'(^|\s+)#.*$')

    try:
        # utf-8-sig decodes plain UTF-8 and also drops a leading BOM, which
        # would otherwise end up glued to the first package name.
        with open(requirements_file, 'r', encoding='utf-8-sig') as f:
            raw_lines = f.readlines()
    except (OSError, UnicodeError) as e:
        print(f"❌ Error reading requirements file: {e}")
        return []

    cleaned_lines = (comment_pattern.sub('', line).strip() for line in raw_lines)
    return [req for req in cleaned_lines if req]

def install_package(requirement, index, total):
    """Install a single requirement with pip and report progress.

    Args:
        requirement: Requirement string passed unchanged to ``pip install``.
        index: Position of this requirement, shown in the progress label.
        total: Total number of requirements, shown in the progress label.

    Returns:
        A ``(success, package_name)`` tuple. ``success`` is ``True`` only when
        pip exits with status 0; ``package_name`` is the display name derived
        from ``requirement``.
    """
    import re  # local import: `re` is not imported before this point in the file

    # Display name = everything before the first version operator, extras
    # bracket, environment marker, direct reference or whitespace. Falls back
    # to the full requirement if that would leave nothing.
    package_name = (
        re.split(r'[<>=!~;\[@\s]', requirement.strip(), maxsplit=1)[0]
        or requirement
    )
    print(f"\n[{index}/{total}] Installing {package_name}...")
    print(f"   Full requirement: {requirement}")

    start_time = time.time()
    try:
        # Two separate limits: '--timeout 120' is handed to pip, while
        # timeout=180 bounds the whole subprocess.
        result = subprocess.run([
            sys.executable, '-m', 'pip', 'install', requirement, '--timeout', '120'
        ], capture_output=True, text=True, timeout=180)  # 3 minute timeout per package

        elapsed_time = time.time() - start_time

        if result.returncode == 0:
            print(f"   ✅ {package_name} installed successfully ({elapsed_time:.1f}s)")
            return True, package_name
        else:
            print(f"   ❌ Failed to install {package_name}")
            if result.stderr:
                # Only the first 200 characters of pip's stderr are shown.
                print(f"   Error: {result.stderr[:200]}...")
            return False, package_name

    except subprocess.TimeoutExpired:
        print(f"   ⏰ Timeout installing {package_name} (>3 minutes)")
        return False, package_name
    except Exception as e:
        # Best effort: any other failure is reported and counted as a failed
        # install so the remaining requirements can still be attempted.
        print(f"   ❌ Exception installing {package_name}: {e}")
        return False, package_name

def install_requirements():
    """Install every requirement listed in requirements_analysis.txt.

    Locates the file, parses it, installs the requirements one at a time and
    prints a summary of the outcome.

    Returns:
        ``True`` when every requirement installed successfully; ``False``
        when the file is missing, contains no requirements, or at least one
        installation failed.
    """
    requirements_file = find_requirements_file()
    if not requirements_file:
        print("❌ requirements_analysis.txt not found in any expected location")
        print("Expected locations:")
        expected_locations = (
            Path('./requirements/requirements_analysis.txt'),
            Path('../requirements/requirements_analysis.txt'),
            Path('../../requirements/requirements_analysis.txt'),
            Path('./Deliverables-Code/requirements/requirements_analysis.txt'),
        )
        for location in expected_locations:
            print(f"   - {location}")
        return False

    print(f"✓ Found requirements file: {requirements_file}")

    requirements_list = read_requirements_file(requirements_file)
    if not requirements_list:
        print("❌ No valid requirements found in file")
        return False

    requirement_count = len(requirements_list)
    divider = "=" * 60
    print(f"\n📦 Installing {requirement_count} packages from requirements file...")
    print(divider)

    # Install sequentially, sorting each package name into its outcome list.
    successful_packages = []
    failed_packages = []
    for position, requirement in enumerate(requirements_list, 1):
        succeeded, package_name = install_package(requirement, position, requirement_count)
        outcome_list = successful_packages if succeeded else failed_packages
        outcome_list.append(package_name)

    # Summary
    print("\n" + divider)
    print("📊 Installation Summary:")
    print(f"   ✅ Successful: {len(successful_packages)}")
    print(f"   ❌ Failed: {len(failed_packages)}")

    if successful_packages:
        # Only the first five names are listed; the rest are counted.
        print(f"\n   Successfully installed: {', '.join(successful_packages[:5])}")
        remaining = len(successful_packages) - 5
        if remaining > 0:
            print(f"   ... and {remaining} more")

    if failed_packages:
        print(f"\n   ⚠️  Failed packages: {', '.join(failed_packages)}")
        print("   You may need to install these manually or check for dependency conflicts.")

    return not failed_packages

def check_package_installed(package_name):
    """Check whether pip reports a package as installed.

    Runs ``pip show <package_name>`` with the current interpreter and treats
    a zero exit status as "installed".

    Args:
        package_name: Name to pass to ``pip show``.

    Returns:
        ``True`` when ``pip show`` exits with status 0, otherwise ``False``.
        ``False`` is also returned when the command cannot be run at all.
    """
    try:
        result = subprocess.run(
            [sys.executable, '-m', 'pip', 'show', package_name],
            capture_output=True,
            text=True,
        )
    except Exception:
        # Best-effort probe: a failure to launch pip counts as "not installed".
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate so the cell can still be interrupted.
        return False
    return result.returncode == 0

def show_pre_installation_status():
    """Print which of the key analysis packages are already installed.

    Each package is probed with ``check_package_installed`` and reported on
    its own line, followed by a short summary.

    Returns:
        A ``(already_installed, need_installation)`` tuple of package-name
        lists, each in probe order.
    """
    key_packages = (
        'pandas', 'numpy', 'matplotlib', 'seaborn', 'scipy',
        'scikit-learn', 'statsmodels', 'tqdm', 'pyyaml',
    )

    print("🔍 Checking current package status...")
    print("-" * 50)

    already_installed, need_installation = [], []
    for name in key_packages:
        if check_package_installed(name):
            marker, note, bucket = "✅", "Already installed", already_installed
        else:
            marker, note, bucket = "❌", "Needs installation", need_installation
        print(f"{marker} {name:<15} - {note}")
        bucket.append(name)

    print("\n📊 Status Summary:")
    print(f"   Already installed: {len(already_installed)}")
    print(f"   Need installation: {len(need_installation)}")

    if need_installation:
        print(f"   Packages to install: {', '.join(need_installation)}")

    return already_installed, need_installation

def verify_imports():
    """Try to import each key library and report the outcome.

    Only ``ImportError`` is treated as a failed import; any other exception
    raised while importing propagates to the caller.

    Returns:
        ``True`` when every listed library imported, otherwise ``False``.
    """
    # Maps module name -> conventional alias. Only the module names are used
    # for the check; the aliases are informational.
    required_libraries = {
        'pandas': 'pd',
        'numpy': 'np',
        'matplotlib.pyplot': 'plt',
        'seaborn': 'sns',
        'scipy': 'scipy',
        'pathlib': 'pathlib',
        'json': 'json',
        'yaml': 'yaml',
        'sklearn': 'sklearn',
        'statsmodels.api': 'sm'
    }

    print("\n🔍 Verifying library imports...")
    print("-" * 40)

    failed_imports = []
    for module_name in required_libraries:
        try:
            __import__(module_name)
        except ImportError as err:
            print(f"❌ {module_name:<20} - FAILED: {str(err)[:50]}...")
            failed_imports.append(module_name)
        else:
            print(f"✅ {module_name:<20} - OK")

    all_imported = not failed_imports
    if all_imported:
        print("\n✅ All required libraries verified successfully")
    else:
        print(f"\n⚠️  Warning: {len(failed_imports)} libraries failed to import")
        print("   Failed libraries:", ', '.join(failed_imports))
        print("   You may need to restart the kernel after installation")

    return all_imported

# Cell driver: report current status, install requirements, then verify imports.
print(
    "=" * 70,
    "🚀 Analysis Requirements Installation & Verification",
    "=" * 70,
    sep="\n",
)

# Snapshot of what is installed before anything is changed
already_installed, need_installation = show_pre_installation_status()

# Time the whole installation pass
print(f"\n⏱️  Starting installation process at {time.strftime('%H:%M:%S')}")
start_total = time.time()
installation_success = install_requirements()
total_time = time.time() - start_total
print(f"\n⏱️  Total installation time: {total_time:.1f} seconds")

# Confirm the key libraries can actually be imported
verification_success = verify_imports()

# Final verdict: installation problems take precedence over import problems
print("\n" + "=" * 70)
if not installation_success:
    print("❌ Installation had issues. Please check the error messages above.")
elif verification_success:
    print("🎉 Setup complete! Ready to proceed with analysis.")
else:
    print("⚠️  Installation complete but some imports failed. You may need to restart the kernel.")
print("=" * 70)

🚀 Analysis Requirements Installation & Verification
🔍 Checking current package status...
--------------------------------------------------
✅ pandas          - Already installed
✅ numpy           - Already installed
✅ matplotlib      - Already installed
✅ seaborn         - Already installed
✅ scipy           - Already installed
✅ scikit-learn    - Already installed
✅ statsmodels     - Already installed
✅ tqdm            - Already installed
✅ pyyaml          - Already installed

📊 Status Summary:
   Already installed: 9
   Need installation: 0

⏱️  Starting installation process at 13:37:10
✓ Found requirements file: ..\requirements\requirements_analysis.txt

📦 Installing 13 packages from requirements file...

[1/13] Installing pandas...
   Full requirement: pandas>=1.5.0
   ✅ pandas installed successfully (1.5s)

[2/13] Installing numpy...
   Full requirement: numpy>=1.21.0
   ✅ numpy installed successfully (1.6s)

[3/13] Installing matplotlib...
   Full requirement: matplotlib>=3.5.0
 

### Root Directory Detection


In [2]:
def find_project_root():
    """
    Find project root by locating directory containing .gitignore and .gitattributes.
    Similar to implementation in 03_pixtral_model.py

    The search starts at this file's directory when ``__file__`` is defined
    (script execution) and at the current working directory otherwise (for
    example in a notebook). It checks the starting directory and then each
    ancestor up to and including the filesystem root.

    Returns:
        Path of the nearest directory that contains both marker files.

    Raises:
        RuntimeError: If no directory on the way up contains both markers.
    """
    from pathlib import Path

    try:
        # resolve() turns a relative __file__ into an absolute path. Without
        # it the parent of a bare file name is '.', whose own parent is '.'
        # again, so the upward walk would stop before checking anything.
        start_path = Path(__file__).resolve().parent
    except NameError:
        # When running in a notebook, start from current working directory
        start_path = Path.cwd()

    # `parents` runs up to the filesystem root, so the root is checked too.
    for candidate in (start_path, *start_path.parents):
        if (candidate / ".gitignore").exists() and (candidate / ".gitattributes").exists():
            return candidate

    raise RuntimeError("Could not find project root (directory containing .gitignore and .gitattributes)")

def setup_project_paths():
    """Populate the module-level project path globals and report on them.

    Sets ROOT_DIR, DELIVERABLES_DIR, DATA_DIR, RESULTS_DIR, ANALYSIS_DIR and
    CONFIG_DIR, prints whether each directory exists, creates the analysis
    directory if needed and appends the project root to ``sys.path``.

    Returns:
        The detected project root (same object as ``ROOT_DIR``).
    """
    global ROOT_DIR, DELIVERABLES_DIR, DATA_DIR, RESULTS_DIR, ANALYSIS_DIR, CONFIG_DIR
    import sys

    # Locate the root first; every other path hangs off it
    ROOT_DIR = find_project_root()
    print(f"✓ Found project root: {ROOT_DIR}")

    DELIVERABLES_DIR = ROOT_DIR / "Deliverables-Code"
    DATA_DIR, RESULTS_DIR, ANALYSIS_DIR, CONFIG_DIR = (
        DELIVERABLES_DIR / leaf for leaf in ("data", "results", "analysis", "config")
    )

    # Report presence of each expected directory, remembering the absent ones
    expected_dirs = (
        ("Deliverables-Code", DELIVERABLES_DIR),
        ("data", DATA_DIR),
        ("results", RESULTS_DIR),
        ("analysis", ANALYSIS_DIR),
        ("config", CONFIG_DIR),
    )
    missing_dirs = []
    for label, location in expected_dirs:
        if not location.exists():
            print(f"⚠ Missing {label} directory: {location}")
            missing_dirs.append(label)
            continue
        print(f"✓ Found {label} directory: {location}")

    if not missing_dirs:
        print("\n✓ All project directories located successfully")
    else:
        print(f"\n⚠ Warning: {len(missing_dirs)} required directories not found")
        print("This may indicate the notebook is being run from an unexpected location")

    # The analysis directory is created regardless of the check above
    ANALYSIS_DIR.mkdir(parents=True, exist_ok=True)

    # Make the project root importable, without adding it twice
    root_entry = str(ROOT_DIR)
    if root_entry not in sys.path:
        sys.path.append(root_entry)
        print("✓ Added project root to Python path")

    return ROOT_DIR

def display_project_structure():
    """Print the project path globals and file counts for key directories.

    Reads the module-level ROOT_DIR, DELIVERABLES_DIR, DATA_DIR, RESULTS_DIR,
    ANALYSIS_DIR and CONFIG_DIR. A count is printed only for directories
    that exist. Returns nothing.
    """
    print("\n=== Project Structure (Key Directories) ===")
    print(f"ROOT_DIR:         {ROOT_DIR}")
    print(f"DELIVERABLES_DIR: {DELIVERABLES_DIR}")
    print(f"DATA_DIR:         {DATA_DIR}")
    print(f"RESULTS_DIR:      {RESULTS_DIR}")
    print(f"ANALYSIS_DIR:     {ANALYSIS_DIR}")
    print(f"CONFIG_DIR:       {CONFIG_DIR}")

    # JSON files directly inside the results directory (not recursive)
    if RESULTS_DIR.exists():
        result_count = len(list(RESULTS_DIR.glob("*.json")))
        print(f"\nResult files found: {result_count}")

    # JSON files directly inside the analysis directory
    if ANALYSIS_DIR.exists():
        analysis_count = len(list(ANALYSIS_DIR.glob("*.json")))
        print(f"Analysis files found: {analysis_count}")

    # CSV files directly inside data/images/metadata
    metadata_dir = DATA_DIR / "images" / "metadata"
    if metadata_dir.exists():
        metadata_count = len(list(metadata_dir.glob("*.csv")))
        print(f"Metadata files found: {metadata_count}")

# Run root directory detection and path setup.
# setup_project_paths() assigns the module-level *_DIR globals that
# display_project_structure() and the final message below read, so it must
# run first.
print("=== Root Directory Detection & Path Setup ===")
project_root = setup_project_paths()
display_project_structure()

# ROOT_DIR.name is the last path component of the detected root directory
print(f"\n🎯 Ready to proceed with analysis from: {ROOT_DIR.name}")

=== Root Directory Detection & Path Setup ===
✓ Found project root: d:\UCSD_MJM
✓ Found Deliverables-Code directory: d:\UCSD_MJM\Deliverables-Code
✓ Found data directory: d:\UCSD_MJM\Deliverables-Code\data
✓ Found results directory: d:\UCSD_MJM\Deliverables-Code\results
✓ Found analysis directory: d:\UCSD_MJM\Deliverables-Code\analysis
✓ Found config directory: d:\UCSD_MJM\Deliverables-Code\config

✓ All project directories located successfully
✓ Added project root to Python path

=== Project Structure (Key Directories) ===
ROOT_DIR:         d:\UCSD_MJM
DELIVERABLES_DIR: d:\UCSD_MJM\Deliverables-Code
DATA_DIR:         d:\UCSD_MJM\Deliverables-Code\data
RESULTS_DIR:      d:\UCSD_MJM\Deliverables-Code\results
ANALYSIS_DIR:     d:\UCSD_MJM\Deliverables-Code\analysis
CONFIG_DIR:       d:\UCSD_MJM\Deliverables-Code\config

Result files found: 15
Analysis files found: 15
Metadata files found: 2

🎯 Ready to proceed with analysis from: UCSD_MJM


### Import Libraries


In [3]:
# Import standard libraries for data analysis and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind, mannwhitneyu, kruskal
import json
import yaml
from pathlib import Path
import warnings
from datetime import datetime
from typing import Dict, List, Tuple, Any, Optional
import re
from collections import defaultdict, Counter
import itertools

# Statistical and machine learning utilities
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Progress tracking
from tqdm import tqdm

# Configure plotting parameters and styles.
# Statement order matters in this section: the matplotlib style is applied
# first, then the rcParams overrides, then the seaborn style and palette.
plt.style.use('default')  # Start with clean default style

# Set up matplotlib and seaborn styling
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'figure.dpi': 100,
    'font.size': 11,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'legend.title_fontsize': 11,
    'axes.grid': True,
    'grid.alpha': 0.3,
    'lines.linewidth': 2,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white'
})

# Set seaborn style and palette.
# NOTE(review): these calls run after rcParams.update() above, so any
# settings seaborn changes may replace values set there — confirm the final
# rcParams are the intended ones.
sns.set_style("whitegrid")
sns.set_palette("husl")

# Custom color palette for consistent visualization.
# Keys mix model categories, model names, metrics and field names.
# 'LMM' and 'Pixtral' share one color, as do 'OCR' and 'DocTR'.
ANALYSIS_COLORS = {
    'LMM': '#2E86AB',        # Blue for LMM models
    'OCR': '#A23B72',        # Purple for OCR models
    'Pixtral': '#2E86AB',    # Blue for Pixtral
    'Llama': '#00A6D6',      # Light blue for Llama
    'DocTR': '#A23B72',      # Purple for DocTR
    'accuracy': '#28A745',    # Green for accuracy metrics
    'cer': '#DC3545',        # Red for error metrics
    'work_order': '#FD7E14',  # Orange for work order
    'total_cost': '#6F42C1',  # Purple for total cost
    'baseline': '#6C757D',    # Gray for baseline/reference
    'improvement': '#20C997'   # Teal for improvements
}

# Configure warnings: FutureWarning and UserWarning are ignored from here on.
# These filters are not scoped to this cell, so later cells are affected too.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Display configuration for pandas output (column count, width, cell width)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Status messages for the notebook log
print("✓ All libraries imported successfully")
print("✓ Plotting parameters configured")
print("✓ Custom color palette defined")
print("✓ Analysis environment ready")

# Show available color palette
print(f"\n📊 Available analysis colors: {list(ANALYSIS_COLORS.keys())}")
print("🎨 Visualization settings optimized for analysis reports")

✓ All libraries imported successfully
✓ Plotting parameters configured
✓ Custom color palette defined
✓ Analysis environment ready

📊 Available analysis colors: ['LMM', 'OCR', 'Pixtral', 'Llama', 'DocTR', 'accuracy', 'cer', 'work_order', 'total_cost', 'baseline', 'improvement']
🎨 Visualization settings optimized for analysis reports


### Data Loading Functions


In [6]:
def load_ground_truth_data(ground_truth_file: Optional[str] = None) -> pd.DataFrame:
    """Load and validate ground truth CSV data.

    Args:
        ground_truth_file: Path of the CSV to load. When ``None``, the file
            ``images/metadata/ground_truth.csv`` under the module-level
            ``DATA_DIR`` is used.

    Returns:
        The loaded DataFrame, with ``filename`` and ``work_order_number``
        read as text and stripped of surrounding whitespace.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        ValueError: If ``filename``, ``work_order_number`` or ``total`` is
            missing from the columns.
    """
    # Set default ground truth file path using DATA_DIR
    if ground_truth_file is None:
        ground_truth_file = DATA_DIR / "images" / "metadata" / "ground_truth.csv"
    else:
        ground_truth_file = Path(ground_truth_file)

    if not ground_truth_file.exists():
        raise FileNotFoundError(f"Ground truth file not found: {ground_truth_file}")

    try:
        # Read both identifier columns as text. If work_order_number were
        # parsed numerically first, leading zeros would be lost and a column
        # containing blanks would become float, turning "123" into "123.0".
        ground_truth = pd.read_csv(
            ground_truth_file,
            dtype={'filename': str, 'work_order_number': str},
        )

        # Validate required columns
        required_columns = {'filename', 'work_order_number', 'total'}
        missing_columns = required_columns - set(ground_truth.columns)
        if missing_columns:
            raise ValueError(f"Missing required columns in ground truth: {missing_columns}")

        # Clean and validate data
        ground_truth['filename'] = ground_truth['filename'].str.strip()
        ground_truth['work_order_number'] = ground_truth['work_order_number'].astype(str).str.strip()

        print(f"INFO: Loaded ground truth data: {len(ground_truth)} records")
        return ground_truth

    except Exception as e:
        # Log for the notebook reader, then let the caller see the failure
        print(f"ERROR: Error loading ground truth data: {e}")
        raise

def discover_results_files() -> Dict[str, List[Path]]:
    """Find ``results-*.json`` files in RESULTS_DIR and group them by model.

    A file is assigned to the first of 'pixtral', 'llama', 'doctr' whose name
    appears in the file name; every file is also listed under 'all'. Each
    list is ordered by modification time, newest first.

    Returns:
        Dict with the keys 'pixtral', 'llama', 'doctr' and 'all', each
        mapping to a list of paths.
    """
    print("INFO: Discovering results files")

    model_keys = ('pixtral', 'llama', 'doctr')
    catalog = {key: [] for key in model_keys}
    catalog['all'] = []

    for candidate in list(RESULTS_DIR.glob("results-*.json")):
        catalog['all'].append(candidate)
        # First matching key wins, mirroring an if/elif chain
        owner = next((key for key in model_keys if key in candidate.name), None)
        if owner is not None:
            catalog[owner].append(candidate)

    # Newest first within every group, including 'all'
    for bucket in catalog.values():
        bucket.sort(key=lambda path: path.stat().st_mtime, reverse=True)

    print(f"INFO: Found {len(catalog['all'])} total results files")
    for key in model_keys:
        if catalog[key]:
            print(f"INFO:   {key}: {len(catalog[key])} files")

    return catalog

def discover_analysis_files() -> Dict[str, List[Path]]:
    """Find ``analysis-*.json`` files in ANALYSIS_DIR and group them by model.

    A file is assigned to the first of 'pixtral', 'llama', 'doctr' whose name
    appears in the file name; every file is also listed under 'all'. Each
    list is ordered by modification time, newest first.

    Returns:
        Dict with the keys 'pixtral', 'llama', 'doctr' and 'all', each
        mapping to a list of paths.
    """
    print("INFO: Discovering analysis files")

    model_keys = ('pixtral', 'llama', 'doctr')
    catalog = {key: [] for key in model_keys}
    catalog['all'] = []

    for candidate in list(ANALYSIS_DIR.glob("analysis-*.json")):
        catalog['all'].append(candidate)
        # First matching key wins, mirroring an if/elif chain
        owner = next((key for key in model_keys if key in candidate.name), None)
        if owner is not None:
            catalog[owner].append(candidate)

    # Newest first within every group, including 'all'
    for bucket in catalog.values():
        bucket.sort(key=lambda path: path.stat().st_mtime, reverse=True)

    print(f"INFO: Found {len(catalog['all'])} total analysis files")
    for key in model_keys:
        if catalog[key]:
            print(f"INFO:   {key}: {len(catalog[key])} files")

    return catalog

def load_results_file(file_path: Path) -> Dict[str, Any]:
    """Read a results JSON file, check its top-level keys and tag it.

    Args:
        file_path: Path of the results file.

    Returns:
        The parsed JSON with an added 'file_info' entry holding the file
        name, path, size in MB (two decimals) and ISO modification time.

    Raises:
        ValueError: If 'metadata' or 'results' is missing.
        Exception: Any other failure while reading or parsing is printed and
            re-raised unchanged.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        # Both top-level sections must be present
        expected_keys = {'metadata', 'results'}
        absent_keys = expected_keys - set(payload.keys())
        if absent_keys:
            raise ValueError(f"Missing required keys in results file: {absent_keys}")

        # Record where the data came from alongside the data itself
        file_stat = file_path.stat()
        size_mb = round(file_stat.st_size / (1024*1024), 2)
        modified = datetime.fromtimestamp(file_stat.st_mtime).isoformat()
        payload['file_info'] = {
            'filename': file_path.name,
            'file_path': str(file_path),
            'file_size_mb': size_mb,
            'modification_time': modified
        }
        return payload

    except Exception as e:
        print(f"ERROR: Error loading results file {file_path}: {e}")
        raise

def load_analysis_file(file_path: Path) -> Dict[str, Any]:
    """Read an analysis JSON file, check its top-level keys and tag it.

    Args:
        file_path: Path of the analysis file.

    Returns:
        The parsed JSON with an added 'file_info' entry holding the file
        name, path, size in MB (two decimals) and ISO modification time.

    Raises:
        ValueError: If 'metadata', 'summary' or 'extracted_data' is missing.
        Exception: Any other failure while reading or parsing is printed and
            re-raised unchanged.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        # All three top-level sections must be present
        expected_keys = {'metadata', 'summary', 'extracted_data'}
        absent_keys = expected_keys - set(payload.keys())
        if absent_keys:
            raise ValueError(f"Missing required keys in analysis file: {absent_keys}")

        # Record where the data came from alongside the data itself
        file_stat = file_path.stat()
        size_mb = round(file_stat.st_size / (1024*1024), 2)
        modified = datetime.fromtimestamp(file_stat.st_mtime).isoformat()
        payload['file_info'] = {
            'filename': file_path.name,
            'file_path': str(file_path),
            'file_size_mb': size_mb,
            'modification_time': modified
        }
        return payload

    except Exception as e:
        print(f"ERROR: Error loading analysis file {file_path}: {e}")
        raise

def load_all_results(model_types: List[str] = None) -> Dict[str, List[Dict]]:
    """Load every discovered results file for the requested model types.

    Args:
        model_types: Model keys to load. ``None`` means 'pixtral', 'llama'
            and 'doctr'. Keys unknown to the discovery step are skipped.

    Returns:
        Dict mapping each loaded model key to its list of parsed files.
        Files that fail to load are reported and left out.
    """
    print("INFO: Loading all results files")

    requested = ['pixtral', 'llama', 'doctr'] if model_types is None else model_types
    discovered = discover_results_files()

    all_results = {}
    for model_type in requested:
        if model_type not in discovered:
            continue
        loaded = []
        all_results[model_type] = loaded
        for file_path in discovered[model_type]:
            try:
                loaded.append(load_results_file(file_path))
            except Exception as e:
                # One unreadable file must not abort the whole load
                print(f"WARNING: Skipping corrupted results file {file_path}: {e}")

    total_loaded = sum(map(len, all_results.values()))
    print(f"INFO: Loaded {total_loaded} results files across {len(all_results)} model types")

    return all_results

def load_all_analysis(model_types: List[str] = None) -> Dict[str, List[Dict]]:
    """Load every discovered analysis file for the requested model types.

    Args:
        model_types: Model keys to load. ``None`` means 'pixtral', 'llama'
            and 'doctr'. Keys unknown to the discovery step are skipped.

    Returns:
        Dict mapping each loaded model key to its list of parsed files.
        Files that fail to load are reported and left out.
    """
    print("INFO: Loading all analysis files")

    requested = ['pixtral', 'llama', 'doctr'] if model_types is None else model_types
    discovered = discover_analysis_files()

    all_analysis = {}
    for model_type in requested:
        if model_type not in discovered:
            continue
        loaded = []
        all_analysis[model_type] = loaded
        for file_path in discovered[model_type]:
            try:
                loaded.append(load_analysis_file(file_path))
            except Exception as e:
                # One unreadable file must not abort the whole load
                print(f"WARNING: Skipping corrupted analysis file {file_path}: {e}")

    total_loaded = sum(map(len, all_analysis.values()))
    print(f"INFO: Loaded {total_loaded} analysis files across {len(all_analysis)} model types")

    return all_analysis

def select_files_interactive(file_type: str = "results") -> List[Path]:
    """Let the user pick discovered files by number at a prompt.

    Lists the discovered files with a model tag and modification time, then
    keeps prompting until at least one valid number is entered or the
    "load all" option is chosen.

    Args:
        file_type: Either "results" or "analysis".

    Returns:
        The selected paths in the order entered, every discovered file when
        "load all" is chosen, or an empty list when nothing was discovered.

    Raises:
        ValueError: If ``file_type`` is neither "results" nor "analysis".
    """
    if file_type == "results":
        title, files_dict = "Available Results Files", discover_results_files()
    elif file_type == "analysis":
        title, files_dict = "Available Analysis Files", discover_analysis_files()
    else:
        raise ValueError("file_type must be 'results' or 'analysis'")

    all_files = files_dict['all']
    if not all_files:
        print(f"No {file_type} files found.")
        return []

    # Tag shown next to each file name; the first matching key wins
    model_tags = (('pixtral', " [Pixtral]"), ('llama', " [Llama]"), ('doctr', " [DocTR]"))

    print(f"\n{title}:")
    print("-" * 50)
    for number, file_path in enumerate(all_files, 1):
        model_info = next((tag for key, tag in model_tags if key in file_path.name), "")
        mod_time = datetime.fromtimestamp(file_path.stat().st_mtime)
        print(f"{number:2d}. {file_path.name}{model_info}")
        print(f"     Modified: {mod_time.strftime('%Y-%m-%d %H:%M:%S')}")

    # The option after the last file stands for "everything"
    load_all_option = len(all_files) + 1
    print(f"\n{load_all_option}. Load all files")

    while True:
        choice = input(f"\nSelect files (comma-separated numbers, or {load_all_option} for all): ")

        if choice.strip() == str(load_all_option):
            return all_files

        # Every token must parse before any of them is looked up
        try:
            numbers = [int(token.strip()) for token in choice.split(',')]
        except ValueError:
            print("Please enter valid numbers separated by commas.")
            continue

        selected_files = []
        for number in numbers:
            if 1 <= number <= len(all_files):
                selected_files.append(all_files[number - 1])
            else:
                print(f"Invalid choice: {number}")

        if not selected_files:
            print("No valid files selected.")
            continue

        print(f"\nSelected {len(selected_files)} file(s):")
        for file_path in selected_files:
            print(f"  - {file_path.name}")
        return selected_files

def create_comprehensive_dataset() -> Dict[str, Any]:
    """Bundle ground truth and per-model analysis data into one dictionary.

    Returns:
        A dictionary with three top-level keys:

        * ``'ground_truth'`` - whatever ``load_ground_truth_data()`` returns.
        * ``'model_data'`` - the entries of ``load_all_analysis()`` whose
          value is truthy, keyed by model type.
        * ``'metadata'`` - creation timestamp, model and experiment counts,
          and the string paths of the data sources.
    """
    print("INFO: Creating comprehensive dataset")

    # Ground truth first, then the analysis files (the processed results)
    truth_records = load_ground_truth_data()
    analyses_by_model = load_all_analysis()

    # Assemble the metadata in pieces; counts are filled in after the loop
    created_at = datetime.now().isoformat()
    source_paths = {
        'ground_truth_file': str(DATA_DIR / "images" / "metadata" / "ground_truth.csv"),
        'results_directory': str(RESULTS_DIR),
        'analysis_directory': str(ANALYSIS_DIR),
    }
    summary = {
        'created_timestamp': created_at,
        'total_models': 0,
        'total_experiments': 0,
        'data_sources': source_paths,
    }

    per_model = {}
    dataset = {
        'ground_truth': truth_records,
        'model_data': per_model,
        'metadata': summary,
    }

    experiment_count = 0
    for model_name, model_analyses in analyses_by_model.items():
        # Model types without any analyses are left out of the dataset
        if not model_analyses:
            continue
        per_model[model_name] = model_analyses
        experiment_count += len(model_analyses)
        print(f"INFO: Added {len(model_analyses)} experiments for {model_name}")

    model_count = len(per_model)
    summary['total_models'] = model_count
    summary['total_experiments'] = experiment_count

    print(f"INFO: Comprehensive dataset created with {model_count} models and {experiment_count} experiments")

    return dataset

# Bootstrap the data layer: check directories, index files, load datasets
print("ℹ️  Initializing data loading functions")

# Create any directory the loaders rely on that is not present yet
required_dirs = [RESULTS_DIR, ANALYSIS_DIR, DATA_DIR / "images" / "metadata"]
for dir_path in required_dirs:
    if dir_path.exists():
        continue
    print(f"WARNING: Creating missing directory: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)

# Index the results and analysis files that are available
available_results = discover_results_files()
available_analysis = discover_analysis_files()

# Ground truth is best-effort here: any failure leaves the global as None
try:
    GROUND_TRUTH_DATA = load_ground_truth_data()
    print(f"✅ Ground truth loaded: {len(GROUND_TRUTH_DATA)} records")
except Exception as e:
    print(f"WARNING: Could not load ground truth data: {e}")
    GROUND_TRUTH_DATA = None

# Same best-effort policy for the combined dataset used by later cells
try:
    COMPREHENSIVE_DATASET = create_comprehensive_dataset()
    print("✅ Comprehensive dataset created successfully")
except Exception as e:
    print(f"WARNING: Could not create comprehensive dataset: {e}")
    COMPREHENSIVE_DATASET = None

# Report what was found
print("\n📊 Data Loading Summary:")
if GROUND_TRUTH_DATA is None:
    print("   • Ground truth records: Not available")
else:
    print(f"   • Ground truth records: {len(GROUND_TRUTH_DATA)}")
print(f"   • Results files found: {len(available_results['all'])}")
print(f"   • Analysis files found: {len(available_analysis['all'])}")

# Per-model breakdown, first for results and then for analysis files;
# a section is printed only when its 'all' entry is non-empty
for _heading, _catalog in (
    ("\n   Results by model type:", available_results),
    ("\n   Analysis by model type:", available_analysis),
):
    if not _catalog['all']:
        continue
    print(_heading)
    for model_type, files in _catalog.items():
        # 'all' is the aggregate entry, not a model type; skip empty types too
        if model_type == 'all' or not files:
            continue
        print(f"     - {model_type.title()}: {len(files)} files")

print("\n✅ Data loading functions ready for analysis")

ℹ️  Initializing data loading functions
INFO: Discovering results files
INFO: Found 15 total results files
INFO:   pixtral: 4 files
INFO:   llama: 4 files
INFO:   doctr: 7 files
INFO: Discovering analysis files
INFO: Found 15 total analysis files
INFO:   pixtral: 4 files
INFO:   llama: 4 files
INFO:   doctr: 7 files
INFO: Loaded ground truth data: 549 records
✅ Ground truth loaded: 549 records
INFO: Creating comprehensive dataset
INFO: Loaded ground truth data: 549 records
INFO: Loading all analysis files
INFO: Discovering analysis files
INFO: Found 15 total analysis files
INFO:   pixtral: 4 files
INFO:   llama: 4 files
INFO:   doctr: 7 files
INFO: Loaded 15 analysis files across 3 model types
INFO: Added 4 experiments for pixtral
INFO: Added 4 experiments for llama
INFO: Added 7 experiments for doctr
INFO: Comprehensive dataset created with 3 models and 15 experiments
✅ Comprehensive dataset created successfully

📊 Data Loading Summary:
   • Ground truth records: 549
   • Results file

## Section 1: Executive Summary


### Experimental Design & Controlled Variables

*Placeholder for discussion of controlled experimental design, image quality control, content standardization, and design rationale.*


### Cell 1.1: Project Context & Key Findings


In [6]:
# Create Primary Performance Comparison Bar Chart
# Side-by-side comparison of total accuracy for all LMM trials vs all OCR trials
# Roll up across all prompts and queries


In [7]:
# Create Model Type Breakdown Bar Chart
# Break down into model types within each category
# (LMM-Pixtral, LMM-Llama, OCR with all 7 recognition models)
# Group by category and order by performance


In [8]:
# Create Complete Model Performance Bar Chart
# All models organized by performance, color coded by category (LMM vs OCR only)
# Include 85% accuracy reference line for industry automation standards


*Placeholder for key findings discussion and business case establishment.*


## Section 2: Cross-Model Performance Comparison


### Cell 2.1: Comprehensive Model Performance Analysis


In [9]:
# Create LMM Models vs Prompts Heatmap (Accuracy)
# Pixtral/Llama (rows) × Prompt types (columns) with accuracy values


In [10]:
# Create LMM Models vs Prompts Heatmap (CER)
# Pixtral/Llama (rows) × Prompt types (columns) with CER values


In [11]:
# Create LMM Prompts vs Query Heatmap (Accuracy)
# Prompt types (rows) × Query types (Work Order/Total Cost) with accuracy values


In [12]:
# Create LMM Prompts vs Query Heatmap (CER)
# Prompt types (rows) × Query types (Work Order/Total Cost) with CER values


In [13]:
# Create All Models vs Query Heatmap (Accuracy)
# All models including OCR (rows) × Query types (columns) with accuracy values


In [14]:
# Create All Models vs Query Heatmap (CER)
# All models including OCR (rows) × Query types (columns) with CER values


*Placeholder for analysis of LMM model responses to different prompt strategies, optimal prompt-model combinations, and CER pattern relationships.*


### Cell 2.2: Model Consistency Analysis


In [15]:
# Create Coefficient of Variation Bar Chart
# Performance stability across prompts for each model


In [16]:
# Create Min-Max Range Visualization
# Performance ranges to identify most/least consistent models


*Placeholder for evaluation of performance stability across different conditions.*


## Section 3: Error Pattern Taxonomy & System Improvement Insights


### Cell 3.1: Systematic Error Analysis


In [17]:
# Create Error Pattern Examples visualization
# Visual examples of each error category with actual vs. expected results


In [18]:
# Create Post-Processing Opportunity Assessment
# Estimate potential accuracy improvements for each error type


*Placeholder for identification of patterns that could be addressed through post-processing.*


### Cell 3.2: Error Classification System


In [19]:
# Create Error Type Distribution Pie Charts
# Separate charts for Work Order vs. Total Cost errors


In [20]:
# Create Error Frequency Heatmap
# Error types (rows) × Models (columns)


*Placeholder for categorization and quantification of different types of failures.*


### Cell 3.3: Failure Mode Deep Dive


In [21]:
# Create Failure Severity Distribution
# Histogram of error magnitudes


In [22]:
# Create Model Robustness Comparison
# How models handle edge cases


*Placeholder for understanding catastrophic vs. graceful degradation patterns.*


## Section 4: Prompt Engineering Effectiveness Analysis


### Cell 4.1: Prompt Strategy Performance


In [23]:
# Create Prompt Performance Matrix
# Accuracy gains/losses by prompt type across models


In [24]:
# Create Prompt-Model Interaction Effects
# Line graphs showing how each model responds to different prompts


*Placeholder for quantifying effectiveness of different prompting approaches.*


## Section 5: Field-Specific Performance Deep Dive


### Cell 5.1: Work Order vs. Total Cost Performance Differential


In [25]:
# Create Field Performance Comparison
# Side-by-side accuracy for each field across all models


In [26]:
# Create Performance Gap Analysis
# Difference between Total Cost and Work Order accuracy by model


*Placeholder for understanding why models excel at one field but struggle with another.*


## Section 6: Character Error Rate (CER) Deep Analysis


### Cell 6.1: CER Distribution Analysis


In [27]:
# Create CER Distribution Histograms
# Separate for Work Order and Total Cost


In [28]:
# Create Model CER Comparison Box Plots
# Show ranges and outliers


*Placeholder for understanding the spread and clustering of character-level errors.*


## Section 7: Computational Efficiency Analysis


### Cell 7.1: Performance per Resource Unit


In [29]:
# Create Efficiency Frontier Plot
# Accuracy vs. computational cost scatter plot


In [30]:
# Create Cost-Benefit Analysis
# ROI calculations for different model choices


## Section 8: Statistical Overview & Significance Testing


### Cell 8.1: Statistical Summary


In [31]:
# Create Performance Distribution Box Plots
# Accuracy ranges across all model/prompt combinations


In [32]:
# Create Statistical Significance Matrix
# P-values for key comparisons


*Placeholder for high-level statistical summary of all results.*


## Section 9: Synthesis & Key Insights


### Cell 9.1: Model Selection Decision Matrix


In [33]:
# Create Multi-Criteria Decision Matrix
# Weighted scoring across accuracy, speed, cost


In [34]:
# Create Use Case Recommendations
# Different models for different deployment scenarios


*Placeholder for providing clear guidance for model choice based on different criteria.*


### Cell 9.2: System Improvement Roadmap


In [35]:
# Create Improvement Opportunity Matrix
# Effort vs. Impact for different enhancement areas


In [36]:
# Create Implementation Timeline
# Suggested sequence for system improvements


*Placeholder for prioritizing enhancement opportunities based on analysis findings.*


### Cell 9.3: Unexpected Findings & Future Research


*Create Unexpected Findings Highlight* 
*Key discoveries and their implications*


*Create Future Research Opportunities*
*Areas identified for continued investigation*


*Placeholder for highlighting discoveries not anticipated in initial research design.*
