In [6]:
"""
E+ Workflow Dependency Tracker and Validator

This script analyzes the E+ workflow, tracks dependencies between steps,
and validates that inputs are properly generated and ready to use.
"""

import os
import json
import logging
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Set
import pandas as pd
import glob

class WorkflowStep:
    """Represents a single step in the workflow"""
    def __init__(self, name: str, enabled_flag: str, config_section: str):
        self.name = name
        self.enabled_flag = enabled_flag
        self.config_section = config_section
        self.inputs = []
        self.outputs = []
        self.dependencies = []
        self.validation_rules = []

class WorkflowTracker:
    """Tracks and validates the E+ workflow execution"""
    
    def __init__(self, job_output_dir: str):
        self.job_output_dir = Path(job_output_dir)
        self.job_id = self.job_output_dir.name
        self.base_dir = self.job_output_dir.parent.parent
        
        # Setup logging
        self.setup_logging()
        
        # Define workflow steps with their dependencies
        self.workflow_steps = self._define_workflow_steps()
        
        # Track execution status
        self.execution_status = {}
        
    def setup_logging(self):
        """Setup logging configuration"""
        log_file = self.job_output_dir / "workflow_tracker.log"
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
        
    def _define_workflow_steps(self) -> Dict[str, WorkflowStep]:
        """Define all workflow steps and their dependencies"""
        steps = {}
        
        # 1. IDF Creation Step
        idf_step = WorkflowStep(
            name="IDF Creation",
            enabled_flag="perform_idf_creation",
            config_section="idf_creation"
        )
        idf_step.inputs = [
            ("config", "user_configs_folder"),
            ("data", "idf_templates"),
            ("data", "weather_files")
        ]
        idf_step.outputs = [
            ("output_IDFs", "*.idf"),
            ("idf_tracker.json", None)
        ]
        steps["idf_creation"] = idf_step
        
        # 2. Simulation Step
        sim_step = WorkflowStep(
            name="Simulation",
            enabled_flag="run_simulations",
            config_section="idf_creation"
        )
        sim_step.inputs = [
            ("output_IDFs", "*.idf"),
            ("weather", "*.epw")
        ]
        sim_step.outputs = [
            ("Sim_Results", "*.sql"),
            ("Sim_Results", "*.htm"),
            ("Sim_Results", "*.csv")
        ]
        sim_step.dependencies = ["idf_creation"]
        steps["simulation"] = sim_step
        
        # 3. Parsing Step
        parse_step = WorkflowStep(
            name="Parsing",
            enabled_flag="perform_parsing",
            config_section="parsing"
        )
        parse_step.inputs = [
            ("Sim_Results", "*.sql"),
            ("output_IDFs", "*.idf")
        ]
        parse_step.outputs = [
            ("parsed_data", "*.parquet"),
            ("parsed_data/idf_data", "*.parquet"),
            ("parsed_data/output_data", "*.parquet")
        ]
        parse_step.dependencies = ["simulation"]
        steps["parsing"] = parse_step
        
        # 4. Modification Step
        mod_step = WorkflowStep(
            name="Modification",
            enabled_flag="perform_modification",
            config_section="modification"
        )
        mod_step.inputs = [
            ("output_IDFs", "*.idf"),
            ("parsed_data", "*.parquet")
        ]
        mod_step.outputs = [
            ("modified_idfs", "*.idf"),
            ("modified_idfs/modification_summary.json", None),
            ("modified_idfs/*/modification_report.json", None)
        ]
        mod_step.dependencies = ["parsing"]
        steps["modification"] = mod_step
        
        # 5. Modified Simulation Step
        mod_sim_step = WorkflowStep(
            name="Modified Simulation",
            enabled_flag="run_modified_simulations",
            config_section="modification.post_modification"
        )
        mod_sim_step.inputs = [
            ("modified_idfs", "*.idf")
        ]
        mod_sim_step.outputs = [
            ("Modified_Sim_Results", "*.sql"),
            ("Modified_Sim_Results", "*.csv")
        ]
        mod_sim_step.dependencies = ["modification"]
        steps["modified_simulation"] = mod_sim_step
        
        # 6. Modified Parsing Step
        mod_parse_step = WorkflowStep(
            name="Modified Parsing",
            enabled_flag="parse_modified_results",
            config_section="modification.post_modification"
        )
        mod_parse_step.inputs = [
            ("Modified_Sim_Results", "*.sql"),
            ("modified_idfs", "*.idf")
        ]
        mod_parse_step.outputs = [
            ("parsed_modified_results", "*.parquet")
        ]
        mod_parse_step.dependencies = ["modified_simulation"]
        steps["modified_parsing"] = mod_parse_step
        
        # 7. Validation Step
        val_step = WorkflowStep(
            name="Validation",
            enabled_flag="perform_validation",
            config_section="validation"
        )
        val_step.inputs = [
            ("parsed_data", "*.parquet"),
            ("parsed_modified_results", "*.parquet"),
            ("measured_data.csv", None)
        ]
        val_step.outputs = [
            ("validation_results", "*.json"),
            ("validation_results", "*.html")
        ]
        val_step.dependencies = ["parsing", "modified_parsing"]
        steps["validation"] = val_step
        
        # 8. Sensitivity Analysis Step
        sens_step = WorkflowStep(
            name="Sensitivity Analysis",
            enabled_flag="perform_sensitivity",
            config_section="sensitivity"
        )
        sens_step.inputs = [
            ("parsed_data", "*.parquet"),
            ("parsed_modified_results", "*.parquet")
        ]
        sens_step.outputs = [
            ("sensitivity_results", "*.parquet"),
            ("sensitivity_results", "*.json"),
            ("sensitivity_results", "*.html")
        ]
        sens_step.dependencies = ["parsing", "modified_parsing"]
        steps["sensitivity"] = sens_step
        
        # 9. Surrogate Modeling Step
        surr_step = WorkflowStep(
            name="Surrogate Modeling",
            enabled_flag="perform_surrogate",
            config_section="surrogate"
        )
        surr_step.inputs = [
            ("parsed_data", "*.parquet"),
            ("parsed_modified_results", "*.parquet"),
            ("sensitivity_results", "*.parquet")
        ]
        surr_step.outputs = [
            ("surrogate_models", "*.pkl"),
            ("surrogate_models", "*.json"),
            ("surrogate_models/validation_report.json", None)
        ]
        surr_step.dependencies = ["sensitivity"]
        steps["surrogate"] = surr_step
        
        # 10. Calibration Step
        cal_step = WorkflowStep(
            name="Calibration",
            enabled_flag="perform_calibration",
            config_section="calibration"
        )
        cal_step.inputs = [
            ("surrogate_models", "*.pkl"),
            ("validation_results", "*.json")
        ]
        cal_step.outputs = [
            ("calibration_results", "*.json"),
            ("calibration_results", "*.parquet")
        ]
        cal_step.dependencies = ["surrogate", "validation"]
        steps["calibration"] = cal_step
        
        return steps
    
    def check_file_exists(self, relative_path: str, pattern: Optional[str] = None) -> Tuple[bool, List[str]]:
        """Check if file(s) exist in the job output directory"""
        full_path = self.job_output_dir / relative_path
        
        if pattern:
            # Check for pattern match
            search_path = full_path / pattern if full_path.is_dir() else full_path.parent / pattern
            matches = glob.glob(str(search_path))
            return len(matches) > 0, matches
        else:
            # Check for specific file
            return full_path.exists(), [str(full_path)] if full_path.exists() else []
    
    def validate_step_inputs(self, step: WorkflowStep) -> Dict[str, bool]:
        """Validate all inputs for a workflow step"""
        input_status = {}
        
        for input_path, pattern in step.inputs:
            exists, files = self.check_file_exists(input_path, pattern)
            input_key = f"{input_path}/{pattern}" if pattern else input_path
            input_status[input_key] = {
                "exists": exists,
                "files": files,
                "count": len(files)
            }
            
        return input_status
    
    def validate_step_outputs(self, step: WorkflowStep) -> Dict[str, bool]:
        """Validate all outputs for a workflow step"""
        output_status = {}
        
        for output_path, pattern in step.outputs:
            exists, files = self.check_file_exists(output_path, pattern)
            output_key = f"{output_path}/{pattern}" if pattern else output_path
            output_status[output_key] = {
                "exists": exists,
                "files": files,
                "count": len(files)
            }
            
        return output_status
    
    def check_dependencies_met(self, step_name: str) -> Tuple[bool, List[str]]:
        """Check if all dependencies for a step are met"""
        step = self.workflow_steps.get(step_name)
        if not step:
            return False, [f"Step {step_name} not found"]
        
        missing_deps = []
        for dep in step.dependencies:
            dep_step = self.workflow_steps.get(dep)
            if dep_step:
                # Check if dependency step has completed (all outputs exist)
                output_status = self.validate_step_outputs(dep_step)
                for output, status in output_status.items():
                    if not status["exists"]:
                        missing_deps.append(f"{dep}: {output}")
        
        return len(missing_deps) == 0, missing_deps
    
    def analyze_workflow(self) -> Dict[str, Any]:
        """Analyze the entire workflow and return comprehensive status"""
        analysis = {
            "job_id": self.job_id,
            "job_output_dir": str(self.job_output_dir),
            "timestamp": datetime.now().isoformat(),
            "steps": {}
        }
        
        # Check configuration file
        config_files = list(self.job_output_dir.glob("combined*.json"))
        if config_files:
            with open(config_files[0], 'r') as f:
                config = json.load(f)
                main_config = config.get("main_config", {})
        else:
            main_config = {}
            self.logger.warning("No configuration file found")
        
        # Analyze each step
        for step_name, step in self.workflow_steps.items():
            self.logger.info(f"\n{'='*60}")
            self.logger.info(f"Analyzing step: {step.name}")
            self.logger.info(f"{'='*60}")
            
            # Check if step is enabled
            config_section = main_config
            for section in step.config_section.split('.'):
                config_section = config_section.get(section, {})
            
            enabled = config_section.get(step.enabled_flag, False)
            
            # Check dependencies
            deps_met, missing_deps = self.check_dependencies_met(step_name)
            
            # Validate inputs and outputs
            input_status = self.validate_step_inputs(step)
            output_status = self.validate_step_outputs(step)
            
            # Determine step status
            inputs_ready = all(status["exists"] for status in input_status.values())
            outputs_ready = all(status["exists"] for status in output_status.values())
            
            if not enabled:
                status = "DISABLED"
            elif not deps_met:
                status = "BLOCKED"
            elif not inputs_ready:
                status = "MISSING_INPUTS"
            elif outputs_ready:
                status = "COMPLETED"
            else:
                status = "READY"
            
            step_analysis = {
                "name": step.name,
                "enabled": enabled,
                "status": status,
                "dependencies_met": deps_met,
                "missing_dependencies": missing_deps,
                "inputs": input_status,
                "outputs": output_status,
                "input_summary": {
                    "total": len(input_status),
                    "ready": sum(1 for s in input_status.values() if s["exists"]),
                    "missing": sum(1 for s in input_status.values() if not s["exists"])
                },
                "output_summary": {
                    "total": len(output_status),
                    "ready": sum(1 for s in output_status.values() if s["exists"]),
                    "missing": sum(1 for s in output_status.values() if not s["exists"])
                }
            }
            
            analysis["steps"][step_name] = step_analysis
            
            # Log summary
            self.logger.info(f"Status: {status}")
            self.logger.info(f"Enabled: {enabled}")
            self.logger.info(f"Dependencies met: {deps_met}")
            self.logger.info(f"Inputs ready: {step_analysis['input_summary']['ready']}/{step_analysis['input_summary']['total']}")
            self.logger.info(f"Outputs ready: {step_analysis['output_summary']['ready']}/{step_analysis['output_summary']['total']}")
            
            if missing_deps:
                self.logger.warning(f"Missing dependencies: {missing_deps}")
            
            # Log missing inputs
            for input_name, status in input_status.items():
                if not status["exists"]:
                    self.logger.warning(f"Missing input: {input_name}")
            
            # Log generated outputs
            for output_name, status in output_status.items():
                if status["exists"]:
                    self.logger.info(f"Generated output: {output_name} ({status['count']} files)")
        
        return analysis
    
    def generate_workflow_report(self, analysis: Dict[str, Any]) -> str:
        """Generate a detailed HTML report of the workflow status"""
        html = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>E+ Workflow Analysis Report</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 20px; }}
                h1, h2, h3 {{ color: #333; }}
                .metadata {{ background: #f0f0f0; padding: 10px; border-radius: 5px; margin-bottom: 20px; }}
                .step {{ border: 1px solid #ddd; margin: 10px 0; padding: 15px; border-radius: 5px; }}
                .status-COMPLETED {{ background: #d4f4dd; }}
                .status-READY {{ background: #fff3cd; }}
                .status-BLOCKED {{ background: #f8d7da; }}
                .status-DISABLED {{ background: #e7e7e7; }}
                .status-MISSING_INPUTS {{ background: #ffeaa7; }}
                .summary {{ display: flex; gap: 20px; margin: 10px 0; }}
                .summary-box {{ padding: 10px; background: #f9f9f9; border-radius: 3px; }}
                .file-list {{ font-size: 0.9em; color: #666; margin-left: 20px; }}
                .missing {{ color: #d32f2f; }}
                .exists {{ color: #388e3c; }}
                table {{ border-collapse: collapse; width: 100%; margin: 10px 0; }}
                th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
                th {{ background: #f2f2f2; }}
                .dependency-graph {{ margin: 20px 0; padding: 20px; background: #fafafa; border-radius: 5px; }}
            </style>
        </head>
        <body>
            <h1>E+ Workflow Analysis Report</h1>
            
            <div class="metadata">
                <strong>Job ID:</strong> {analysis['job_id']}<br>
                <strong>Output Directory:</strong> {analysis['job_output_dir']}<br>
                <strong>Analysis Time:</strong> {analysis['timestamp']}
            </div>
            
            <h2>Workflow Summary</h2>
            <table>
                <tr>
                    <th>Step</th>
                    <th>Status</th>
                    <th>Enabled</th>
                    <th>Dependencies Met</th>
                    <th>Inputs Ready</th>
                    <th>Outputs Generated</th>
                </tr>
        """
        
        for step_name, step_data in analysis['steps'].items():
            html += f"""
                <tr class="status-{step_data['status']}">
                    <td>{step_data['name']}</td>
                    <td><strong>{step_data['status']}</strong></td>
                    <td>{'✓' if step_data['enabled'] else '✗'}</td>
                    <td>{'✓' if step_data['dependencies_met'] else '✗'}</td>
                    <td>{step_data['input_summary']['ready']}/{step_data['input_summary']['total']}</td>
                    <td>{step_data['output_summary']['ready']}/{step_data['output_summary']['total']}</td>
                </tr>
            """
        
        html += """
            </table>
            
            <h2>Detailed Step Analysis</h2>
        """
        
        for step_name, step_data in analysis['steps'].items():
            html += f"""
            <div class="step status-{step_data['status']}">
                <h3>{step_data['name']}</h3>
                <div class="summary">
                    <div class="summary-box">
                        <strong>Status:</strong> {step_data['status']}
                    </div>
                    <div class="summary-box">
                        <strong>Enabled:</strong> {'Yes' if step_data['enabled'] else 'No'}
                    </div>
                    <div class="summary-box">
                        <strong>Dependencies:</strong> {'Met' if step_data['dependencies_met'] else 'Not Met'}
                    </div>
                </div>
            """
            
            if step_data['missing_dependencies']:
                html += f"""
                <div class="missing">
                    <strong>Missing Dependencies:</strong>
                    <ul>
                        {''.join(f'<li>{dep}</li>' for dep in step_data['missing_dependencies'])}
                    </ul>
                </div>
                """
            
            # Inputs section
            html += "<h4>Inputs:</h4><ul>"
            for input_name, status in step_data['inputs'].items():
                class_name = "exists" if status['exists'] else "missing"
                html += f"""
                <li class="{class_name}">
                    {input_name}: {'✓' if status['exists'] else '✗'} ({status['count']} files)
                """
                if status['exists'] and status['count'] <= 5:
                    html += "<div class='file-list'>"
                    for file in status['files']:
                        html += f"{os.path.basename(file)}<br>"
                    html += "</div>"
                html += "</li>"
            html += "</ul>"
            
            # Outputs section
            html += "<h4>Outputs:</h4><ul>"
            for output_name, status in step_data['outputs'].items():
                class_name = "exists" if status['exists'] else "missing"
                html += f"""
                <li class="{class_name}">
                    {output_name}: {'✓' if status['exists'] else '✗'} ({status['count']} files)
                """
                if status['exists'] and status['count'] <= 5:
                    html += "<div class='file-list'>"
                    for file in status['files']:
                        html += f"{os.path.basename(file)}<br>"
                    html += "</div>"
                html += "</li>"
            html += "</ul>"
            
            html += "</div>"
        
        html += """
            <h2>Workflow Dependency Graph</h2>
            <div class="dependency-graph">
                <pre>
IDF Creation
    ↓
Simulation
    ↓
Parsing ─────────────┐
    ↓                 ↓
Modification     Validation
    ↓                 ↓
Modified Simulation   ↓
    ↓                 ↓
Modified Parsing ─────┤
    ↓                 ↓
Sensitivity Analysis  ↓
    ↓                 ↓
Surrogate Modeling ───┤
    ↓                 ↓
Calibration ←─────────┘
                </pre>
            </div>
            
        </body>
        </html>
        """
        
        return html
    
    def validate_data_quality(self) -> Dict[str, Any]:
        """Validate the quality of data at each step"""
        quality_report = {
            "timestamp": datetime.now().isoformat(),
            "checks": {}
        }
        
        # Check parsed data quality
        parsed_data_dir = self.job_output_dir / "parsed_data"
        if parsed_data_dir.exists():
            quality_report["checks"]["parsed_data"] = self._check_parquet_files(parsed_data_dir)
        
        # Check modified results quality
        modified_data_dir = self.job_output_dir / "parsed_modified_results"
        if modified_data_dir.exists():
            quality_report["checks"]["modified_data"] = self._check_parquet_files(modified_data_dir)
        
        # Check sensitivity results
        sens_dir = self.job_output_dir / "sensitivity_results"
        if sens_dir.exists():
            sens_files = list(sens_dir.glob("*.parquet"))
            if sens_files:
                quality_report["checks"]["sensitivity"] = {
                    "file_count": len(sens_files),
                    "files": [f.name for f in sens_files]
                }
        
        return quality_report
    
    def _check_parquet_files(self, directory: Path) -> Dict[str, Any]:
        """Check quality of parquet files in a directory"""
        report = {
            "total_files": 0,
            "total_rows": 0,
            "files": {}
        }
        
        for parquet_file in directory.rglob("*.parquet"):
            try:
                df = pd.read_parquet(parquet_file)
                report["files"][str(parquet_file.relative_to(directory))] = {
                    "rows": len(df),
                    "columns": list(df.columns),
                    "missing_values": df.isnull().sum().to_dict()
                }
                report["total_files"] += 1
                report["total_rows"] += len(df)
            except Exception as e:
                report["files"][str(parquet_file.relative_to(directory))] = {
                    "error": str(e)
                }
        
        return report
    
    def run_full_analysis(self):
        """Run complete workflow analysis and generate reports"""
        self.logger.info("="*80)
        self.logger.info("E+ WORKFLOW DEPENDENCY TRACKER AND VALIDATOR")
        self.logger.info("="*80)
        
        # Run workflow analysis
        analysis = self.analyze_workflow()
        
        # Save analysis as JSON
        analysis_file = self.job_output_dir / "workflow_analysis.json"
        with open(analysis_file, 'w') as f:
            json.dump(analysis, f, indent=2)
        self.logger.info(f"\nSaved analysis to: {analysis_file}")
        
        # Generate HTML report
        html_report = self.generate_workflow_report(analysis)
        report_file = self.job_output_dir / "workflow_analysis_report.html"
        with open(report_file, 'w') as f:
            f.write(html_report)
        self.logger.info(f"Generated HTML report: {report_file}")
        
        # Run data quality checks
        quality_report = self.validate_data_quality()
        quality_file = self.job_output_dir / "data_quality_report.json"
        with open(quality_file, 'w') as f:
            json.dump(quality_report, f, indent=2)
        self.logger.info(f"Generated data quality report: {quality_file}")
        
        # Print summary
        self.logger.info("\n" + "="*80)
        self.logger.info("WORKFLOW SUMMARY")
        self.logger.info("="*80)
        
        status_counts = {}
        for step_data in analysis['steps'].values():
            status = step_data['status']
            status_counts[status] = status_counts.get(status, 0) + 1
        
        for status, count in status_counts.items():
            self.logger.info(f"{status}: {count} steps")
        
        # Identify next actionable steps
        self.logger.info("\n" + "="*80)
        self.logger.info("NEXT ACTIONABLE STEPS")
        self.logger.info("="*80)
        
        for step_name, step_data in analysis['steps'].items():
            if step_data['status'] == 'READY':
                self.logger.info(f"- {step_data['name']} is ready to run")
            elif step_data['status'] == 'MISSING_INPUTS':
                self.logger.info(f"- {step_data['name']} is missing inputs:")
                for input_name, status in step_data['inputs'].items():
                    if not status['exists']:
                        self.logger.info(f"  * {input_name}")


def main():
    """Main entry point for the workflow tracker"""
    import sys
    
    if len(sys.argv) < 2:
        print("Usage: python workflow_tracker.py <job_output_directory>")
        print("\nExample paths from your last run:")
        print("D:\Documents\daily\E_Plus_2040_py\output\5f3924b1-189c-4c95-bf33-52d0602d79d9")
        sys.exit(1)
    
    job_output_dir = sys.argv[1]
    
    if not os.path.exists(job_output_dir):
        print(f"Error: Directory not found: {job_output_dir}")
        sys.exit(1)
    
    # Create and run tracker
    tracker = WorkflowTracker(job_output_dir)
    tracker.run_full_analysis()
    
    print(f"\nAnalysis complete! Check the following files in {job_output_dir}:")
    print("  - workflow_analysis.json: Detailed analysis data")
    print("  - workflow_analysis_report.html: Visual HTML report")
    print("  - data_quality_report.json: Data quality checks")
    print("  - workflow_tracker.log: Detailed execution log")


if __name__ == "__main__":
    main()

Error: Directory not found: --f=c:\Users\aminj\AppData\Roaming\jupyter\runtime\kernel-v35fb68adb961166f7b060bf192bf7b9f4e0a470d4.json


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Surrogate Checker

In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
import json
from datetime import datetime

class SurrogateDataChecker:
    def __init__(self, job_output_dir: str):
        self.job_output_dir = Path(job_output_dir)
        self.results = []
        self.discovered_columns = {}  # Store discovered extra columns
        
        # Define expected structure with columns and types
        self.expected_structure = {
            # IDF Data
            'parsed_data/idf_data/by_category/lighting.parquet': {
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'object_name': 'str',
                    'LightingLevel': 'float',
                    'FractionRadiant': 'float',
                    'FractionVisible': 'float',
                    'ScheduleName': 'str'
                }
            },
            'parsed_data/idf_data/by_category/hvac_equipment.parquet': {
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'object_name': 'str',
                    'object_type': 'str',
                    'MaximumHeatingSupplyAirTemperature': 'float',
                    'MinimumCoolingSupplyAirTemperature': 'float'
                }
            },
            'parsed_data/idf_data/by_category/materials_materials.parquet': {
                'columns': {
                    'building_id': 'str',
                    'material_name': 'str',
                    'material_type': 'str',
                    'Thickness': 'float',
                    'Conductivity': 'float',
                    'Density': 'float',
                    'SpecificHeat': 'float'
                }
            },
            'parsed_data/idf_data/by_category/infiltration.parquet': {
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'object_name': 'str',
                    'DesignFlowRateCalculationMethod': 'str',
                    'DesignFlowRate': 'float',
                    'FlowperZoneFloorArea': 'float'
                }
            },
            'parsed_data/idf_data/by_category/ventilation.parquet': {
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'object_name': 'str',
                    'OutdoorAirMethod': 'str',
                    'OutdoorAirFlowperPerson': 'float'
                }
            },
            # SQL Results
            'parsed_data/sql_results/timeseries/aggregated/daily/zones_daily.parquet': {
                'columns': {
                    'building_id': 'str',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime'
                }
            },
            'parsed_data/sql_results/timeseries/aggregated/daily/hvac_daily.parquet': {
                'columns': {
                    'building_id': 'str',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float'
                }
            },
            'parsed_data/sql_results/timeseries/aggregated/daily/ventilation_daily.parquet': {
                'columns': {
                    'building_id': 'str',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float'
                }
            },
            # Relationships
            'parsed_data/relationships/zone_mappings.parquet': {
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'sql_zone_name': 'str',
                    'zone_index': 'int',
                    'floor_area': 'float',
                    'volume': 'float'
                }
            },
            'parsed_data/metadata/building_registry.parquet': {
                'columns': {
                    'building_id': 'str',
                    'building_type': 'str',
                    'total_floor_area': 'float',
                    'num_zones': 'int'
                }
            },
            # Modified results
            'parsed_modified_results/sql_results/timeseries/aggregated/daily/zones_daily.parquet': {
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float'
                }
            },
            # Modifications
            'modified_idfs/modifications_detail_*.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'variant_id': 'str',
                    'category': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'field_name': 'str',
                    'original_value': ['str', 'float'],
                    'new_value': ['str', 'float'],
                    'relative_change': 'float'
                }
            },
            # Sensitivity
            'sensitivity_results/sensitivity_for_surrogate.parquet': {
                'columns': {
                    'parameter': 'str',
                    'sensitivity_score': 'float',
                    'elasticity': 'float',
                    'p_value': 'float'
                }
            }
        }
    
    def convert_to_native_types(self, obj: Any) -> Any:
        """Convert numpy types to native Python types for JSON serialization."""
        if isinstance(obj, dict):
            return {k: self.convert_to_native_types(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self.convert_to_native_types(item) for item in obj]
        elif isinstance(obj, tuple):
            return tuple(self.convert_to_native_types(item) for item in obj)
        elif isinstance(obj, pd.Timestamp):
            # Convert pandas Timestamp to ISO format string
            return obj.isoformat()
        elif isinstance(obj, datetime):
            # Convert datetime to ISO format string
            return obj.isoformat()
        elif isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            if np.isnan(obj):
                return None
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif pd.isna(obj):
            return None
        else:
            return obj
    
    def check_file_exists(self, file_path: str) -> Tuple[bool, Optional[str]]:
        """Check if file exists and return its absolute path."""
        full_path = self.job_output_dir / file_path
        
        # Also check with wildcards for files like modifications_detail_*.parquet
        if '*' in file_path:
            parent = full_path.parent
            pattern = full_path.name
            if parent.exists():
                matching_files = list(parent.glob(pattern))
                if matching_files:
                    return True, str(matching_files[0])
            return False, None
        
        return full_path.exists(), str(full_path) if full_path.exists() else None
    
    def get_dtype_string(self, dtype) -> str:
        """Convert numpy/pandas dtype to simple string."""
        dtype_str = str(dtype)
        
        if 'int' in dtype_str:
            return 'int'
        elif 'float' in dtype_str:
            return 'float'
        elif 'object' in dtype_str or 'string' in dtype_str:
            return 'str'
        elif 'datetime' in dtype_str:
            return 'datetime'
        elif 'bool' in dtype_str:
            return 'bool'
        else:
            return dtype_str
    
    def get_sample_values(self, series: pd.Series, n_samples: int = 5) -> List:
        """Get sample values from a series."""
        unique_vals = series.dropna().unique()
        if len(unique_vals) <= n_samples:
            return [self.convert_to_native_types(val) for val in unique_vals]
        else:
            # Get a mix of values
            samples = []
            if len(unique_vals) > 0:
                # Add min/max for numeric
                if pd.api.types.is_numeric_dtype(series):
                    samples.append(self.convert_to_native_types(series.min()))
                    samples.append(self.convert_to_native_types(series.max()))
                    # Add some random samples
                    remaining = n_samples - 2
                    if remaining > 0:
                        random_samples = series.dropna().sample(n=min(remaining, len(series))).tolist()
                        samples.extend([self.convert_to_native_types(val) for val in random_samples[:remaining]])
                else:
                    # For non-numeric, just take first n_samples
                    samples = [self.convert_to_native_types(val) for val in unique_vals[:n_samples]]
            return samples
    
    def check_parquet_file(self, file_path: str, expected_columns: Dict[str, any]) -> Dict:
        """Check a parquet file's structure."""
        exists, abs_path = self.check_file_exists(file_path)
        
        result = {
            'file': file_path,
            'exists': exists,
            'path': abs_path,
            'row_count': 0,
            'columns': {},
            'extra_columns': {}  # Store discovered columns
        }
        
        if exists and abs_path:
            try:
                # Read parquet file
                df = pd.read_parquet(abs_path)
                result['row_count'] = len(df)
                
                # Get actual columns and types
                actual_columns = {}
                for col in df.columns:
                    actual_columns[col] = self.get_dtype_string(df[col].dtype)
                
                # Store discovered columns info for this file
                if file_path not in self.discovered_columns:
                    self.discovered_columns[file_path] = {}
                
                # Compare with expected
                for exp_col, exp_type in expected_columns.items():
                    if exp_col in actual_columns:
                        actual_type = actual_columns[exp_col]
                        
                        # Handle multiple expected types
                        if isinstance(exp_type, list):
                            type_match = actual_type in exp_type
                        else:
                            type_match = (actual_type == exp_type) or \
                                       (exp_type == 'str' and actual_type == 'object')
                        
                        result['columns'][exp_col] = {
                            'expected_type': exp_type,
                            'actual_type': actual_type,
                            'exists': True,
                            'type_match': type_match,
                            'null_count': int(df[exp_col].isnull().sum()),
                            'unique_count': int(df[exp_col].nunique()),
                            'sample_values': self.get_sample_values(df[exp_col])
                        }
                    else:
                        result['columns'][exp_col] = {
                            'expected_type': exp_type,
                            'actual_type': None,
                            'exists': False,
                            'type_match': False,
                            'null_count': None,
                            'unique_count': None,
                            'sample_values': []
                        }
                
                # Add unexpected columns (discovered extras)
                for col in actual_columns:
                    if col not in expected_columns:
                        result['extra_columns'][col] = {
                            'actual_type': actual_columns[col],
                            'null_count': int(df[col].isnull().sum()),
                            'unique_count': int(df[col].nunique()),
                            'sample_values': self.get_sample_values(df[col]),
                            'stats': self.get_column_stats(df[col])
                        }
                        # Store in discovered columns
                        self.discovered_columns[file_path][col] = result['extra_columns'][col]
                
            except Exception as e:
                result['error'] = str(e)
        
        return result
    
    def get_column_stats(self, series: pd.Series) -> Dict:
        """Get statistics for a column."""
        stats = {
            'total_count': int(len(series)),
            'non_null_count': int(series.count()),
            'null_percentage': float((series.isnull().sum() / len(series) * 100) if len(series) > 0 else 0)
        }
        
        if pd.api.types.is_numeric_dtype(series):
            # Handle potential NaN values
            if series.count() > 0:  # If there are non-null values
                stats.update({
                    'mean': self.convert_to_native_types(series.mean()),
                    'std': self.convert_to_native_types(series.std()),
                    'min': self.convert_to_native_types(series.min()),
                    'max': self.convert_to_native_types(series.max()),
                    'q25': self.convert_to_native_types(series.quantile(0.25)),
                    'q50': self.convert_to_native_types(series.quantile(0.50)),
                    'q75': self.convert_to_native_types(series.quantile(0.75))
                })
            else:
                # All values are null
                stats.update({
                    'mean': None,
                    'std': None,
                    'min': None,
                    'max': None,
                    'q25': None,
                    'q50': None,
                    'q75': None
                })
        elif pd.api.types.is_string_dtype(series) or series.dtype == 'object':
            value_counts = series.value_counts()
            stats.update({
                'unique_values': int(len(value_counts)),
                'most_common': {str(k): int(v) for k, v in value_counts.head(5).items()} if len(value_counts) > 0 else {}
            })
        
        return stats
    
    def check_all_files(self) -> pd.DataFrame:
        """Check all expected files and create comparison table."""
        print(f"Checking surrogate data structure in: {self.job_output_dir}\n")
        
        all_results = []
        
        for file_path, expected_info in self.expected_structure.items():
            print(f"Checking: {file_path}")
            result = self.check_parquet_file(file_path, expected_info['columns'])
            
            # Create rows for expected columns
            if result['exists']:
                for col_name, col_info in result['columns'].items():
                    all_results.append({
                        'File': file_path.split('/')[-1],
                        'Path': file_path,
                        'File_Exists': '✓',
                        'Rows': result['row_count'],
                        'Column': col_name,
                        'Column_Type': 'Expected',
                        'Expected_Type': col_info['expected_type'],
                        'Actual_Type': col_info['actual_type'],
                        'Column_Exists': '✓' if col_info['exists'] else '✗',
                        'Type_Match': '✓' if col_info['type_match'] else '✗',
                        'Null_Count': col_info['null_count'],
                        'Unique_Values': col_info['unique_count'],
                        'Sample_Values': str(col_info['sample_values'][:3]) if col_info['sample_values'] else 'N/A',
                        'Status': 'OK' if col_info['exists'] and col_info['type_match'] else 'ISSUE'
                    })
                
                # Add rows for extra discovered columns
                for col_name, col_info in result['extra_columns'].items():
                    all_results.append({
                        'File': file_path.split('/')[-1],
                        'Path': file_path,
                        'File_Exists': '✓',
                        'Rows': result['row_count'],
                        'Column': col_name,
                        'Column_Type': 'DISCOVERED',
                        'Expected_Type': 'N/A',
                        'Actual_Type': col_info['actual_type'],
                        'Column_Exists': '✓',
                        'Type_Match': 'N/A',
                        'Null_Count': col_info['null_count'],
                        'Unique_Values': col_info['unique_count'],
                        'Sample_Values': str(col_info['sample_values'][:3]) if col_info['sample_values'] else 'N/A',
                        'Status': 'EXTRA'
                    })
            else:
                all_results.append({
                    'File': file_path.split('/')[-1],
                    'Path': file_path,
                    'File_Exists': '✗',
                    'Rows': 0,
                    'Column': 'N/A',
                    'Column_Type': 'N/A',
                    'Expected_Type': 'N/A',
                    'Actual_Type': 'N/A',
                    'Column_Exists': 'N/A',
                    'Type_Match': 'N/A',
                    'Null_Count': None,
                    'Unique_Values': None,
                    'Sample_Values': 'N/A',
                    'Status': 'MISSING FILE'
                })
        
        return pd.DataFrame(all_results)
    
    def create_discovered_columns_report(self) -> pd.DataFrame:
        """Create a detailed report of all discovered extra columns."""
        discovered_data = []
        
        for file_path, columns in self.discovered_columns.items():
            for col_name, col_info in columns.items():
                row = {
                    'File': file_path.split('/')[-1],
                    'Path': file_path,
                    'Column_Name': col_name,
                    'Data_Type': col_info['actual_type'],
                    'Non_Null_Count': col_info['null_count'],
                    'Unique_Count': col_info['unique_count'],
                    'Null_Percentage': col_info['stats']['null_percentage'],
                    'Sample_Values': ', '.join(map(str, col_info['sample_values'][:5]))
                }
                
                # Add numeric stats if available
                if 'mean' in col_info['stats']:
                    row.update({
                        'Mean': col_info['stats']['mean'],
                        'Std': col_info['stats']['std'],
                        'Min': col_info['stats']['min'],
                        'Max': col_info['stats']['max']
                    })
                
                discovered_data.append(row)
        
        return pd.DataFrame(discovered_data)
    
    def create_summary_report(self, df_results: pd.DataFrame) -> Dict:
        """Create a summary report of the check results."""
        summary = {
            'check_timestamp': datetime.now().isoformat(),
            'job_output_dir': str(self.job_output_dir),
            'total_files_expected': len(self.expected_structure),
            'files_found': len(df_results[df_results['File_Exists'] == '✓']['Path'].unique()),
            'files_missing': len(df_results[df_results['File_Exists'] == '✗']['Path'].unique()),
            'total_expected_columns': len(df_results[df_results['Column_Type'] == 'Expected']),
            'expected_columns_found': len(df_results[(df_results['Column_Type'] == 'Expected') & 
                                                    (df_results['Column_Exists'] == '✓')]),
            'total_discovered_columns': len(df_results[df_results['Column_Type'] == 'DISCOVERED']),
            'total_issues': len(df_results[df_results['Status'].isin(['ISSUE', 'MISSING FILE'])]),
            'missing_files': [],
            'column_issues': [],
            'type_mismatches': [],
            'discovered_extras_summary': {}
        }
        
        # Get missing files
        missing_files = df_results[df_results['File_Exists'] == '✗']['Path'].unique()
        summary['missing_files'] = list(missing_files)
        
        # Get column issues
        column_issues = df_results[(df_results['Column_Exists'] == '✗') & 
                                 (df_results['File_Exists'] == '✓') & 
                                 (df_results['Column_Type'] == 'Expected')]
        for _, row in column_issues.iterrows():
            summary['column_issues'].append(f"{row['File']}: {row['Column']}")
        
        # Get type mismatches
        type_issues = df_results[(df_results['Type_Match'] == '✗') & 
                               (df_results['Column_Exists'] == '✓') & 
                               (df_results['Column_Type'] == 'Expected')]
        for _, row in type_issues.iterrows():
            summary['type_mismatches'].append(
                f"{row['File']}: {row['Column']} (expected {row['Expected_Type']}, got {row['Actual_Type']})"
            )
        
        # Summary of discovered columns by file
        discovered_by_file = df_results[df_results['Column_Type'] == 'DISCOVERED'].groupby('File')['Column'].count()
        summary['discovered_extras_summary'] = discovered_by_file.to_dict()
        
        return summary
    
    def print_colored_results(self, df_results: pd.DataFrame, summary: Dict):
        """Print results with color coding (if in Jupyter/IPython)."""
        try:
            from IPython.display import display, HTML
            
            # Style the dataframe
            def style_status(val):
                if val == 'OK':
                    return 'background-color: #90EE90'
                elif val == 'ISSUE':
                    return 'background-color: #FFB6C1'
                elif val == 'MISSING FILE':
                    return 'background-color: #FF6B6B'
                elif val == 'EXTRA':
                    return 'background-color: #87CEEB'
                return ''
            
            def style_check(val):
                if val == '✓':
                    return 'color: green; font-weight: bold'
                elif val == '✗':
                    return 'color: red; font-weight: bold'
                return ''
            
            def style_column_type(val):
                if val == 'DISCOVERED':
                    return 'color: blue; font-weight: bold'
                return ''
            
            styled_df = df_results.style.applymap(style_status, subset=['Status'])\
                                       .applymap(style_check, subset=['File_Exists', 'Column_Exists', 'Type_Match'])\
                                       .applymap(style_column_type, subset=['Column_Type'])
            
            display(styled_df)
            
        except ImportError:
            # Fallback to regular print
            print(df_results.to_string())
    
    def save_results(self, df_results: pd.DataFrame, summary: Dict):
        """Save all results to files."""
        output_dir = self.job_output_dir / 'surrogate_data_check'
        output_dir.mkdir(exist_ok=True)
        
        # Create timestamp for filenames
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # 1. Save detailed results
        df_results.to_csv(output_dir / f'structure_check_details_{timestamp}.csv', index=False)
        df_results.to_excel(output_dir / f'structure_check_details_{timestamp}.xlsx', index=False)
        
        # 2. Save summary
        with open(output_dir / f'structure_check_summary_{timestamp}.json', 'w') as f:
            json.dump(summary, f, indent=2)
        
        # 3. Save discovered columns detailed report
        df_discovered = self.create_discovered_columns_report()
        if not df_discovered.empty:
            df_discovered.to_csv(output_dir / f'discovered_columns_{timestamp}.csv', index=False)
            df_discovered.to_excel(output_dir / f'discovered_columns_{timestamp}.xlsx', index=False)
        
        # 4. Save discovered columns with full statistics - convert numpy types first
        discovered_columns_native = self.convert_to_native_types(self.discovered_columns)
        with open(output_dir / f'discovered_columns_full_{timestamp}.json', 'w') as f:
            json.dump(discovered_columns_native, f, indent=2)
        
        # 5. Create a markdown report
        self.create_markdown_report(output_dir / f'structure_report_{timestamp}.md', df_results, summary)
        
        # 6. Save only problematic entries for quick review
        df_issues = df_results[df_results['Status'].isin(['ISSUE', 'MISSING FILE'])]
        if not df_issues.empty:
            df_issues.to_csv(output_dir / f'issues_only_{timestamp}.csv', index=False)
        
        print(f"\nResults saved to: {output_dir}")
        print(f"Files created:")
        print(f"  - structure_check_details_{timestamp}.csv/xlsx")
        print(f"  - structure_check_summary_{timestamp}.json")
        print(f"  - discovered_columns_{timestamp}.csv/xlsx")
        print(f"  - discovered_columns_full_{timestamp}.json")
        print(f"  - structure_report_{timestamp}.md")
        if not df_issues.empty:
            print(f"  - issues_only_{timestamp}.csv")
    
    def create_markdown_report(self, output_path: Path, df_results: pd.DataFrame, summary: Dict):
        """Create a markdown report for easy reading."""
        with open(output_path, 'w') as f:
            f.write(f"# Surrogate Data Structure Check Report\n\n")
            f.write(f"**Generated:** {summary['check_timestamp']}\n\n")
            f.write(f"**Directory:** `{summary['job_output_dir']}`\n\n")
            
            f.write("## Summary\n\n")
            f.write(f"- **Files Expected:** {summary['total_files_expected']}\n")
            f.write(f"- **Files Found:** {summary['files_found']}\n")
            f.write(f"- **Files Missing:** {summary['files_missing']}\n")
            f.write(f"- **Expected Columns:** {summary['total_expected_columns']}\n")
            f.write(f"- **Expected Columns Found:** {summary['expected_columns_found']}\n")
            f.write(f"- **Extra Columns Discovered:** {summary['total_discovered_columns']}\n")
            f.write(f"- **Total Issues:** {summary['total_issues']}\n\n")
            
            if summary['missing_files']:
                f.write("## Missing Files\n\n")
                for file in summary['missing_files']:
                    f.write(f"- `{file}`\n")
                f.write("\n")
            
            if summary['discovered_extras_summary']:
                f.write("## Discovered Extra Columns Summary\n\n")
                f.write("| File | Extra Columns Count |\n")
                f.write("|------|--------------------|\n")
                for file, count in summary['discovered_extras_summary'].items():
                    f.write(f"| {file} | {count} |\n")
                f.write("\n")
            
            # Add detailed discovered columns
            df_discovered = self.create_discovered_columns_report()
            if not df_discovered.empty:
                f.write("## Discovered Columns Details\n\n")
                f.write(df_discovered.to_markdown(index=False))
                f.write("\n")

# Usage
def check_surrogate_data_structure(job_output_dir: str):
    """Main function to check surrogate data structure."""
    checker = SurrogateDataChecker(job_output_dir)
    
    # Run checks
    df_results = checker.check_all_files()
    summary = checker.create_summary_report(df_results)
    
    # Print summary
    print("\n" + "="*80)
    print("SUMMARY")
    print("="*80)
    print(f"Check performed at: {summary['check_timestamp']}")
    print(f"Total files expected: {summary['total_files_expected']}")
    print(f"Files found: {summary['files_found']}")
    print(f"Files missing: {summary['files_missing']}")
    print(f"Expected columns total: {summary['total_expected_columns']}")
    print(f"Expected columns found: {summary['expected_columns_found']}")
    print(f"Extra columns discovered: {summary['total_discovered_columns']}")
    print(f"Total issues: {summary['total_issues']}")
    
    if summary['missing_files']:
        print(f"\nMISSING FILES: {len(summary['missing_files'])}")
        for f in summary['missing_files'][:5]:  # Show first 5
            print(f"  - {f}")
        if len(summary['missing_files']) > 5:
            print(f"  ... and {len(summary['missing_files']) - 5} more")
    
    if summary['discovered_extras_summary']:
        print(f"\nDISCOVERED EXTRA COLUMNS BY FILE:")
        for file, count in list(summary['discovered_extras_summary'].items())[:10]:
            print(f"  - {file}: {count} extra columns")
    
    # Display results
    print("\n" + "="*80)
    print("DETAILED RESULTS (First 50 rows)")
    print("="*80)
    checker.print_colored_results(df_results.head(50), summary)
    
    # Show discovered columns
    df_discovered = checker.create_discovered_columns_report()
    if not df_discovered.empty:
        print("\n" + "="*80)
        print("DISCOVERED COLUMNS DETAILS (First 20)")
        print("="*80)
        print(df_discovered.head(20).to_string())
    
    # Save results
    checker.save_results(df_results, summary)
    
    return df_results, summary, df_discovered

# Run the check
if __name__ == "__main__":
    job_dir = r"D:\Documents\daily\E_Plus_2040_py\output\5f3924b1-189c-4c95-bf33-52d0602d79d9"
    df_results, summary, df_discovered = check_surrogate_data_structure(job_dir)
    
    # Additional analysis - show files by status
    print("\n" + "="*80)
    print("FILES BY STATUS")
    print("="*80)
    status_summary = df_results.groupby('Status')['Path'].nunique()
    print(status_summary)
    
    # Show column type distribution
    print("\n" + "="*80)
    print("COLUMN TYPE DISTRIBUTION")
    print("="*80)
    column_dist = df_results['Column_Type'].value_counts()
    print(column_dist)

Checking surrogate data structure in: D:\Documents\daily\E_Plus_2040_py\output\5f3924b1-189c-4c95-bf33-52d0602d79d9

Checking: parsed_data/idf_data/by_category/lighting.parquet
Checking: parsed_data/idf_data/by_category/hvac_equipment.parquet
Checking: parsed_data/idf_data/by_category/materials_materials.parquet
Checking: parsed_data/idf_data/by_category/infiltration.parquet
Checking: parsed_data/idf_data/by_category/ventilation.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/daily/zones_daily.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/daily/hvac_daily.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/daily/ventilation_daily.parquet
Checking: parsed_data/relationships/zone_mappings.parquet
Checking: parsed_data/metadata/building_registry.parquet
Checking: parsed_modified_results/sql_results/timeseries/aggregated/daily/zones_daily.parquet
Checking: modified_idfs/modifications_detail_*.parquet
Checking: sensitivity_results/sensitivity_fo

  styled_df = df_results.style.applymap(style_status, subset=['Status'])\


Unnamed: 0,File,Path,File_Exists,Rows,Column,Column_Type,Expected_Type,Actual_Type,Column_Exists,Type_Match,Null_Count,Unique_Values,Sample_Values,Status
0,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,building_id,Expected,str,str,✓,✓,0.0,3.0,"['4136733', '4136737', '4136738']",OK
1,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,zone_name,Expected,str,str,✓,✓,0.0,1.0,['ALL_ZONES'],OK
2,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,object_name,Expected,str,str,✓,✓,0.0,1.0,['Lights_ALL_ZONES'],OK
3,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,LightingLevel,Expected,float,,✗,✗,,,,ISSUE
4,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,FractionRadiant,Expected,float,,✗,✗,,,,ISSUE
5,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,FractionVisible,Expected,float,,✗,✗,,,,ISSUE
6,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,ScheduleName,Expected,str,,✗,✗,,,,ISSUE
7,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,object_type,DISCOVERED,,str,✓,,0.0,1.0,['LIGHTS'],EXTRA
8,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,name,DISCOVERED,,str,✓,,0.0,1.0,['Lights_ALL_ZONES'],EXTRA
9,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,zone_or_zonelist_name,DISCOVERED,,str,✓,,0.0,1.0,['ALL_ZONES'],EXTRA



DISCOVERED COLUMNS DETAILS (First 20)
                File                                               Path                                                               Column_Name Data_Type  Non_Null_Count  Unique_Count  Null_Percentage                              Sample_Values      Mean           Std  Min       Max
0   lighting.parquet  parsed_data/idf_data/by_category/lighting.parquet                                                               object_type       str               0             1              0.0                                     LIGHTS       NaN           NaN  NaN       NaN
1   lighting.parquet  parsed_data/idf_data/by_category/lighting.parquet                                                                      name       str               0             1              0.0                           Lights_ALL_ZONES       NaN           NaN  NaN       NaN
2   lighting.parquet  parsed_data/idf_data/by_category/lighting.parquet                                   

### V3

In [13]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
import json
from datetime import datetime

class SurrogateDataChecker:
    def __init__(self, job_output_dir: str):
        self.job_output_dir = Path(job_output_dir)
        self.results = []
        self.discovered_columns = {}  # Store discovered extra columns
        
        # Updated expected structure based on new pipeline
        self.expected_structure = {
            # IDF Data - Updated column names based on _map_column_names
            'parsed_data/idf_data/by_category/lighting.parquet': {
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'object_name': 'str',
                    'watts_per_zone_floor_area': 'float',  # Updated from LightingLevel
                    'fraction_radiant': 'float',  # Updated from FractionRadiant
                    'fraction_visible': 'float',  # Updated from FractionVisible
                    'schedule_name': 'str'  # Updated from ScheduleName
                }
            },
            'parsed_data/idf_data/by_category/hvac_equipment.parquet': {
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'object_name': 'str',
                    'object_type': 'str',
                    'maximum_heating_supply_air_temperature': 'float',  # Updated
                    'minimum_cooling_supply_air_temperature': 'float'   # Updated
                }
            },
            'parsed_data/idf_data/by_category/materials_materials.parquet': {
                'columns': {
                    'building_id': 'str',
                    'name': 'str',  # Updated from material_name
                    'object_name': 'str',  # Added based on mapping
                    'thickness': 'float',  # Updated from Thickness
                    'conductivity': 'float',  # Updated from Conductivity
                    'density': 'float',  # Updated from Density
                    'specific_heat': 'float'  # Updated from SpecificHeat
                }
            },
            'parsed_data/idf_data/by_category/infiltration.parquet': {
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'object_name': 'str',
                    'design_flow_rate_calculation_method': 'str',  # Updated
                    'design_flow_rate': 'float'  # Updated
                }
            },
            'parsed_data/idf_data/by_category/ventilation.parquet': {
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'object_name': 'str',
                    'outdoor_air_method': 'str',  # Updated
                    'outdoor_air_flow_per_person': 'float'  # Updated
                }
            },
            'parsed_data/idf_data/by_category/equipment.parquet': {  # Added
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'object_name': 'str',
                    'design_level': 'float',
                    'watts_per_zone_floor_area': 'float',
                    'schedule_name': 'str'
                }
            },
            'parsed_data/idf_data/by_category/dhw.parquet': {  # Added
                'columns': {
                    'building_id': 'str',
                    'object_name': 'str',
                    'heater_maximum_capacity': 'float',
                    'heater_thermal_efficiency': 'float',
                    'tank_volume': 'float'
                }
            },
            'parsed_data/idf_data/by_category/shading.parquet': {  # Added
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'shading_control_name': 'str',
                    'shading_type': 'str',
                    'shading_control_type': 'str'
                }
            },
            # SQL Results - Now supports multiple temporal resolutions
            'parsed_data/sql_results/timeseries/aggregated/daily/zones_daily.parquet': {
                'columns': {
                    'building_id': 'str',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime'  # May or may not exist
                }
            },
            'parsed_data/sql_results/timeseries/aggregated/daily/hvac_daily.parquet': {
                'columns': {
                    'building_id': 'str',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float'
                }
            },
            'parsed_data/sql_results/timeseries/aggregated/daily/ventilation_daily.parquet': {
                'columns': {
                    'building_id': 'str',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float'
                }
            },
            'parsed_data/sql_results/timeseries/aggregated/hourly/zones_hourly.parquet': {  # Added
                'columns': {
                    'building_id': 'str',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float'
                }
            },
            'parsed_data/sql_results/timeseries/aggregated/monthly/zones_monthly.parquet': {  # Added
                'columns': {
                    'building_id': 'str',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float'
                }
            },
            # Relationships
            'parsed_data/relationships/zone_mappings.parquet': {
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'sql_zone_name': 'str',
                    'zone_index': 'int',
                    'floor_area': 'float',
                    'volume': 'float'
                }
            },
            'parsed_data/relationships/equipment_assignments.parquet': {  # Added
                'columns': {
                    'building_id': 'str',
                    'zone_name': 'str',
                    'equipment_type': 'str',
                    'equipment_name': 'str',
                    'assignment_type': 'str'
                }
            },
            'parsed_data/metadata/building_registry.parquet': {
                'columns': {
                    'building_id': 'str',
                    'building_type': 'str',
                    'total_floor_area': 'float',
                    'num_zones': 'int',
                    'conditioned_floor_area': 'float',  # Added
                    'total_volume': 'float'  # Added
                }
            },
            # Modified results - Updated to handle variant_id better
            'parsed_modified_results/sql_results/timeseries/aggregated/daily/zones_daily.parquet': {
                'columns': {
                    'building_id': 'str',
                    'original_building_id': 'str',  # Added - extracted from composite ID
                    'variant_id': 'str',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float'
                }
            },
            'parsed_modified_results/sql_results/timeseries/aggregated/daily/hvac_daily.parquet': {
                'columns': {
                    'building_id': 'str',
                    'original_building_id': 'str',
                    'variant_id': 'str',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float'
                }
            },
            'parsed_modified_results/sql_results/timeseries/aggregated/daily/ventilation_daily.parquet': {
                'columns': {
                    'building_id': 'str',
                    'original_building_id': 'str',
                    'variant_id': 'str',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float'
                }
            },
            # Modified IDF data
            'parsed_modified_results/idf_data/by_category/lighting.parquet': {
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'zone_name': 'str',
                    'object_name': 'str',
                    'watts_per_zone_floor_area': 'float',
                    'fraction_radiant': 'float',
                    'fraction_visible': 'float'
                }
            },
            # Modifications tracking - Updated structure
            'modified_idfs/modifications_detail_*.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'variant_id': 'str',
                    'category': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'field_name': 'str',
                    'original_value': ['str', 'float'],
                    'new_value': ['str', 'float'],
                    'relative_change': 'float',
                    'param_id': 'str'  # Added - category*object_type*object_name*field_name
                }
            },
            # Sensitivity - Multiple possible files
            'sensitivity_results/sensitivity_for_surrogate.parquet': {
                'columns': {
                    'parameter': 'str',
                    'sensitivity_score': 'float',
                    'elasticity': 'float',
                    'p_value': 'float',
                    'confidence_level': 'str',  # Added
                    'rank': 'float'  # Added
                }
            },
            'sensitivity_results/modification_sensitivity_results.parquet': {  # Alternative
                'columns': {
                    'parameter': 'str',
                    'sensitivity_score': 'float',
                    'elasticity': 'float',
                    'output_variable': 'str',  # Added for multi-output
                    'aggregation_level': 'str'  # Added - building/zone
                }
            },
            # Validation results - New
            'validation_results/validation_summary.parquet': {
                'columns': {
                    'building_id': 'str',
                    'variable': 'str',
                    'cvrmse': 'float',
                    'nmbe': 'float',
                    'r2': 'float',
                    'mae': 'float'
                }
            },
            # Surrogate pipeline outputs - New
            'surrogate_pipeline_export/*/1_inputs/extracted_modifications.parquet': {
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'param_id': 'str',
                    'relative_change': 'float'
                }
            },
            'surrogate_pipeline_export/*/3_preprocessing/preprocessed_features.parquet': {
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    # Dynamic feature columns based on modifications
                }
            },
            'surrogate_pipeline_export/*/3_preprocessing/preprocessed_targets.parquet': {
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    # Target columns like:
                    # 'Heating_EnergyTransfer_J_Hourly_total': 'float',
                    # 'Cooling_EnergyTransfer_J_Hourly_total': 'float',
                    # 'Electricity_Facility_J_Hourly_total': 'float'
                }
            }
        }
    
    def convert_to_native_types(self, obj: Any) -> Any:
        """Convert numpy types to native Python types for JSON serialization."""
        if isinstance(obj, dict):
            return {k: self.convert_to_native_types(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self.convert_to_native_types(item) for item in obj]
        elif isinstance(obj, tuple):
            return tuple(self.convert_to_native_types(item) for item in obj)
        elif isinstance(obj, pd.Timestamp):
            # Convert pandas Timestamp to ISO format string
            return obj.isoformat()
        elif isinstance(obj, datetime):
            # Convert datetime to ISO format string
            return obj.isoformat()
        elif isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            if np.isnan(obj):
                return None
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif pd.isna(obj):
            return None
        else:
            return obj
    
    def check_file_exists(self, file_path: str) -> Tuple[bool, Optional[str]]:
        """Check if file exists and return its absolute path."""
        full_path = self.job_output_dir / file_path
        
        # Also check with wildcards for files like modifications_detail_*.parquet
        if '*' in file_path:
            parent = full_path.parent
            pattern = full_path.name
            if parent.exists():
                matching_files = list(parent.glob(pattern))
                if matching_files:
                    return True, str(matching_files[0])
            return False, None
        
        return full_path.exists(), str(full_path) if full_path.exists() else None
    
    def get_dtype_string(self, dtype) -> str:
        """Convert numpy/pandas dtype to simple string."""
        dtype_str = str(dtype)
        
        if 'int' in dtype_str:
            return 'int'
        elif 'float' in dtype_str:
            return 'float'
        elif 'object' in dtype_str or 'string' in dtype_str:
            return 'str'
        elif 'datetime' in dtype_str:
            return 'datetime'
        elif 'bool' in dtype_str:
            return 'bool'
        else:
            return dtype_str
    
    def get_sample_values(self, series: pd.Series, n_samples: int = 5) -> List:
        """Get sample values from a series."""
        unique_vals = series.dropna().unique()
        if len(unique_vals) <= n_samples:
            return [self.convert_to_native_types(val) for val in unique_vals]
        else:
            # Get a mix of values
            samples = []
            if len(unique_vals) > 0:
                # Add min/max for numeric
                if pd.api.types.is_numeric_dtype(series):
                    samples.append(self.convert_to_native_types(series.min()))
                    samples.append(self.convert_to_native_types(series.max()))
                    # Add some random samples
                    remaining = n_samples - 2
                    if remaining > 0:
                        random_samples = series.dropna().sample(n=min(remaining, len(series))).tolist()
                        samples.extend([self.convert_to_native_types(val) for val in random_samples[:remaining]])
                else:
                    # For non-numeric, just take first n_samples
                    samples = [self.convert_to_native_types(val) for val in unique_vals[:n_samples]]
            return samples
    
    def check_parquet_file(self, file_path: str, expected_columns: Dict[str, any]) -> Dict:
        """Check a parquet file's structure."""
        exists, abs_path = self.check_file_exists(file_path)
        
        result = {
            'file': file_path,
            'exists': exists,
            'path': abs_path,
            'row_count': 0,
            'columns': {},
            'extra_columns': {}  # Store discovered columns
        }
        
        if exists and abs_path:
            try:
                # Read parquet file
                df = pd.read_parquet(abs_path)
                result['row_count'] = len(df)
                
                # Get actual columns and types
                actual_columns = {}
                for col in df.columns:
                    actual_columns[col] = self.get_dtype_string(df[col].dtype)
                
                # Store discovered columns info for this file
                if file_path not in self.discovered_columns:
                    self.discovered_columns[file_path] = {}
                
                # Compare with expected
                for exp_col, exp_type in expected_columns.items():
                    if exp_col in actual_columns:
                        actual_type = actual_columns[exp_col]
                        
                        # Handle multiple expected types
                        if isinstance(exp_type, list):
                            type_match = actual_type in exp_type
                        else:
                            type_match = (actual_type == exp_type) or \
                                       (exp_type == 'str' and actual_type == 'object')
                        
                        result['columns'][exp_col] = {
                            'expected_type': exp_type,
                            'actual_type': actual_type,
                            'exists': True,
                            'type_match': type_match,
                            'null_count': int(df[exp_col].isnull().sum()),
                            'unique_count': int(df[exp_col].nunique()),
                            'sample_values': self.get_sample_values(df[exp_col])
                        }
                    else:
                        result['columns'][exp_col] = {
                            'expected_type': exp_type,
                            'actual_type': None,
                            'exists': False,
                            'type_match': False,
                            'null_count': None,
                            'unique_count': None,
                            'sample_values': []
                        }
                
                # Add unexpected columns (discovered extras)
                for col in actual_columns:
                    if col not in expected_columns:
                        result['extra_columns'][col] = {
                            'actual_type': actual_columns[col],
                            'null_count': int(df[col].isnull().sum()),
                            'unique_count': int(df[col].nunique()),
                            'sample_values': self.get_sample_values(df[col]),
                            'stats': self.get_column_stats(df[col])
                        }
                        # Store in discovered columns
                        self.discovered_columns[file_path][col] = result['extra_columns'][col]
                
            except Exception as e:
                result['error'] = str(e)
        
        return result
    
    def get_column_stats(self, series: pd.Series) -> Dict:
        """Get statistics for a column."""
        stats = {
            'total_count': int(len(series)),
            'non_null_count': int(series.count()),
            'null_percentage': float((series.isnull().sum() / len(series) * 100) if len(series) > 0 else 0)
        }
        
        if pd.api.types.is_numeric_dtype(series):
            # Handle potential NaN values
            if series.count() > 0:  # If there are non-null values
                stats.update({
                    'mean': self.convert_to_native_types(series.mean()),
                    'std': self.convert_to_native_types(series.std()),
                    'min': self.convert_to_native_types(series.min()),
                    'max': self.convert_to_native_types(series.max()),
                    'q25': self.convert_to_native_types(series.quantile(0.25)),
                    'q50': self.convert_to_native_types(series.quantile(0.50)),
                    'q75': self.convert_to_native_types(series.quantile(0.75))
                })
            else:
                # All values are null
                stats.update({
                    'mean': None,
                    'std': None,
                    'min': None,
                    'max': None,
                    'q25': None,
                    'q50': None,
                    'q75': None
                })
        elif pd.api.types.is_string_dtype(series) or series.dtype == 'object':
            value_counts = series.value_counts()
            stats.update({
                'unique_values': int(len(value_counts)),
                'most_common': {str(k): int(v) for k, v in value_counts.head(5).items()} if len(value_counts) > 0 else {}
            })
        
        return stats
    
    def check_all_files(self) -> pd.DataFrame:
        """Check all expected files and create comparison table."""
        print(f"Checking surrogate data structure in: {self.job_output_dir}\n")
        
        all_results = []
        
        for file_path, expected_info in self.expected_structure.items():
            print(f"Checking: {file_path}")
            result = self.check_parquet_file(file_path, expected_info['columns'])
            
            # Create rows for expected columns
            if result['exists']:
                for col_name, col_info in result['columns'].items():
                    all_results.append({
                        'File': file_path.split('/')[-1],
                        'Path': file_path,
                        'File_Exists': '✓',
                        'Rows': result['row_count'],
                        'Column': col_name,
                        'Column_Type': 'Expected',
                        'Expected_Type': col_info['expected_type'],
                        'Actual_Type': col_info['actual_type'],
                        'Column_Exists': '✓' if col_info['exists'] else '✗',
                        'Type_Match': '✓' if col_info['type_match'] else '✗',
                        'Null_Count': col_info['null_count'],
                        'Unique_Values': col_info['unique_count'],
                        'Sample_Values': str(col_info['sample_values'][:3]) if col_info['sample_values'] else 'N/A',
                        'Status': 'OK' if col_info['exists'] and col_info['type_match'] else 'ISSUE'
                    })
                
                # Add rows for extra discovered columns
                for col_name, col_info in result['extra_columns'].items():
                    all_results.append({
                        'File': file_path.split('/')[-1],
                        'Path': file_path,
                        'File_Exists': '✓',
                        'Rows': result['row_count'],
                        'Column': col_name,
                        'Column_Type': 'DISCOVERED',
                        'Expected_Type': 'N/A',
                        'Actual_Type': col_info['actual_type'],
                        'Column_Exists': '✓',
                        'Type_Match': 'N/A',
                        'Null_Count': col_info['null_count'],
                        'Unique_Values': col_info['unique_count'],
                        'Sample_Values': str(col_info['sample_values'][:3]) if col_info['sample_values'] else 'N/A',
                        'Status': 'EXTRA'
                    })
            else:
                all_results.append({
                    'File': file_path.split('/')[-1],
                    'Path': file_path,
                    'File_Exists': '✗',
                    'Rows': 0,
                    'Column': 'N/A',
                    'Column_Type': 'N/A',
                    'Expected_Type': 'N/A',
                    'Actual_Type': 'N/A',
                    'Column_Exists': 'N/A',
                    'Type_Match': 'N/A',
                    'Null_Count': None,
                    'Unique_Values': None,
                    'Sample_Values': 'N/A',
                    'Status': 'MISSING FILE'
                })
        
        return pd.DataFrame(all_results)
    
    def create_discovered_columns_report(self) -> pd.DataFrame:
        """Create a detailed report of all discovered extra columns."""
        discovered_data = []
        
        for file_path, columns in self.discovered_columns.items():
            for col_name, col_info in columns.items():
                row = {
                    'File': file_path.split('/')[-1],
                    'Path': file_path,
                    'Column_Name': col_name,
                    'Data_Type': col_info['actual_type'],
                    'Non_Null_Count': col_info['null_count'],
                    'Unique_Count': col_info['unique_count'],
                    'Null_Percentage': col_info['stats']['null_percentage'],
                    'Sample_Values': ', '.join(map(str, col_info['sample_values'][:5]))
                }
                
                # Add numeric stats if available
                if 'mean' in col_info['stats']:
                    row.update({
                        'Mean': col_info['stats']['mean'],
                        'Std': col_info['stats']['std'],
                        'Min': col_info['stats']['min'],
                        'Max': col_info['stats']['max']
                    })
                
                discovered_data.append(row)
        
        return pd.DataFrame(discovered_data)
    
    def create_summary_report(self, df_results: pd.DataFrame) -> Dict:
        """Create a summary report of the check results."""
        summary = {
            'check_timestamp': datetime.now().isoformat(),
            'job_output_dir': str(self.job_output_dir),
            'total_files_expected': len(self.expected_structure),
            'files_found': len(df_results[df_results['File_Exists'] == '✓']['Path'].unique()),
            'files_missing': len(df_results[df_results['File_Exists'] == '✗']['Path'].unique()),
            'total_expected_columns': len(df_results[df_results['Column_Type'] == 'Expected']),
            'expected_columns_found': len(df_results[(df_results['Column_Type'] == 'Expected') & 
                                                    (df_results['Column_Exists'] == '✓')]),
            'total_discovered_columns': len(df_results[df_results['Column_Type'] == 'DISCOVERED']),
            'total_issues': len(df_results[df_results['Status'].isin(['ISSUE', 'MISSING FILE'])]),
            'missing_files': [],
            'column_issues': [],
            'type_mismatches': [],
            'discovered_extras_summary': {},
            'data_pipeline_readiness': {  # New section
                'has_base_outputs': False,
                'has_modified_outputs': False,
                'has_modifications': False,
                'has_sensitivity': False,
                'has_zone_mappings': False,
                'ready_for_surrogate': False
            }
        }
        
        # Get missing files
        missing_files = df_results[df_results['File_Exists'] == '✗']['Path'].unique()
        summary['missing_files'] = list(missing_files)
        
        # Get column issues
        column_issues = df_results[(df_results['Column_Exists'] == '✗') & 
                                 (df_results['File_Exists'] == '✓') & 
                                 (df_results['Column_Type'] == 'Expected')]
        for _, row in column_issues.iterrows():
            summary['column_issues'].append(f"{row['File']}: {row['Column']}")
        
        # Get type mismatches
        type_issues = df_results[(df_results['Type_Match'] == '✗') & 
                               (df_results['Column_Exists'] == '✓') & 
                               (df_results['Column_Type'] == 'Expected')]
        for _, row in type_issues.iterrows():
            summary['type_mismatches'].append(
                f"{row['File']}: {row['Column']} (expected {row['Expected_Type']}, got {row['Actual_Type']})"
            )
        
        # Summary of discovered columns by file
        discovered_by_file = df_results[df_results['Column_Type'] == 'DISCOVERED'].groupby('File')['Column'].count()
        summary['discovered_extras_summary'] = discovered_by_file.to_dict()
        
        # Check data pipeline readiness
        files_exist = df_results[df_results['File_Exists'] == '✓']['Path'].unique()
        summary['data_pipeline_readiness']['has_base_outputs'] = any('parsed_data/sql_results' in f for f in files_exist)
        summary['data_pipeline_readiness']['has_modified_outputs'] = any('parsed_modified_results/sql_results' in f for f in files_exist)
        summary['data_pipeline_readiness']['has_modifications'] = any('modifications_detail' in f for f in files_exist)
        summary['data_pipeline_readiness']['has_sensitivity'] = any('sensitivity_results' in f for f in files_exist)
        summary['data_pipeline_readiness']['has_zone_mappings'] = any('zone_mappings' in f for f in files_exist)
        
        # Check if ready for surrogate modeling
        summary['data_pipeline_readiness']['ready_for_surrogate'] = (
            summary['data_pipeline_readiness']['has_base_outputs'] and
            summary['data_pipeline_readiness']['has_modified_outputs'] and
            summary['data_pipeline_readiness']['has_modifications']
        )
        
        return summary
    
    def print_colored_results(self, df_results: pd.DataFrame, summary: Dict):
        """Print results with color coding (if in Jupyter/IPython)."""
        try:
            from IPython.display import display, HTML
            
            # Style the dataframe
            def style_status(val):
                if val == 'OK':
                    return 'background-color: #90EE90'
                elif val == 'ISSUE':
                    return 'background-color: #FFB6C1'
                elif val == 'MISSING FILE':
                    return 'background-color: #FF6B6B'
                elif val == 'EXTRA':
                    return 'background-color: #87CEEB'
                return ''
            
            def style_check(val):
                if val == '✓':
                    return 'color: green; font-weight: bold'
                elif val == '✗':
                    return 'color: red; font-weight: bold'
                return ''
            
            def style_column_type(val):
                if val == 'DISCOVERED':
                    return 'color: blue; font-weight: bold'
                return ''
            
            styled_df = df_results.style.applymap(style_status, subset=['Status'])\
                                       .applymap(style_check, subset=['File_Exists', 'Column_Exists', 'Type_Match'])\
                                       .applymap(style_column_type, subset=['Column_Type'])
            
            display(styled_df)
            
        except ImportError:
            # Fallback to regular print
            print(df_results.to_string())
    
    def save_results(self, df_results: pd.DataFrame, summary: Dict):
        """Save all results to files."""
        output_dir = self.job_output_dir / 'surrogate_data_check'
        output_dir.mkdir(exist_ok=True)
        
        # Create timestamp for filenames
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # 1. Save detailed results
        df_results.to_csv(output_dir / f'structure_check_details_{timestamp}.csv', index=False)
        df_results.to_excel(output_dir / f'structure_check_details_{timestamp}.xlsx', index=False)
        
        # 2. Save summary
        with open(output_dir / f'structure_check_summary_{timestamp}.json', 'w') as f:
            json.dump(summary, f, indent=2)
        
        # 3. Save discovered columns detailed report
        df_discovered = self.create_discovered_columns_report()
        if not df_discovered.empty:
            df_discovered.to_csv(output_dir / f'discovered_columns_{timestamp}.csv', index=False)
            df_discovered.to_excel(output_dir / f'discovered_columns_{timestamp}.xlsx', index=False)
        
        # 4. Save discovered columns with full statistics - convert numpy types first
        discovered_columns_native = self.convert_to_native_types(self.discovered_columns)
        with open(output_dir / f'discovered_columns_full_{timestamp}.json', 'w') as f:
            json.dump(discovered_columns_native, f, indent=2)
        
        # 5. Create a markdown report
        self.create_markdown_report(output_dir / f'structure_report_{timestamp}.md', df_results, summary)
        
        # 6. Save only problematic entries for quick review
        df_issues = df_results[df_results['Status'].isin(['ISSUE', 'MISSING FILE'])]
        if not df_issues.empty:
            df_issues.to_csv(output_dir / f'issues_only_{timestamp}.csv', index=False)
        
        print(f"\nResults saved to: {output_dir}")
        print(f"Files created:")
        print(f"  - structure_check_details_{timestamp}.csv/xlsx")
        print(f"  - structure_check_summary_{timestamp}.json")
        print(f"  - discovered_columns_{timestamp}.csv/xlsx")
        print(f"  - discovered_columns_full_{timestamp}.json")
        print(f"  - structure_report_{timestamp}.md")
        if not df_issues.empty:
            print(f"  - issues_only_{timestamp}.csv")
    
    def create_markdown_report(self, output_path: Path, df_results: pd.DataFrame, summary: Dict):
        """Create a markdown report for easy reading."""
        with open(output_path, 'w') as f:
            f.write(f"# Surrogate Data Structure Check Report\n\n")
            f.write(f"**Generated:** {summary['check_timestamp']}\n\n")
            f.write(f"**Directory:** `{summary['job_output_dir']}`\n\n")
            
            f.write("## Summary\n\n")
            f.write(f"- **Files Expected:** {summary['total_files_expected']}\n")
            f.write(f"- **Files Found:** {summary['files_found']}\n")
            f.write(f"- **Files Missing:** {summary['files_missing']}\n")
            f.write(f"- **Expected Columns:** {summary['total_expected_columns']}\n")
            f.write(f"- **Expected Columns Found:** {summary['expected_columns_found']}\n")
            f.write(f"- **Extra Columns Discovered:** {summary['total_discovered_columns']}\n")
            f.write(f"- **Total Issues:** {summary['total_issues']}\n\n")
            
            # Add pipeline readiness section
            f.write("## Data Pipeline Readiness\n\n")
            readiness = summary['data_pipeline_readiness']
            f.write(f"- **Base Outputs:** {'✓' if readiness['has_base_outputs'] else '✗'}\n")
            f.write(f"- **Modified Outputs:** {'✓' if readiness['has_modified_outputs'] else '✗'}\n")
            f.write(f"- **Modifications:** {'✓' if readiness['has_modifications'] else '✗'}\n")
            f.write(f"- **Sensitivity Analysis:** {'✓' if readiness['has_sensitivity'] else '✗'}\n")
            f.write(f"- **Zone Mappings:** {'✓' if readiness['has_zone_mappings'] else '✗'}\n")
            f.write(f"- **Ready for Surrogate Modeling:** {'✓' if readiness['ready_for_surrogate'] else '✗'}\n\n")
            
            if summary['missing_files']:
                f.write("## Missing Files\n\n")
                for file in summary['missing_files']:
                    f.write(f"- `{file}`\n")
                f.write("\n")
            
            if summary['discovered_extras_summary']:
                f.write("## Discovered Extra Columns Summary\n\n")
                f.write("| File | Extra Columns Count |\n")
                f.write("|------|--------------------|\n")
                for file, count in summary['discovered_extras_summary'].items():
                    f.write(f"| {file} | {count} |\n")
                f.write("\n")
            
            # Add detailed discovered columns
            df_discovered = self.create_discovered_columns_report()
            if not df_discovered.empty:
                f.write("## Discovered Columns Details\n\n")
                f.write(df_discovered.to_markdown(index=False))
                f.write("\n")

# Usage
def check_surrogate_data_structure(job_output_dir: str):
    """Main function to check surrogate data structure."""
    checker = SurrogateDataChecker(job_output_dir)
    
    # Run checks
    df_results = checker.check_all_files()
    summary = checker.create_summary_report(df_results)
    
    # Print summary
    print("\n" + "="*80)
    print("SUMMARY")
    print("="*80)
    print(f"Check performed at: {summary['check_timestamp']}")
    print(f"Total files expected: {summary['total_files_expected']}")
    print(f"Files found: {summary['files_found']}")
    print(f"Files missing: {summary['files_missing']}")
    print(f"Expected columns total: {summary['total_expected_columns']}")
    print(f"Expected columns found: {summary['expected_columns_found']}")
    print(f"Extra columns discovered: {summary['total_discovered_columns']}")
    print(f"Total issues: {summary['total_issues']}")
    
    # Print pipeline readiness
    print("\nDATA PIPELINE READINESS:")
    readiness = summary['data_pipeline_readiness']
    print(f"  - Base Outputs: {'✓' if readiness['has_base_outputs'] else '✗'}")
    print(f"  - Modified Outputs: {'✓' if readiness['has_modified_outputs'] else '✗'}")
    print(f"  - Modifications: {'✓' if readiness['has_modifications'] else '✗'}")
    print(f"  - Sensitivity Analysis: {'✓' if readiness['has_sensitivity'] else '✗'}")
    print(f"  - Zone Mappings: {'✓' if readiness['has_zone_mappings'] else '✗'}")
    print(f"  - Ready for Surrogate: {'✓ YES' if readiness['ready_for_surrogate'] else '✗ NO'}")
    
    if summary['missing_files']:
        print(f"\nMISSING FILES: {len(summary['missing_files'])}")
        for f in summary['missing_files'][:5]:  # Show first 5
            print(f"  - {f}")
        if len(summary['missing_files']) > 5:
            print(f"  ... and {len(summary['missing_files']) - 5} more")
    
    if summary['discovered_extras_summary']:
        print(f"\nDISCOVERED EXTRA COLUMNS BY FILE:")
        for file, count in list(summary['discovered_extras_summary'].items())[:10]:
            print(f"  - {file}: {count} extra columns")
    
    # Display results
    print("\n" + "="*80)
    print("DETAILED RESULTS (First 50 rows)")
    print("="*80)
    checker.print_colored_results(df_results.head(50), summary)
    
    # Show discovered columns
    df_discovered = checker.create_discovered_columns_report()
    if not df_discovered.empty:
        print("\n" + "="*80)
        print("DISCOVERED COLUMNS DETAILS (First 20)")
        print("="*80)
        print(df_discovered.head(20).to_string())
    
    # Save results
    checker.save_results(df_results, summary)
    
    return df_results, summary, df_discovered

# Run the check
if __name__ == "__main__":
    job_dir = r"D:\Documents\daily\E_Plus_2040_py\output\82e2b83f-5013-4270-8266-a37a67dbd4ff"
    df_results, summary, df_discovered = check_surrogate_data_structure(job_dir)
    
    # Additional analysis - show files by status
    print("\n" + "="*80)
    print("FILES BY STATUS")
    print("="*80)
    status_summary = df_results.groupby('Status')['Path'].nunique()
    print(status_summary)
    
    # Show column type distribution
    print("\n" + "="*80)
    print("COLUMN TYPE DISTRIBUTION")
    print("="*80)
    column_dist = df_results['Column_Type'].value_counts()
    print(column_dist)

Checking surrogate data structure in: D:\Documents\daily\E_Plus_2040_py\output\82e2b83f-5013-4270-8266-a37a67dbd4ff

Checking: parsed_data/idf_data/by_category/lighting.parquet
Checking: parsed_data/idf_data/by_category/hvac_equipment.parquet
Checking: parsed_data/idf_data/by_category/materials_materials.parquet
Checking: parsed_data/idf_data/by_category/infiltration.parquet
Checking: parsed_data/idf_data/by_category/ventilation.parquet
Checking: parsed_data/idf_data/by_category/equipment.parquet
Checking: parsed_data/idf_data/by_category/dhw.parquet
Checking: parsed_data/idf_data/by_category/shading.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/daily/zones_daily.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/daily/hvac_daily.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/daily/ventilation_daily.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/hourly/zones_hourly.parquet
Checking: parsed_data/sql_results/timeseries/aggr

Unnamed: 0,File,Path,File_Exists,Rows,Column,Column_Type,Expected_Type,Actual_Type,Column_Exists,Type_Match,Null_Count,Unique_Values,Sample_Values,Status
0,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,building_id,Expected,str,str,✓,✓,0.0,3.0,"['4136733', '4136737', '4136738']",OK
1,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,zone_name,Expected,str,str,✓,✓,0.0,1.0,['ALL_ZONES'],OK
2,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,object_name,Expected,str,str,✓,✓,0.0,1.0,['Lights_ALL_ZONES'],OK
3,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,watts_per_zone_floor_area,Expected,float,str,✓,✗,0.0,3.0,"['0', '4.092524414409712', '4.980486110210785']",ISSUE
4,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,fraction_radiant,Expected,float,str,✓,✗,0.0,1.0,['0.1'],ISSUE
5,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,fraction_visible,Expected,float,str,✓,✗,0.0,1.0,['0.1'],ISSUE
6,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,schedule_name,Expected,str,str,✓,✓,0.0,1.0,['LightsSchedule'],OK
7,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,object_type,DISCOVERED,,str,✓,,0.0,1.0,['LIGHTS'],EXTRA
8,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,name,DISCOVERED,,str,✓,,0.0,1.0,['Lights_ALL_ZONES'],EXTRA
9,lighting.parquet,parsed_data/idf_data/by_category/lighting.parquet,✓,3,zone_or_zonelist_name,DISCOVERED,,str,✓,,0.0,1.0,['ALL_ZONES'],EXTRA



DISCOVERED COLUMNS DETAILS (First 20)
                      File                                                     Path                                                               Column_Name Data_Type  Non_Null_Count  Unique_Count  Null_Percentage                                                                                                                                                 Sample_Values      Mean           Std  Min       Max
0         lighting.parquet        parsed_data/idf_data/by_category/lighting.parquet                                                               object_type       str               0             1         0.000000                                                                                                                                                        LIGHTS       NaN           NaN  NaN       NaN
1         lighting.parquet        parsed_data/idf_data/by_category/lighting.parquet                                                      

# Sensitivity

## check v1

In [5]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
import json
from datetime import datetime
import glob

class SensitivityDataChecker:
    def __init__(self, job_output_dir: str):
        self.job_output_dir = Path(job_output_dir)
        self.results = []
        self.discovered_columns = {}
        
        # Expected structure for sensitivity analysis pipeline
        self.expected_structure = {
            # === MODIFICATION TRACKING ===
            'modified_idfs/modifications_detail_*.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'category': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'field_name': 'str',
                    'original_value': ['str', 'float'],
                    'new_value': ['str', 'float'],
                    'original_value_numeric': 'float',
                    'new_value_numeric': 'float',
                    'param_delta': 'float',
                    'param_pct_change': 'float',
                    'param_key': 'str'  # category*object_type*object_name*field_name
                },
                'required_for': ['modification_based']
            },
            
            # === BASE SIMULATION RESULTS ===
            'parsed_data/sql_results/timeseries/aggregated/daily/hvac_daily.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime',
                    'Zone': 'str'  # Optional
                },
                'required_for': ['all']
            },
            'parsed_data/sql_results/timeseries/aggregated/daily/energy_daily.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime'
                },
                'required_for': ['all']
            },
            'parsed_data/sql_results/timeseries/aggregated/daily/electricity_daily.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime'
                },
                'required_for': ['all']
            },
            'parsed_data/sql_results/timeseries/aggregated/daily/zones_daily.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime'
                },
                'required_for': ['zone_level', 'multi_level']
            },
            'parsed_data/sql_results/timeseries/aggregated/daily/temperature_daily.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime'
                },
                'required_for': ['zone_level']
            },
            'parsed_data/sql_results/timeseries/aggregated/daily/ventilation_daily.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime'
                },
                'required_for': ['zone_level']
            },
            
            # === HOURLY DATA (for time-of-day analysis) ===
            'parsed_data/sql_results/timeseries/hourly/hvac_2013.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime',
                    'Zone': 'str'
                },
                'required_for': ['time_slicing_hourly']
            },
            
            # === MODIFIED SIMULATION RESULTS ===
            'parsed_modified_results/sql_results/timeseries/aggregated/daily/hvac_daily.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime',
                    'Zone': 'str'
                },
                'required_for': ['modification_based']
            },
            'parsed_modified_results/sql_results/timeseries/aggregated/daily/energy_daily.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime'
                },
                'required_for': ['modification_based']
            },
            'parsed_modified_results/sql_results/timeseries/aggregated/daily/electricity_daily.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime'
                },
                'required_for': ['modification_based']
            },
            'parsed_modified_results/sql_results/timeseries/aggregated/daily/zones_daily.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float',
                    'DateTime': 'datetime'
                },
                'required_for': ['modification_based_zone_level']
            },
            
            # === RELATIONSHIPS FOR MULTI-LEVEL ===
            'parsed_data/relationships/zone_mappings.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'idf_zone_name': 'str',
                    'sql_zone_name': 'str',
                    'zone_type': 'str',
                    'multiplier': 'float'
                },
                'required_for': ['zone_level', 'multi_level']
            },
            'parsed_data/relationships/equipment_assignments.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'equipment_name': 'str',
                    'equipment_type': 'str',
                    'assigned_zone': 'str',
                    'schedule': 'str'
                },
                'required_for': ['equipment_level', 'multi_level']
            },
            
            # === TRADITIONAL ANALYSIS INPUTS ===
            'parsed_data/idf_data/by_category/*.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'category': 'str',
                    'field': 'str',
                    'value': ['str', 'float'],
                    'value_numeric': 'float'
                },
                'required_for': ['traditional']
            },
            'parsed_data/analysis_ready/parameter_matrix.parquet': {
                'columns': {
                    'building_id': ['str', 'int']
                    # Dynamic columns based on parameters
                },
                'required_for': ['traditional_optimized']
            },
            
            # === BUILDING METADATA ===
            'parsed_data/metadata/building_registry.parquet': {
                'columns': {
                    'building_id': ['str', 'int'],
                    'building_type': 'str',
                    'total_floor_area': 'float',
                    'num_zones': 'int'
                },
                'required_for': ['all']
            },
            
            # === SENSITIVITY OUTPUTS ===
            'sensitivity_results/modification_sensitivity_results.parquet': {
                'columns': {
                    'parameter': 'str',
                    'output_variable': 'str',
                    'sensitivity_score': 'float',
                    'method': 'str',
                    'level': 'str',  # building/zone/equipment
                    'p_value': 'float',
                    'confidence_lower': 'float',
                    'confidence_upper': 'float',
                    'category': 'str',
                    'elasticity': 'float',
                    'n_samples': 'int'
                },
                'output_from': ['modification_based']
            },
            'sensitivity_results/traditional_sensitivity_results.parquet': {
                'columns': {
                    'parameter': 'str',
                    'output_variable': 'str',
                    'sensitivity_score': 'float',
                    'method': 'str',
                    'correlation': 'float',
                    'p_value': 'float',
                    'n_samples': 'int'
                },
                'output_from': ['traditional']
            },
            'sensitivity_results/hybrid_sensitivity_results.parquet': {
                'columns': {
                    'parameter': 'str',
                    'output_variable': 'str',
                    'sensitivity_score': 'float',
                    'analysis_source': 'str',
                    'consensus_score': 'float',
                    'score_std': 'float',
                    'n_methods': 'int'
                },
                'output_from': ['hybrid']
            },
            
            # === TIME SLICE OUTPUTS ===
            'sensitivity_results/parameter_stability_across_time_slices.parquet': {
                'columns': {
                    'parameter': 'str',
                    'mean_score': 'float',
                    'std_score': 'float',
                    'cv': 'float',
                    'present_in_slices': 'str',
                    'n_slices': 'int'
                },
                'output_from': ['time_slicing']
            },
            'sensitivity_results/*_sensitivity_results_*.parquet': {
                'columns': {
                    'parameter': 'str',
                    'output_variable': 'str',
                    'sensitivity_score': 'float',
                    'time_slice': 'str'
                },
                'output_from': ['time_slicing']
            },
            
            # === ADVANCED ANALYSIS OUTPUTS ===
            'sensitivity_results/uncertainty_analysis_results.parquet': {
                'columns': {
                    'parameter': 'str',
                    'output_variable': 'str',
                    'sensitivity_score': 'float',
                    'uncertainty_lower': 'float',
                    'uncertainty_upper': 'float',
                    'uncertainty_std': 'float',
                    'confidence_level': 'float'
                },
                'output_from': ['uncertainty']
            },
            'sensitivity_results/threshold_analysis_results.parquet': {
                'columns': {
                    'parameter': 'str',
                    'output_variable': 'str',
                    'breakpoint_value': 'float',
                    'segment_index': 'int',
                    'segment_start': 'float',
                    'segment_end': 'float',
                    'segment_sensitivity': 'float',
                    'is_critical_region': 'bool'
                },
                'output_from': ['threshold']
            },
            'sensitivity_results/regional_sensitivity_results.parquet': {
                'columns': {
                    'parameter': 'str',
                    'output_variable': 'str',
                    'sensitivity_score': 'float',
                    'region_id': 'int',
                    'parameter_mean': 'float',
                    'parameter_std': 'float',
                    'local_correlation': 'float',
                    'local_nonlinearity': 'float'
                },
                'output_from': ['regional']
            },
            'sensitivity_results/sobol_analysis_results.parquet': {
                'columns': {
                    'parameter': 'str',
                    'output_variable': 'str',
                    'sobol_index': 'float',
                    'sobol_type': 'str',  # first_order/second_order/total
                    'total_effect': 'float',
                    'confidence_lower': 'float',
                    'confidence_upper': 'float'
                },
                'output_from': ['sobol']
            },
            'sensitivity_results/temporal_pattern_results.parquet': {
                'columns': {
                    'parameter': 'str',
                    'output_variable': 'str',
                    'method': 'str',
                    'dominant_frequency': 'float',
                    'dominant_period': 'float',
                    'has_seasonality': 'bool',
                    'seasonal_period': 'float',
                    'trend_slope': 'float'
                },
                'output_from': ['temporal']
            },
            
            # === EXPORT FILES ===
            'sensitivity_results/top_sensitive_parameters.csv': {
                'columns': {
                    'parameter': 'str',
                    'sensitivity_score': 'float',
                    'analysis_method': 'str',
                    'category': 'str',
                    'object_name': 'str',
                    'field_name': 'str'
                },
                'output_from': ['export_for_surrogate']
            },
            'sensitivity_results/calibration_parameters.json': {
                'columns': {},  # JSON file
                'output_from': ['export_for_calibration']
            },
            
            # === REPORTS ===
            'sensitivity_results/modification_sensitivity_report.json': {
                'columns': {},  # JSON report
                'output_from': ['modification_based']
            },
            'sensitivity_results/sensitivity_summary.json': {
                'columns': {},  # JSON summary
                'output_from': ['all']
            },
            'sensitivity_results/time_slice_sensitivity_summary.json': {
                'columns': {},  # JSON summary
                'output_from': ['time_slicing']
            },
            'sensitivity_results/advanced_sensitivity_report.json': {
                'columns': {},  # JSON report
                'output_from': ['advanced']
            }
        }
    
    def convert_to_native_types(self, obj: Any) -> Any:
        """Convert numpy types to native Python types for JSON serialization."""
        if isinstance(obj, dict):
            return {k: self.convert_to_native_types(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self.convert_to_native_types(item) for item in obj]
        elif isinstance(obj, tuple):
            return tuple(self.convert_to_native_types(item) for item in obj)
        elif isinstance(obj, pd.Timestamp):
            return obj.isoformat()
        elif isinstance(obj, datetime):
            return obj.isoformat()
        elif isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            if np.isnan(obj):
                return None
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif pd.isna(obj):
            return None
        else:
            return obj
    
    def check_file_exists(self, file_path: str) -> Tuple[bool, Optional[str]]:
        """Check if file exists and return its absolute path."""
        full_path = self.job_output_dir / file_path
        
        # Handle wildcards
        if '*' in file_path:
            parent = full_path.parent
            pattern = full_path.name
            if parent.exists():
                matching_files = list(parent.glob(pattern))
                if matching_files:
                    return True, str(matching_files[0])
            return False, None
        
        return full_path.exists(), str(full_path) if full_path.exists() else None
    
    def get_dtype_string(self, dtype) -> str:
        """Convert numpy/pandas dtype to simple string."""
        dtype_str = str(dtype)
        
        if 'int' in dtype_str:
            return 'int'
        elif 'float' in dtype_str:
            return 'float'
        elif 'object' in dtype_str or 'string' in dtype_str:
            return 'str'
        elif 'datetime' in dtype_str:
            return 'datetime'
        elif 'bool' in dtype_str:
            return 'bool'
        else:
            return dtype_str
    
    def get_sample_values(self, series: pd.Series, n_samples: int = 5) -> List:
        """Get sample values from a series."""
        unique_vals = series.dropna().unique()
        if len(unique_vals) <= n_samples:
            return [self.convert_to_native_types(val) for val in unique_vals]
        else:
            samples = []
            if pd.api.types.is_numeric_dtype(series):
                samples.append(self.convert_to_native_types(series.min()))
                samples.append(self.convert_to_native_types(series.max()))
                remaining = n_samples - 2
                if remaining > 0:
                    random_samples = series.dropna().sample(n=min(remaining, len(series))).tolist()
                    samples.extend([self.convert_to_native_types(val) for val in random_samples[:remaining]])
            else:
                samples = [self.convert_to_native_types(val) for val in unique_vals[:n_samples]]
            return samples
    
    def check_parquet_file(self, file_path: str, expected_info: Dict) -> Dict:
        """Check a parquet file's structure."""
        exists, abs_path = self.check_file_exists(file_path)
        expected_columns = expected_info.get('columns', {})
        
        result = {
            'file': file_path,
            'exists': exists,
            'path': abs_path,
            'row_count': 0,
            'columns': {},
            'extra_columns': {},
            'required_for': expected_info.get('required_for', []),
            'output_from': expected_info.get('output_from', [])
        }
        
        if exists and abs_path:
            try:
                df = pd.read_parquet(abs_path)
                result['row_count'] = len(df)
                
                # Get actual columns and types
                actual_columns = {}
                for col in df.columns:
                    actual_columns[col] = self.get_dtype_string(df[col].dtype)
                
                # Store discovered columns
                if file_path not in self.discovered_columns:
                    self.discovered_columns[file_path] = {}
                
                # Compare with expected
                for exp_col, exp_type in expected_columns.items():
                    if exp_col in actual_columns:
                        actual_type = actual_columns[exp_col]
                        
                        if isinstance(exp_type, list):
                            type_match = actual_type in exp_type
                        else:
                            type_match = (actual_type == exp_type) or \
                                       (exp_type == 'str' and actual_type == 'object')
                        
                        result['columns'][exp_col] = {
                            'expected_type': exp_type,
                            'actual_type': actual_type,
                            'exists': True,
                            'type_match': type_match,
                            'null_count': int(df[exp_col].isnull().sum()),
                            'unique_count': int(df[exp_col].nunique()),
                            'sample_values': self.get_sample_values(df[exp_col])
                        }
                    else:
                        result['columns'][exp_col] = {
                            'expected_type': exp_type,
                            'actual_type': None,
                            'exists': False,
                            'type_match': False,
                            'null_count': None,
                            'unique_count': None,
                            'sample_values': []
                        }
                
                # Find extra columns
                for col in actual_columns:
                    if col not in expected_columns:
                        result['extra_columns'][col] = {
                            'actual_type': actual_columns[col],
                            'null_count': int(df[col].isnull().sum()),
                            'unique_count': int(df[col].nunique()),
                            'sample_values': self.get_sample_values(df[col]),
                            'stats': self.get_column_stats(df[col])
                        }
                        self.discovered_columns[file_path][col] = result['extra_columns'][col]
                
                # Special checks for sensitivity results
                if 'sensitivity_score' in df.columns:
                    result['sensitivity_range'] = {
                        'min': float(df['sensitivity_score'].min()),
                        'max': float(df['sensitivity_score'].max()),
                        'mean': float(df['sensitivity_score'].mean()),
                        'std': float(df['sensitivity_score'].std())
                    }
                
                if 'parameter' in df.columns:
                    result['n_parameters'] = int(df['parameter'].nunique())
                    result['top_parameters'] = df.nlargest(5, 'sensitivity_score')[['parameter', 'sensitivity_score']].to_dict('records') if 'sensitivity_score' in df.columns else []
                
            except Exception as e:
                result['error'] = str(e)
        
        return result
    
    def check_json_file(self, file_path: str) -> Dict:
        """Check a JSON file's structure."""
        exists, abs_path = self.check_file_exists(file_path)
        
        result = {
            'file': file_path,
            'exists': exists,
            'path': abs_path,
            'content_type': 'json'
        }
        
        if exists and abs_path:
            try:
                with open(abs_path, 'r') as f:
                    content = json.load(f)
                
                result['keys'] = list(content.keys()) if isinstance(content, dict) else ['list_content']
                result['size_bytes'] = os.path.getsize(abs_path)
                
            except Exception as e:
                result['error'] = str(e)
        
        return result
    
    def get_column_stats(self, series: pd.Series) -> Dict:
        """Get statistics for a column."""
        stats = {
            'total_count': int(len(series)),
            'non_null_count': int(series.count()),
            'null_percentage': float((series.isnull().sum() / len(series) * 100) if len(series) > 0 else 0)
        }
        
        if pd.api.types.is_numeric_dtype(series):
            if series.count() > 0:
                stats.update({
                    'mean': self.convert_to_native_types(series.mean()),
                    'std': self.convert_to_native_types(series.std()),
                    'min': self.convert_to_native_types(series.min()),
                    'max': self.convert_to_native_types(series.max()),
                    'q25': self.convert_to_native_types(series.quantile(0.25)),
                    'q50': self.convert_to_native_types(series.quantile(0.50)),
                    'q75': self.convert_to_native_types(series.quantile(0.75))
                })
        elif pd.api.types.is_string_dtype(series) or series.dtype == 'object':
            value_counts = series.value_counts()
            stats.update({
                'unique_values': int(len(value_counts)),
                'most_common': {str(k): int(v) for k, v in value_counts.head(5).items()} if len(value_counts) > 0 else {}
            })
        
        return stats
    
    def check_sensitivity_configuration(self) -> Dict:
        """Check what type of sensitivity analysis was configured based on available files."""
        config_check = {
            'analysis_types': [],
            'multi_level': False,
            'time_slicing': False,
            'advanced_analysis': {
                'uncertainty': False,
                'threshold': False,
                'regional': False,
                'sobol': False,
                'temporal': False
            }
        }
        
        # Check for modification-based
        if self.check_file_exists('modified_idfs/modifications_detail_*.parquet')[0]:
            config_check['analysis_types'].append('modification_based')
        
        # Check for traditional
        if self.check_file_exists('parsed_data/idf_data/by_category/*.parquet')[0]:
            config_check['analysis_types'].append('traditional')
        
        # Check for hybrid
        if self.check_file_exists('sensitivity_results/hybrid_sensitivity_results.parquet')[0]:
            config_check['analysis_types'].append('hybrid')
        
        # Check for multi-level
        if (self.check_file_exists('parsed_data/relationships/zone_mappings.parquet')[0] and
            self.check_file_exists('parsed_data/sql_results/timeseries/aggregated/daily/zones_daily.parquet')[0]):
            config_check['multi_level'] = True
        
        # Check for time slicing
        if (self.check_file_exists('sensitivity_results/parameter_stability_across_time_slices.parquet')[0] or
            glob.glob(str(self.job_output_dir / 'sensitivity_results/*_sensitivity_results_*.parquet'))):
            config_check['time_slicing'] = True
        
        # Check advanced analyses
        advanced_files = {
            'uncertainty': 'uncertainty_analysis_results.parquet',
            'threshold': 'threshold_analysis_results.parquet',
            'regional': 'regional_sensitivity_results.parquet',
            'sobol': 'sobol_analysis_results.parquet',
            'temporal': 'temporal_pattern_results.parquet'
        }
        
        for analysis, filename in advanced_files.items():
            if self.check_file_exists(f'sensitivity_results/{filename}')[0]:
                config_check['advanced_analysis'][analysis] = True
        
        return config_check
    
    def check_all_files(self) -> pd.DataFrame:
        """Check all expected files and create comparison table."""
        print(f"Checking sensitivity data structure in: {self.job_output_dir}\n")
        
        all_results = []
        
        for file_path, expected_info in self.expected_structure.items():
            print(f"Checking: {file_path}")
            
            if file_path.endswith('.json'):
                result = self.check_json_file(file_path)
                all_results.append({
                    'File': file_path.split('/')[-1],
                    'Path': file_path,
                    'File_Exists': '✓' if result['exists'] else '✗',
                    'File_Type': 'JSON',
                    'Status': 'OK' if result['exists'] else 'MISSING',
                    'Required_For': ', '.join(expected_info.get('required_for', [])),
                    'Output_From': ', '.join(expected_info.get('output_from', []))
                })
            else:
                result = self.check_parquet_file(file_path, expected_info)
                
                if result['exists']:
                    # Add rows for expected columns
                    for col_name, col_info in result['columns'].items():
                        all_results.append({
                            'File': file_path.split('/')[-1],
                            'Path': file_path,
                            'File_Exists': '✓',
                            'File_Type': 'Parquet',
                            'Rows': result['row_count'],
                            'Column': col_name,
                            'Column_Type': 'Expected',
                            'Expected_Type': col_info['expected_type'],
                            'Actual_Type': col_info['actual_type'],
                            'Column_Exists': '✓' if col_info['exists'] else '✗',
                            'Type_Match': '✓' if col_info['type_match'] else '✗',
                            'Null_Count': col_info['null_count'],
                            'Unique_Values': col_info['unique_count'],
                            'Sample_Values': str(col_info['sample_values'][:3]) if col_info['sample_values'] else 'N/A',
                            'Status': 'OK' if col_info['exists'] and col_info['type_match'] else 'ISSUE',
                            'Required_For': ', '.join(result.get('required_for', [])),
                            'Output_From': ', '.join(result.get('output_from', []))
                        })
                    
                    # Add rows for extra columns
                    for col_name, col_info in result['extra_columns'].items():
                        all_results.append({
                            'File': file_path.split('/')[-1],
                            'Path': file_path,
                            'File_Exists': '✓',
                            'File_Type': 'Parquet',
                            'Rows': result['row_count'],
                            'Column': col_name,
                            'Column_Type': 'DISCOVERED',
                            'Expected_Type': 'N/A',
                            'Actual_Type': col_info['actual_type'],
                            'Column_Exists': '✓',
                            'Type_Match': 'N/A',
                            'Null_Count': col_info['null_count'],
                            'Unique_Values': col_info['unique_count'],
                            'Sample_Values': str(col_info['sample_values'][:3]) if col_info['sample_values'] else 'N/A',
                            'Status': 'EXTRA',
                            'Required_For': 'N/A',
                            'Output_From': 'N/A'
                        })
                else:
                    all_results.append({
                        'File': file_path.split('/')[-1],
                        'Path': file_path,
                        'File_Exists': '✗',
                        'File_Type': 'Parquet',
                        'Rows': 0,
                        'Column': 'N/A',
                        'Column_Type': 'N/A',
                        'Expected_Type': 'N/A',
                        'Actual_Type': 'N/A',
                        'Column_Exists': 'N/A',
                        'Type_Match': 'N/A',
                        'Status': 'MISSING FILE',
                        'Required_For': ', '.join(result.get('required_for', [])),
                        'Output_From': ', '.join(result.get('output_from', []))
                    })
        
        return pd.DataFrame(all_results)
    
    def create_summary_report(self, df_results: pd.DataFrame) -> Dict:
        """Create a summary report of the check results."""
        config_check = self.check_sensitivity_configuration()
        
        summary = {
            'check_timestamp': datetime.now().isoformat(),
            'job_output_dir': str(self.job_output_dir),
            'configuration_detected': config_check,
            'total_files_expected': len(self.expected_structure),
            'files_found': len(df_results[df_results['File_Exists'] == '✓']['Path'].unique()),
            'files_missing': len(df_results[df_results['File_Exists'] == '✗']['Path'].unique()),
            'total_expected_columns': len(df_results[df_results['Column_Type'] == 'Expected']),
            'expected_columns_found': len(df_results[(df_results['Column_Type'] == 'Expected') & 
                                                    (df_results['Column_Exists'] == '✓')]),
            'total_discovered_columns': len(df_results[df_results['Column_Type'] == 'DISCOVERED']),
            'total_issues': len(df_results[df_results['Status'].isin(['ISSUE', 'MISSING FILE'])]),
            'missing_files': [],
            'column_issues': [],
            'type_mismatches': [],
            'pipeline_readiness': {
                'modification_based': {
                    'has_modifications': False,
                    'has_base_results': False,
                    'has_modified_results': False,
                    'has_output_files': False,
                    'ready': False
                },
                'traditional': {
                    'has_parameters': False,
                    'has_base_results': False,
                    'has_output_files': False,
                    'ready': False
                },
                'multi_level': {
                    'has_zone_mappings': False,
                    'has_zone_results': False,
                    'has_equipment_assignments': False,
                    'ready': False
                },
                'time_slicing': {
                    'has_hourly_data': False,
                    'has_time_slice_outputs': False,
                    'ready': False
                },
                'advanced': {
                    'uncertainty': False,
                    'threshold': False,
                    'regional': False,
                    'sobol': False,
                    'temporal': False,
                    'any_ready': False
                }
            }
        }
        
        # Get missing files
        missing_files = df_results[df_results['File_Exists'] == '✗']['Path'].unique()
        summary['missing_files'] = list(missing_files)
        
        # Get column issues
        column_issues = df_results[(df_results['Column_Exists'] == '✗') & 
                                 (df_results['File_Exists'] == '✓') & 
                                 (df_results['Column_Type'] == 'Expected')]
        for _, row in column_issues.iterrows():
            summary['column_issues'].append(f"{row['File']}: {row['Column']}")
        
        # Get type mismatches
        type_issues = df_results[(df_results['Type_Match'] == '✗') & 
                               (df_results['Column_Exists'] == '✓') & 
                               (df_results['Column_Type'] == 'Expected')]
        for _, row in type_issues.iterrows():
            summary['type_mismatches'].append(
                f"{row['File']}: {row['Column']} (expected {row['Expected_Type']}, got {row['Actual_Type']})"
            )
        
        # Check pipeline readiness
        files_exist = df_results[df_results['File_Exists'] == '✓']['Path'].unique()
        
        # Modification-based readiness
        summary['pipeline_readiness']['modification_based']['has_modifications'] = any('modifications_detail' in f for f in files_exist)
        summary['pipeline_readiness']['modification_based']['has_base_results'] = any('parsed_data/sql_results' in f for f in files_exist)
        summary['pipeline_readiness']['modification_based']['has_modified_results'] = any('parsed_modified_results/sql_results' in f for f in files_exist)
        summary['pipeline_readiness']['modification_based']['has_output_files'] = any('modification_sensitivity_results' in f for f in files_exist)
        summary['pipeline_readiness']['modification_based']['ready'] = all([
            summary['pipeline_readiness']['modification_based']['has_modifications'],
            summary['pipeline_readiness']['modification_based']['has_base_results'],
            summary['pipeline_readiness']['modification_based']['has_modified_results']
        ])
        
        # Traditional readiness
        summary['pipeline_readiness']['traditional']['has_parameters'] = any('idf_data/by_category' in f or 'parameter_matrix' in f for f in files_exist)
        summary['pipeline_readiness']['traditional']['has_base_results'] = summary['pipeline_readiness']['modification_based']['has_base_results']
        summary['pipeline_readiness']['traditional']['has_output_files'] = any('traditional_sensitivity_results' in f for f in files_exist)
        summary['pipeline_readiness']['traditional']['ready'] = all([
            summary['pipeline_readiness']['traditional']['has_parameters'],
            summary['pipeline_readiness']['traditional']['has_base_results']
        ])
        
        # Multi-level readiness
        summary['pipeline_readiness']['multi_level']['has_zone_mappings'] = any('zone_mappings' in f for f in files_exist)
        summary['pipeline_readiness']['multi_level']['has_zone_results'] = any('zones_daily' in f for f in files_exist)
        summary['pipeline_readiness']['multi_level']['has_equipment_assignments'] = any('equipment_assignments' in f for f in files_exist)
        summary['pipeline_readiness']['multi_level']['ready'] = all([
            summary['pipeline_readiness']['multi_level']['has_zone_mappings'],
            summary['pipeline_readiness']['multi_level']['has_zone_results']
        ])
        
        # Time slicing readiness
        summary['pipeline_readiness']['time_slicing']['has_hourly_data'] = any('hourly' in f for f in files_exist)
        summary['pipeline_readiness']['time_slicing']['has_time_slice_outputs'] = any('time_slice' in f or '_sensitivity_results_' in f for f in files_exist)
        summary['pipeline_readiness']['time_slicing']['ready'] = summary['pipeline_readiness']['time_slicing']['has_hourly_data']
        
        # Advanced analyses
        for analysis in ['uncertainty', 'threshold', 'regional', 'sobol', 'temporal']:
            summary['pipeline_readiness']['advanced'][analysis] = any(f'{analysis}_' in f for f in files_exist)
        summary['pipeline_readiness']['advanced']['any_ready'] = any(
            summary['pipeline_readiness']['advanced'][a] for a in ['uncertainty', 'threshold', 'regional', 'sobol', 'temporal']
        )
        
        return summary
    
    def create_markdown_report(self, output_path: Path, df_results: pd.DataFrame, summary: Dict):
        """Create a markdown report for easy reading."""
        with open(output_path, 'w') as f:
            f.write(f"# Sensitivity Analysis Data Structure Check Report\n\n")
            f.write(f"**Generated:** {summary['check_timestamp']}\n\n")
            f.write(f"**Directory:** `{summary['job_output_dir']}`\n\n")
            
            # Configuration detected
            f.write("## Configuration Detected\n\n")
            config = summary['configuration_detected']
            f.write(f"- **Analysis Types:** {', '.join(config['analysis_types']) if config['analysis_types'] else 'None detected'}\n")
            f.write(f"- **Multi-level Analysis:** {'✓' if config['multi_level'] else '✗'}\n")
            f.write(f"- **Time Slicing:** {'✓' if config['time_slicing'] else '✗'}\n")
            f.write(f"- **Advanced Analyses:**\n")
            for analysis, enabled in config['advanced_analysis'].items():
                f.write(f"  - {analysis.capitalize()}: {'✓' if enabled else '✗'}\n")
            f.write("\n")
            
            # Summary statistics
            f.write("## Summary\n\n")
            f.write(f"- **Files Expected:** {summary['total_files_expected']}\n")
            f.write(f"- **Files Found:** {summary['files_found']}\n")
            f.write(f"- **Files Missing:** {summary['files_missing']}\n")
            f.write(f"- **Expected Columns:** {summary['total_expected_columns']}\n")
            f.write(f"- **Expected Columns Found:** {summary['expected_columns_found']}\n")
            f.write(f"- **Extra Columns Discovered:** {summary['total_discovered_columns']}\n")
            f.write(f"- **Total Issues:** {summary['total_issues']}\n\n")
            
            # Pipeline readiness
            f.write("## Pipeline Readiness\n\n")
            
            f.write("### Modification-Based Analysis\n")
            mod_ready = summary['pipeline_readiness']['modification_based']
            f.write(f"- **Modifications:** {'✓' if mod_ready['has_modifications'] else '✗'}\n")
            f.write(f"- **Base Results:** {'✓' if mod_ready['has_base_results'] else '✗'}\n")
            f.write(f"- **Modified Results:** {'✓' if mod_ready['has_modified_results'] else '✗'}\n")
            f.write(f"- **Output Files:** {'✓' if mod_ready['has_output_files'] else '✗'}\n")
            f.write(f"- **Ready:** {'✓ YES' if mod_ready['ready'] else '✗ NO'}\n\n")
            
            f.write("### Traditional Analysis\n")
            trad_ready = summary['pipeline_readiness']['traditional']
            f.write(f"- **Parameters:** {'✓' if trad_ready['has_parameters'] else '✗'}\n")
            f.write(f"- **Base Results:** {'✓' if trad_ready['has_base_results'] else '✗'}\n")
            f.write(f"- **Output Files:** {'✓' if trad_ready['has_output_files'] else '✗'}\n")
            f.write(f"- **Ready:** {'✓ YES' if trad_ready['ready'] else '✗ NO'}\n\n")
            
            f.write("### Multi-Level Analysis\n")
            multi_ready = summary['pipeline_readiness']['multi_level']
            f.write(f"- **Zone Mappings:** {'✓' if multi_ready['has_zone_mappings'] else '✗'}\n")
            f.write(f"- **Zone Results:** {'✓' if multi_ready['has_zone_results'] else '✗'}\n")
            f.write(f"- **Equipment Assignments:** {'✓' if multi_ready['has_equipment_assignments'] else '✗'}\n")
            f.write(f"- **Ready:** {'✓ YES' if multi_ready['ready'] else '✗ NO'}\n\n")
            
            if summary['missing_files']:
                f.write("## Missing Files\n\n")
                for file in summary['missing_files']:
                    f.write(f"- `{file}`\n")
                f.write("\n")
    
    def save_results(self, df_results: pd.DataFrame, summary: Dict):
        """Save all results to files."""
        output_dir = self.job_output_dir / 'sensitivity_data_check'
        output_dir.mkdir(exist_ok=True)
        
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save detailed results
        df_results.to_csv(output_dir / f'structure_check_details_{timestamp}.csv', index=False)
        df_results.to_excel(output_dir / f'structure_check_details_{timestamp}.xlsx', index=False)
        
        # Save summary
        with open(output_dir / f'structure_check_summary_{timestamp}.json', 'w') as f:
            json.dump(summary, f, indent=2)
        
        # Create markdown report
        self.create_markdown_report(output_dir / f'structure_report_{timestamp}.md', df_results, summary)
        
        # Save issues only
        df_issues = df_results[df_results['Status'].isin(['ISSUE', 'MISSING FILE'])]
        if not df_issues.empty:
            df_issues.to_csv(output_dir / f'issues_only_{timestamp}.csv', index=False)
        
        print(f"\nResults saved to: {output_dir}")
        print(f"Files created:")
        print(f"  - structure_check_details_{timestamp}.csv/xlsx")
        print(f"  - structure_check_summary_{timestamp}.json")
        print(f"  - structure_report_{timestamp}.md")
        if not df_issues.empty:
            print(f"  - issues_only_{timestamp}.csv")

# Usage function
def check_sensitivity_data_structure(job_output_dir: str):
    """Main function to check sensitivity data structure."""
    checker = SensitivityDataChecker(job_output_dir)
    
    # Run checks
    df_results = checker.check_all_files()
    summary = checker.create_summary_report(df_results)
    
    # Print summary
    print("\n" + "="*80)
    print("CONFIGURATION DETECTED")
    print("="*80)
    config = summary['configuration_detected']
    print(f"Analysis Types: {', '.join(config['analysis_types']) if config['analysis_types'] else 'None'}")
    print(f"Multi-level: {'✓' if config['multi_level'] else '✗'}")
    print(f"Time Slicing: {'✓' if config['time_slicing'] else '✗'}")
    print(f"Advanced Analyses: {sum(config['advanced_analysis'].values())} enabled")
    
    print("\n" + "="*80)
    print("SUMMARY")
    print("="*80)
    print(f"Total files expected: {summary['total_files_expected']}")
    print(f"Files found: {summary['files_found']}")
    print(f"Files missing: {summary['files_missing']}")
    print(f"Total issues: {summary['total_issues']}")
    
    print("\nPIPELINE READINESS:")
    for pipeline, status in summary['pipeline_readiness'].items():
        if isinstance(status, dict) and 'ready' in status:
            print(f"  - {pipeline}: {'✓ READY' if status['ready'] else '✗ NOT READY'}")
    
    # Save results
    checker.save_results(df_results, summary)
    
    return df_results, summary

# Example usage
if __name__ == "__main__":
    # Replace with your actual job directory
    job_dir = r"D:\Documents\daily\E_Plus_2040_py\output\82e2b83f-5013-4270-8266-a37a67dbd4ff"
    df_results, summary = check_sensitivity_data_structure(job_dir)

Checking sensitivity data structure in: D:\Documents\daily\E_Plus_2040_py\output\82e2b83f-5013-4270-8266-a37a67dbd4ff

Checking: modified_idfs/modifications_detail_*.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/daily/hvac_daily.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/daily/energy_daily.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/daily/electricity_daily.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/daily/zones_daily.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/daily/temperature_daily.parquet
Checking: parsed_data/sql_results/timeseries/aggregated/daily/ventilation_daily.parquet
Checking: parsed_data/sql_results/timeseries/hourly/hvac_2013.parquet
Checking: parsed_modified_results/sql_results/timeseries/aggregated/daily/hvac_daily.parquet
Checking: parsed_modified_results/sql_results/timeseries/aggregated/daily/energy_daily.parquet
Checking: parsed_modified_results/sql_results/timeseries/a

## check v2

In [6]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
import json
from datetime import datetime
import glob
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

class EnhancedSensitivityDataChecker:
    def __init__(self, job_output_dir: str):
        self.job_output_dir = Path(job_output_dir)
        self.results = []
        self.discovered_columns = {}
        
        # Color codes for terminal output
        self.colors = {
            'green': '\033[92m',
            'red': '\033[91m',
            'yellow': '\033[93m',
            'blue': '\033[94m',
            'purple': '\033[95m',
            'cyan': '\033[96m',
            'bold': '\033[1m',
            'underline': '\033[4m',
            'end': '\033[0m'
        }
        
        # Categorized structure for better organization
        self.file_categories = {
            'INPUT_DATA': {
                'name': 'Input Data Files',
                'subcategories': {
                    'MODIFICATIONS': {
                        'name': 'Modification Tracking',
                        'files': ['modified_idfs/modifications_detail_*.parquet']
                    },
                    'BASE_RESULTS': {
                        'name': 'Base Simulation Results',
                        'files': [
                            'parsed_data/sql_results/timeseries/aggregated/daily/hvac_daily.parquet',
                            'parsed_data/sql_results/timeseries/aggregated/daily/energy_daily.parquet',
                            'parsed_data/sql_results/timeseries/aggregated/daily/electricity_daily.parquet',
                            'parsed_data/sql_results/timeseries/aggregated/daily/zones_daily.parquet',
                            'parsed_data/sql_results/timeseries/aggregated/daily/temperature_daily.parquet',
                            'parsed_data/sql_results/timeseries/aggregated/daily/ventilation_daily.parquet'
                        ]
                    },
                    'MODIFIED_RESULTS': {
                        'name': 'Modified Simulation Results',
                        'files': [
                            'parsed_modified_results/sql_results/timeseries/aggregated/daily/hvac_daily.parquet',
                            'parsed_modified_results/sql_results/timeseries/aggregated/daily/energy_daily.parquet',
                            'parsed_modified_results/sql_results/timeseries/aggregated/daily/electricity_daily.parquet',
                            'parsed_modified_results/sql_results/timeseries/aggregated/daily/zones_daily.parquet'
                        ]
                    },
                    'RELATIONSHIPS': {
                        'name': 'Zone/Equipment Relationships',
                        'files': [
                            'parsed_data/relationships/zone_mappings.parquet',
                            'parsed_data/relationships/equipment_assignments.parquet'
                        ]
                    },
                    'PARAMETERS': {
                        'name': 'Traditional Analysis Parameters',
                        'files': [
                            'parsed_data/idf_data/by_category/*.parquet',
                            'parsed_data/analysis_ready/parameter_matrix.parquet'
                        ]
                    },
                    'METADATA': {
                        'name': 'Building Metadata',
                        'files': ['parsed_data/metadata/building_registry.parquet']
                    }
                }
            },
            'OUTPUT_DATA': {
                'name': 'Sensitivity Analysis Outputs',
                'subcategories': {
                    'MAIN_RESULTS': {
                        'name': 'Main Sensitivity Results',
                        'files': [
                            'sensitivity_results/modification_sensitivity_results.parquet',
                            'sensitivity_results/traditional_sensitivity_results.parquet',
                            'sensitivity_results/hybrid_sensitivity_results.parquet'
                        ]
                    },
                    'TIME_SLICE_RESULTS': {
                        'name': 'Time Slice Results',
                        'files': [
                            'sensitivity_results/parameter_stability_across_time_slices.parquet',
                            'sensitivity_results/*_sensitivity_results_*.parquet'
                        ]
                    },
                    'ADVANCED_RESULTS': {
                        'name': 'Advanced Analysis Results',
                        'files': [
                            'sensitivity_results/uncertainty_analysis_results.parquet',
                            'sensitivity_results/threshold_analysis_results.parquet',
                            'sensitivity_results/regional_sensitivity_results.parquet',
                            'sensitivity_results/sobol_analysis_results.parquet',
                            'sensitivity_results/temporal_pattern_results.parquet'
                        ]
                    },
                    'EXPORTS': {
                        'name': 'Export Files',
                        'files': [
                            'sensitivity_results/top_sensitive_parameters.csv',
                            'sensitivity_results/calibration_parameters.json',
                            'sensitivity_results/sensitive_parameters_for_surrogate.json'
                        ]
                    },
                    'REPORTS': {
                        'name': 'Analysis Reports',
                        'files': [
                            'sensitivity_results/modification_sensitivity_report.json',
                            'sensitivity_results/sensitivity_summary.json',
                            'sensitivity_results/time_slice_sensitivity_summary.json',
                            'sensitivity_results/advanced_sensitivity_report.json'
                        ]
                    }
                }
            }
        }
        
        # Expected columns structure (simplified for reference)
        self.expected_columns = {
            'modifications': {
                'building_id': ['str', 'int'],
                'category': 'str',
                'object_type': 'str',
                'object_name': 'str',
                'field_name': 'str',
                'param_key': 'str',
                'param_delta': 'float',
                'param_pct_change': 'float'
            },
            'simulation_results': {
                'building_id': ['str', 'int'],
                'Variable': 'str',
                'Value': 'float',
                'DateTime': 'datetime',
                'Zone': 'str'
            },
            'sensitivity_results': {
                'parameter': 'str',
                'output_variable': 'str',
                'sensitivity_score': 'float',
                'method': 'str',
                'p_value': 'float',
                'category': 'str'
            }
        }
    
    def colored_text(self, text: str, color: str = 'end', bold: bool = False) -> str:
        """Return colored text for terminal output."""
        color_code = self.colors.get(color, '')
        bold_code = self.colors['bold'] if bold else ''
        return f"{bold_code}{color_code}{text}{self.colors['end']}"
    
    def check_file_exists(self, file_path: str) -> Tuple[bool, Optional[str], int]:
        """Check if file exists and return path and size."""
        full_path = self.job_output_dir / file_path
        
        if '*' in file_path:
            parent = full_path.parent
            pattern = full_path.name
            if parent.exists():
                matching_files = list(parent.glob(pattern))
                if matching_files:
                    file_size = sum(f.stat().st_size for f in matching_files)
                    return True, str(matching_files[0]), file_size
            return False, None, 0
        
        if full_path.exists():
            file_size = full_path.stat().st_size
            return True, str(full_path), file_size
        return False, None, 0
    
    def get_file_info(self, file_path: str) -> Dict:
        """Get comprehensive file information."""
        exists, abs_path, file_size = self.check_file_exists(file_path)
        
        info = {
            'path': file_path,
            'exists': exists,
            'abs_path': abs_path,
            'size_bytes': file_size,
            'size_mb': round(file_size / (1024 * 1024), 2) if file_size > 0 else 0
        }
        
        if exists and abs_path:
            if file_path.endswith('.parquet'):
                try:
                    df = pd.read_parquet(abs_path)
                    info.update({
                        'type': 'parquet',
                        'rows': len(df),
                        'columns': list(df.columns),
                        'column_count': len(df.columns),
                        'memory_usage_mb': round(df.memory_usage(deep=True).sum() / (1024 * 1024), 2),
                        'dtypes': {col: str(df[col].dtype) for col in df.columns}
                    })
                    
                    # Sample data
                    if len(df) > 0:
                        info['sample_data'] = df.head(3).to_dict('records')
                    
                    # Key column analysis
                    if 'sensitivity_score' in df.columns and len(df) > 0:
                        info['sensitivity_stats'] = {
                            'min': float(df['sensitivity_score'].min()),
                            'max': float(df['sensitivity_score'].max()),
                            'mean': float(df['sensitivity_score'].mean()),
                            'std': float(df['sensitivity_score'].std()),
                            'top_5_scores': df.nlargest(5, 'sensitivity_score')['sensitivity_score'].tolist()
                        }
                    
                    if 'parameter' in df.columns:
                        info['n_unique_parameters'] = df['parameter'].nunique()
                        info['top_5_parameters'] = df['parameter'].value_counts().head(5).to_dict()
                    
                except Exception as e:
                    info['error'] = str(e)
                    
            elif file_path.endswith('.json'):
                try:
                    with open(abs_path, 'r') as f:
                        content = json.load(f)
                    info.update({
                        'type': 'json',
                        'keys': list(content.keys()) if isinstance(content, dict) else ['list_content'],
                        'n_keys': len(content) if isinstance(content, dict) else 1
                    })
                except Exception as e:
                    info['error'] = str(e)
        
        return info
    
    def create_visual_summary(self) -> Dict:
        """Create a visual summary of the data structure."""
        summary = {
            'overview': {
                'total_categories': 0,
                'total_subcategories': 0,
                'total_files_expected': 0,
                'total_files_found': 0,
                'total_size_mb': 0,
                'total_rows': 0
            },
            'by_category': {},
            'missing_critical': [],
            'extra_files': [],
            'issues': []
        }
        
        # Check each category
        for cat_key, category in self.file_categories.items():
            cat_summary = {
                'name': category['name'],
                'subcategories': {},
                'total_expected': 0,
                'total_found': 0,
                'completeness': 0
            }
            
            for subcat_key, subcategory in category['subcategories'].items():
                subcat_summary = {
                    'name': subcategory['name'],
                    'files': [],
                    'expected': len(subcategory['files']),
                    'found': 0,
                    'missing': 0,
                    'total_size_mb': 0,
                    'total_rows': 0
                }
                
                for file_path in subcategory['files']:
                    file_info = self.get_file_info(file_path)
                    
                    file_summary = {
                        'path': file_path,
                        'exists': file_info['exists'],
                        'size_mb': file_info.get('size_mb', 0),
                        'rows': file_info.get('rows', 0),
                        'columns': file_info.get('column_count', 0)
                    }
                    
                    if file_info['exists']:
                        subcat_summary['found'] += 1
                        subcat_summary['total_size_mb'] += file_info.get('size_mb', 0)
                        subcat_summary['total_rows'] += file_info.get('rows', 0)
                        summary['overview']['total_files_found'] += 1
                        summary['overview']['total_size_mb'] += file_info.get('size_mb', 0)
                        summary['overview']['total_rows'] += file_info.get('rows', 0)
                    else:
                        subcat_summary['missing'] += 1
                        # Check if it's critical
                        if any(critical in file_path for critical in ['modifications_detail', 'hvac_daily', 'energy_daily']):
                            summary['missing_critical'].append(file_path)
                    
                    subcat_summary['files'].append(file_summary)
                
                subcat_summary['completeness'] = (subcat_summary['found'] / subcat_summary['expected'] * 100) if subcat_summary['expected'] > 0 else 0
                cat_summary['subcategories'][subcat_key] = subcat_summary
                cat_summary['total_expected'] += subcat_summary['expected']
                cat_summary['total_found'] += subcat_summary['found']
                
                summary['overview']['total_files_expected'] += subcat_summary['expected']
            
            cat_summary['completeness'] = (cat_summary['total_found'] / cat_summary['total_expected'] * 100) if cat_summary['total_expected'] > 0 else 0
            summary['by_category'][cat_key] = cat_summary
            summary['overview']['total_categories'] += 1
            summary['overview']['total_subcategories'] += len(category['subcategories'])
        
        summary['overview']['overall_completeness'] = (summary['overview']['total_files_found'] / summary['overview']['total_files_expected'] * 100) if summary['overview']['total_files_expected'] > 0 else 0
        
        return summary
    
    def print_visual_dashboard(self, summary: Dict):
        """Print a visual dashboard to the console."""
        print("\n" + "="*100)
        print(self.colored_text("SENSITIVITY DATA STRUCTURE DASHBOARD", 'cyan', bold=True).center(100))
        print("="*100)
        
        # Overview section
        overview = summary['overview']
        print(f"\n{self.colored_text('📊 OVERVIEW', 'blue', bold=True)}")
        print(f"  Total Files Expected: {overview['total_files_expected']}")
        print(f"  Total Files Found: {self.colored_text(str(overview['total_files_found']), 'green' if overview['total_files_found'] > 0 else 'red')}")
        print(f"  Overall Completeness: {self._get_completeness_bar(overview['overall_completeness'])}")
        print(f"  Total Data Size: {overview['total_size_mb']:.2f} MB")
        print(f"  Total Data Rows: {overview['total_rows']:,}")
        
        # Category breakdown
        print(f"\n{self.colored_text('📁 CATEGORY BREAKDOWN', 'blue', bold=True)}")
        
        for cat_key, cat_data in summary['by_category'].items():
            print(f"\n  {self.colored_text(cat_data['name'], 'yellow', bold=True)}")
            print(f"  {self._get_completeness_bar(cat_data['completeness'])}")
            print(f"  Files: {cat_data['total_found']}/{cat_data['total_expected']}")
            
            # Subcategories
            for subcat_key, subcat_data in cat_data['subcategories'].items():
                status_icon = "✅" if subcat_data['completeness'] == 100 else "⚠️" if subcat_data['completeness'] > 0 else "❌"
                print(f"\n    {status_icon} {subcat_data['name']}")
                print(f"       Files: {subcat_data['found']}/{subcat_data['expected']} | Size: {subcat_data['total_size_mb']:.2f} MB | Rows: {subcat_data['total_rows']:,}")
                
                # Show missing files
                for file_info in subcat_data['files']:
                    if not file_info['exists']:
                        print(f"       {self.colored_text('❌ Missing:', 'red')} {file_info['path'].split('/')[-1]}")
        
        # Critical missing files
        if summary['missing_critical']:
            print(f"\n{self.colored_text('🚨 CRITICAL MISSING FILES', 'red', bold=True)}")
            for file in summary['missing_critical']:
                print(f"  - {file}")
        
        print("\n" + "="*100)
    
    def _get_completeness_bar(self, percentage: float, width: int = 20) -> str:
        """Create a visual progress bar."""
        filled = int(width * percentage / 100)
        bar = "█" * filled + "░" * (width - filled)
        
        if percentage >= 80:
            color = 'green'
        elif percentage >= 50:
            color = 'yellow'
        else:
            color = 'red'
        
        return f"{self.colored_text(bar, color)} {percentage:.1f}%"
    
    def create_detailed_column_report(self) -> pd.DataFrame:
        """Create a detailed report of all columns across all files."""
        column_data = []
        
        for cat_key, category in self.file_categories.items():
            for subcat_key, subcategory in category['subcategories'].items():
                for file_path in subcategory['files']:
                    file_info = self.get_file_info(file_path)
                    
                    if file_info.get('exists') and file_info.get('type') == 'parquet':
                        for col in file_info.get('columns', []):
                            column_data.append({
                                'Category': category['name'],
                                'Subcategory': subcategory['name'],
                                'File': file_path.split('/')[-1],
                                'Column': col,
                                'Type': file_info['dtypes'].get(col, 'unknown'),
                                'Status': self._check_column_status(file_path, col)
                            })
        
        return pd.DataFrame(column_data)
    
    def _check_column_status(self, file_path: str, column: str) -> str:
        """Check if a column is expected, extra, or missing."""
        # Simplified logic - would need full expected columns mapping
        if 'sensitivity_score' in column or 'parameter' in column:
            return 'Expected'
        elif column in ['building_id', 'Variable', 'Value', 'DateTime', 'Zone']:
            return 'Expected'
        else:
            return 'Extra'
    
    def create_html_report(self, summary: Dict, output_path: Path):
        """Create an interactive HTML report."""
        html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>Sensitivity Data Structure Report</title>
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 1400px;
            margin: 0 auto;
            background-color: white;
            padding: 30px;
            box-shadow: 0 0 20px rgba(0,0,0,0.1);
            border-radius: 10px;
        }
        h1 {
            color: #2c3e50;
            text-align: center;
            margin-bottom: 30px;
        }
        .overview-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 20px;
            margin-bottom: 40px;
        }
        .metric-card {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 20px;
            border-radius: 10px;
            text-align: center;
            box-shadow: 0 5px 15px rgba(0,0,0,0.1);
        }
        .metric-value {
            font-size: 2.5em;
            font-weight: bold;
            margin: 10px 0;
        }
        .metric-label {
            font-size: 0.9em;
            opacity: 0.9;
        }
        .category-section {
            margin-bottom: 30px;
            background: #f8f9fa;
            padding: 20px;
            border-radius: 8px;
            border-left: 4px solid #667eea;
        }
        .category-header {
            font-size: 1.3em;
            font-weight: bold;
            color: #2c3e50;
            margin-bottom: 15px;
        }
        .subcategory {
            background: white;
            padding: 15px;
            margin: 10px 0;
            border-radius: 5px;
            box-shadow: 0 2px 5px rgba(0,0,0,0.05);
        }
        .progress-bar {
            width: 100%;
            height: 20px;
            background: #e0e0e0;
            border-radius: 10px;
            overflow: hidden;
            margin: 10px 0;
        }
        .progress-fill {
            height: 100%;
            background: linear-gradient(90deg, #4CAF50 0%, #45a049 100%);
            transition: width 0.3s ease;
        }
        .file-status {
            display: inline-block;
            padding: 3px 8px;
            border-radius: 3px;
            font-size: 0.85em;
            margin: 2px;
        }
        .status-found {
            background: #4CAF50;
            color: white;
        }
        .status-missing {
            background: #f44336;
            color: white;
        }
        .file-grid {
            display: grid;
            grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
            gap: 10px;
            margin-top: 10px;
        }
        .file-card {
            background: #f5f5f5;
            padding: 10px;
            border-radius: 5px;
            font-size: 0.9em;
        }
        .critical-missing {
            background: #ffebee;
            border-left: 4px solid #f44336;
            padding: 20px;
            margin: 20px 0;
            border-radius: 5px;
        }
        .legend {
            display: flex;
            justify-content: center;
            gap: 20px;
            margin: 20px 0;
            flex-wrap: wrap;
        }
        .legend-item {
            display: flex;
            align-items: center;
            gap: 5px;
        }
        .legend-color {
            width: 20px;
            height: 20px;
            border-radius: 3px;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>🔍 Sensitivity Data Structure Analysis</h1>
        <p style="text-align: center; color: #666;">Generated: """ + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + """</p>
        
        <div class="overview-grid">
            <div class="metric-card">
                <div class="metric-label">Total Files Expected</div>
                <div class="metric-value">""" + str(summary['overview']['total_files_expected']) + """</div>
            </div>
            <div class="metric-card" style="background: linear-gradient(135deg, #4CAF50 0%, #45a049 100%);">
                <div class="metric-label">Files Found</div>
                <div class="metric-value">""" + str(summary['overview']['total_files_found']) + """</div>
            </div>
            <div class="metric-card" style="background: linear-gradient(135deg, #ff9800 0%, #ff5722 100%);">
                <div class="metric-label">Overall Completeness</div>
                <div class="metric-value">""" + f"{summary['overview']['overall_completeness']:.1f}%" + """</div>
            </div>
            <div class="metric-card" style="background: linear-gradient(135deg, #2196F3 0%, #1976D2 100%);">
                <div class="metric-label">Total Data Size</div>
                <div class="metric-value">""" + f"{summary['overview']['total_size_mb']:.1f} MB" + """</div>
            </div>
        </div>
        
        <div class="legend">
            <div class="legend-item">
                <div class="legend-color" style="background: #4CAF50;"></div>
                <span>Found</span>
            </div>
            <div class="legend-item">
                <div class="legend-color" style="background: #f44336;"></div>
                <span>Missing</span>
            </div>
            <div class="legend-item">
                <div class="legend-color" style="background: #ff9800;"></div>
                <span>Partial</span>
            </div>
        </div>
"""
        
        # Add category sections
        for cat_key, cat_data in summary['by_category'].items():
            html_content += f"""
        <div class="category-section">
            <div class="category-header">{cat_data['name']}</div>
            <div class="progress-bar">
                <div class="progress-fill" style="width: {cat_data['completeness']}%;"></div>
            </div>
            <p>Files: {cat_data['total_found']}/{cat_data['total_expected']} ({cat_data['completeness']:.1f}% complete)</p>
"""
            
            for subcat_key, subcat_data in cat_data['subcategories'].items():
                status_color = '#4CAF50' if subcat_data['completeness'] == 100 else '#ff9800' if subcat_data['completeness'] > 0 else '#f44336'
                html_content += f"""
            <div class="subcategory">
                <h4 style="color: {status_color};">{subcat_data['name']}</h4>
                <div class="progress-bar">
                    <div class="progress-fill" style="width: {subcat_data['completeness']}%; background: {status_color};"></div>
                </div>
                <p>Files: {subcat_data['found']}/{subcat_data['expected']} | Size: {subcat_data['total_size_mb']:.2f} MB | Rows: {subcat_data['total_rows']:,}</p>
                <div class="file-grid">
"""
                
                for file_info in subcat_data['files']:
                    status = 'found' if file_info['exists'] else 'missing'
                    html_content += f"""
                    <div class="file-card">
                        <span class="file-status status-{status}">{'✓' if file_info['exists'] else '✗'}</span>
                        <strong>{file_info['path'].split('/')[-1]}</strong>
                        {f"<br>Size: {file_info['size_mb']:.2f} MB | Rows: {file_info['rows']:,}" if file_info['exists'] else ""}
                    </div>
"""
                
                html_content += """
                </div>
            </div>
"""
            
            html_content += """
        </div>
"""
        
        # Add critical missing files section if any
        if summary['missing_critical']:
            html_content += """
        <div class="critical-missing">
            <h3 style="color: #f44336; margin-top: 0;">🚨 Critical Missing Files</h3>
            <ul>
"""
            for file in summary['missing_critical']:
                html_content += f"            <li>{file}</li>\n"
            
            html_content += """
            </ul>
        </div>
"""
        
        html_content += """
    </div>
</body>
</html>
"""
        
        with open(output_path, 'w') as f:
            f.write(html_content)
    
    def create_comparison_matrix(self) -> pd.DataFrame:
        """Create a comparison matrix showing file availability across categories."""
        matrix_data = []
        
        for cat_key, category in self.file_categories.items():
            for subcat_key, subcategory in category['subcategories'].items():
                for file_path in subcategory['files']:
                    file_info = self.get_file_info(file_path)
                    
                    matrix_data.append({
                        'Category': category['name'],
                        'Subcategory': subcategory['name'],
                        'File': file_path.split('/')[-1],
                        'Status': '✅' if file_info['exists'] else '❌',
                        'Size (MB)': file_info.get('size_mb', 0) if file_info['exists'] else 'N/A',
                        'Rows': f"{file_info.get('rows', 0):,}" if file_info.get('rows') else 'N/A',
                        'Columns': file_info.get('column_count', 'N/A') if file_info.get('column_count') else 'N/A'
                    })
        
        return pd.DataFrame(matrix_data)
    
    def save_enhanced_results(self, summary: Dict, output_dir: Path):
        """Save enhanced results with multiple views."""
        output_dir.mkdir(exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # 1. Save summary JSON
        with open(output_dir / f'sensitivity_structure_summary_{timestamp}.json', 'w') as f:
            json.dump(summary, f, indent=2, default=str)
        
        # 2. Create and save comparison matrix
        matrix_df = self.create_comparison_matrix()
        matrix_df.to_csv(output_dir / f'file_comparison_matrix_{timestamp}.csv', index=False)
        matrix_df.to_excel(output_dir / f'file_comparison_matrix_{timestamp}.xlsx', index=False)
        
        # 3. Create and save detailed column report
        column_df = self.create_detailed_column_report()
        if not column_df.empty:
            column_df.to_csv(output_dir / f'column_analysis_{timestamp}.csv', index=False)
        
        # 4. Create HTML report
        self.create_html_report(summary, output_dir / f'visual_report_{timestamp}.html')
        
        # 5. Create missing files report
        missing_files = []
        for cat_key, cat_data in summary['by_category'].items():
            for subcat_key, subcat_data in cat_data['subcategories'].items():
                for file_info in subcat_data['files']:
                    if not file_info['exists']:
                        missing_files.append({
                            'Category': cat_data['name'],
                            'Subcategory': subcat_data['name'],
                            'File': file_info['path'],
                            'Priority': 'Critical' if any(critical in file_info['path'] for critical in ['modifications_detail', 'hvac_daily', 'energy_daily']) else 'Normal'
                        })
        
        if missing_files:
            pd.DataFrame(missing_files).to_csv(output_dir / f'missing_files_{timestamp}.csv', index=False)
        
        print(f"\n{self.colored_text('💾 Results saved to:', 'green', bold=True)} {output_dir}")
        print(f"  - Visual HTML Report: visual_report_{timestamp}.html")
        print(f"  - File Comparison Matrix: file_comparison_matrix_{timestamp}.csv")
        print(f"  - Structure Summary: sensitivity_structure_summary_{timestamp}.json")
        if missing_files:
            print(f"  - Missing Files List: missing_files_{timestamp}.csv")

# Enhanced usage function
def check_sensitivity_structure_enhanced(job_output_dir: str):
    """Enhanced sensitivity data structure checker with visual output."""
    checker = EnhancedSensitivityDataChecker(job_output_dir)
    
    # Create visual summary
    summary = checker.create_visual_summary()
    
    # Print visual dashboard
    checker.print_visual_dashboard(summary)
    
    # Detect configuration
    print(f"\n{checker.colored_text('🔧 DETECTED CONFIGURATION', 'purple', bold=True)}")
    
    # Analysis type detection
    analysis_types = []
    if any('modification' in str(cat) for cat in summary['by_category'].values()):
        if summary['by_category']['INPUT_DATA']['subcategories']['MODIFICATIONS']['found'] > 0:
            analysis_types.append('Modification-based')
    if summary['by_category']['INPUT_DATA']['subcategories']['PARAMETERS']['found'] > 0:
        analysis_types.append('Traditional')
    
    print(f"  Analysis Types: {', '.join(analysis_types) if analysis_types else 'None detected'}")
    
    # Feature detection
    features = []
    if summary['by_category']['INPUT_DATA']['subcategories']['RELATIONSHIPS']['found'] > 0:
        features.append('Multi-level Analysis')
    if any('time_slice' in str(f) for f in summary.get('extra_files', [])):
        features.append('Time Slicing')
    if summary['by_category']['OUTPUT_DATA']['subcategories']['ADVANCED_RESULTS']['found'] > 0:
        features.append('Advanced Analysis')
    
    print(f"  Features Enabled: {', '.join(features) if features else 'None'}")
    
    # Save results
    output_dir = Path(job_output_dir) / 'sensitivity_structure_check'
    checker.save_enhanced_results(summary, output_dir)
    
    return summary

# Example usage
if __name__ == "__main__":
    # Replace with your actual job directory
    job_dir = r"D:\Documents\daily\E_Plus_2040_py\output\82e2b83f-5013-4270-8266-a37a67dbd4ff"
    summary = check_sensitivity_structure_enhanced(job_dir)


                         [1m[96mSENSITIVITY DATA STRUCTURE DASHBOARD[0m                          

[1m[94m📊 OVERVIEW[0m
  Total Files Expected: 33
  Total Files Found: [92m14[0m
  Overall Completeness: [91m████████░░░░░░░░░░░░[0m 42.4%
  Total Data Size: 1.46 MB
  Total Data Rows: 320,510

[1m[94m📁 CATEGORY BREAKDOWN[0m

  [1m[93mInput Data Files[0m
  [93m████████████░░░░░░░░[0m 62.5%
  Files: 10/16

    ✅ Modification Tracking
       Files: 1/1 | Size: 0.03 MB | Rows: 893

    ⚠️ Base Simulation Results
       Files: 3/6 | Size: 0.11 MB | Rows: 213,060
       [91m❌ Missing:[0m energy_daily.parquet
       [91m❌ Missing:[0m electricity_daily.parquet
       [91m❌ Missing:[0m temperature_daily.parquet

    ⚠️ Modified Simulation Results
       Files: 2/4 | Size: 0.07 MB | Rows: 105,190
       [91m❌ Missing:[0m energy_daily.parquet
       [91m❌ Missing:[0m electricity_daily.parquet

    ✅ Zone/Equipment Relationships
       Files: 2/2 | Size: 0.00 MB | Rows: 89

# PArsing

In [9]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
import json
from datetime import datetime

class ParserDataChecker:
    def __init__(self, parsed_data_dir: str):
        self.parsed_data_dir = Path(parsed_data_dir)
        self.results = []
        self.discovered_columns = {}
        
        # Expected structure based on parser implementation
        self.expected_structure = {
            # ===== METADATA FILES =====
            'metadata/project_manifest.json': {
                'type': 'json',
                'fields': {
                    'project_id': 'str',
                    'created': 'str',
                    'total_buildings': 'int',
                    'categories_tracked': 'list',
                    'last_updated': 'str',
                    'data_version': 'str',
                    'file_structure': 'dict'
                }
            },
            'metadata/building_registry.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'ogc_fid': ['str', 'int'],
                    'idf_path': 'str',
                    'sql_path': 'str',
                    'zone_count': 'int',
                    'output_variables': 'int',
                    'output_meters': 'int',
                    'status': 'str',
                    'last_modified': 'datetime',
                    'variant_id': 'str'
                }
            },
            'metadata/category_schemas.json': {
                'type': 'json',
                'fields': {
                    'columns': 'list',
                    'dtypes': 'dict',
                    'row_count': 'int',
                    'building_count': 'int'
                }
            },
            'metadata/output_documentation.json': {
                'type': 'json',
                'fields': {
                    'project': 'str',
                    'timestamp': 'str',
                    'buildings': 'dict',
                    'summary': 'dict',
                    'output_categories': 'dict'
                }
            },
            
            # ===== IDF DATA BY CATEGORY =====
            'idf_data/by_category/simulation_control.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'variant_id': 'str',
                    'do_zone_sizing_calculation': 'str',
                    'do_system_sizing_calculation': 'str',
                    'do_plant_sizing_calculation': 'str',
                    'run_simulation_for_sizing_periods': 'str',
                    'run_simulation_for_weather_file_run_periods': 'str'
                }
            },
            'idf_data/by_category/site_location.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'latitude': ['str', 'float'],
                    'longitude': ['str', 'float'],
                    'time_zone': ['str', 'float'],
                    'elevation': ['str', 'float']
                }
            },
            'idf_data/by_category/geometry_zones.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'zone_name': 'str',
                    'volume': ['str', 'float'],
                    'floor_area': ['str', 'float'],
                    'ceiling_height': ['str', 'float'],
                    'multiplier': ['str', 'float'],
                    'volume_numeric': 'float',
                    'floor_area_numeric': 'float',
                    'ceiling_height_numeric': 'float'
                }
            },
            'idf_data/by_category/geometry_surfaces.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'zone_name': 'str',
                    'surface_type': 'str',
                    'construction_name': 'str',
                    'outside_boundary_condition': 'str',
                    'sun_exposure': 'str',
                    'wind_exposure': 'str',
                    'number_of_vertices': ['str', 'float']
                }
            },
            'idf_data/by_category/materials_constructions.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'outside_layer': 'str',
                    'layer_2': 'str',
                    'layer_3': 'str',
                    'layer_4': 'str',
                    'layer_5': 'str'
                }
            },
            'idf_data/by_category/materials_materials.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'roughness': 'str',
                    'thickness': ['str', 'float'],
                    'conductivity': ['str', 'float'],
                    'density': ['str', 'float'],
                    'specific_heat': ['str', 'float'],
                    'thickness_numeric': 'float',
                    'conductivity_numeric': 'float',
                    'density_numeric': 'float',
                    'specific_heat_numeric': 'float'
                }
            },
            'idf_data/by_category/hvac_equipment.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'zone_name': 'str',
                    'availability_schedule_name': 'str',
                    'maximum_heating_supply_air_temperature': ['str', 'float'],
                    'minimum_cooling_supply_air_temperature': ['str', 'float'],
                    'maximum_heating_air_flow_rate': ['str', 'float'],
                    'maximum_cooling_air_flow_rate': ['str', 'float']
                }
            },
            'idf_data/by_category/hvac_thermostats.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'heating_setpoint_temperature_schedule_name': 'str',
                    'cooling_setpoint_temperature_schedule_name': 'str'
                }
            },
            'idf_data/by_category/outputs_all.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'output_type': 'str',
                    'output_subtype': 'str',
                    'key_value': 'str',
                    'name': 'str',
                    'reporting_frequency': 'str',
                    'schedule_name': 'str'
                }
            },
            'idf_data/by_category/ventilation.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'zone_name': 'str',
                    'design_flow_rate': ['str', 'float'],
                    'flow_rate_per_zone_floor_area': ['str', 'float'],
                    'air_changes_per_hour': ['str', 'float'],
                    'schedule_name': 'str',
                    'design_flow_rate_numeric': 'float'
                }
            },
            'idf_data/by_category/infiltration.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'zone_name': 'str',
                    'design_flow_rate': ['str', 'float'],
                    'air_changes_per_hour': ['str', 'float'],
                    'constant_term_coefficient': ['str', 'float'],
                    'temperature_term_coefficient': ['str', 'float'],
                    'design_flow_rate_numeric': 'float'
                }
            },
            'idf_data/by_category/lighting.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'zone_name': 'str',
                    'lighting_level': ['str', 'float'],
                    'watts_per_zone_floor_area': ['str', 'float'],
                    'schedule_name': 'str',
                    'fraction_radiant': ['str', 'float'],
                    'fraction_visible': ['str', 'float'],
                    'lighting_level_numeric': 'float',
                    'watts_per_zone_floor_area_numeric': 'float'
                }
            },
            'idf_data/by_category/equipment.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'zone_name': 'str',
                    'design_level': ['str', 'float'],
                    'watts_per_zone_floor_area': ['str', 'float'],
                    'schedule_name': 'str',
                    'fraction_latent': ['str', 'float'],
                    'fraction_radiant': ['str', 'float'],
                    'design_level_numeric': 'float'
                }
            },
            'idf_data/by_category/dhw.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'tank_volume': ['str', 'float'],
                    'heater_maximum_capacity': ['str', 'float'],
                    'setpoint_temperature_schedule_name': 'str',
                    'heater_thermal_efficiency': ['str', 'float'],
                    'heater_fuel_type': 'str',
                    'tank_volume_numeric': 'float',
                    'heater_maximum_capacity_numeric': 'float'
                }
            },
            'idf_data/by_category/schedules.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'object_type': 'str',
                    'object_name': 'str',
                    'schedule_type_limits_name': 'str'
                }
            },
            
            # ===== SQL RESULTS =====
            'sql_results/timeseries/hourly/*_2020.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'DateTime': 'datetime',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float',
                    'Units': 'str',
                    'ReportingFrequency': 'str'
                }
            },
            'sql_results/timeseries/aggregated/daily/*_daily.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'DateTime': 'datetime',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float'
                }
            },
            'sql_results/timeseries/aggregated/monthly/*_monthly.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'DateTime': 'datetime',
                    'Zone': 'str',
                    'Variable': 'str',
                    'Value': 'float'
                }
            },
            'sql_results/schedules/all_schedules.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'ScheduleIndex': 'int',
                    'ScheduleName': 'str',
                    'ScheduleType': 'str',
                    'ScheduleMinimum': 'float',
                    'ScheduleMaximum': 'float'
                }
            },
            'sql_results/summary_metrics/building_metrics.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'sql_file': 'str',
                    'extraction_date': 'str',
                    'zone_count': 'int',
                    'total_floor_area': 'float',
                    'total_volume': 'float',
                    'energyplus_version': 'str',
                    'simulation_timestamp': 'str',
                    'timesteps_per_hour': 'int',
                    'total_output_variables': 'int',
                    'outputs_with_data': 'int',
                    'output_coverage_percent': 'float'
                }
            },
            'sql_results/summary_metrics/zone_metrics.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'zone_name': 'str',
                    'floor_area': 'float',
                    'volume': 'float'
                }
            },
            'sql_results/output_validation/validation_results.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'total_requested': 'int',
                    'found': 'int',
                    'coverage': 'float'
                }
            },
            'sql_results/output_validation/missing_outputs.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'variable': 'str',
                    'key': 'str',
                    'frequency': 'str'
                }
            },
            'sql_results/output_validation/existing_outputs.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'variable': 'str',
                    'key': 'str',
                    'frequency': 'str',
                    'found_in_sql': 'bool',
                    'has_data': 'bool'
                }
            },
            'sql_results/output_validation/available_outputs.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'variant_id': 'str',
                    'ReportDataDictionaryIndex': 'int',
                    'VariableName': 'str',
                    'KeyValue': 'str',
                    'Units': 'str',
                    'ReportingFrequency': 'str',
                    'DataPoints': 'int',
                    'HasData': 'bool',
                    'Category': 'str'
                }
            },
            
            # ===== RELATIONSHIPS =====
            'relationships/zone_mappings.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'idf_zone_name': 'str',
                    'sql_zone_name': 'str',
                    'zone_type': 'str',
                    'multiplier': 'int',
                    'mapping_confidence': 'float'
                }
            },
            'relationships/equipment_assignments.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    'equipment_name': 'str',
                    'equipment_type': 'str',
                    'assigned_zone': 'str',
                    'schedule': 'str'
                }
            },
            
            # ===== ANALYSIS READY =====
            'analysis_ready/parameter_matrix.parquet': {
                'type': 'parquet',
                'columns': {
                    'building_id': 'str',
                    # Dynamic columns based on categories
                    # Examples:
                    'geometry_volume': 'float',
                    'geometry_floor_area': 'float',
                    'geometry_ceiling_height': 'float',
                    'materials_thickness': 'float',
                    'materials_conductivity': 'float',
                    'hvac_cooling_capacity': 'float',
                    'hvac_heating_capacity': 'float'
                }
            },
            'analysis_ready/output_analysis/coverage_summary.json': {
                'type': 'json',
                'fields': {
                    'total_buildings': 'int',
                    'average_coverage': 'float',
                    'min_coverage': 'float',
                    'max_coverage': 'float',
                    'perfect_coverage_count': 'int',
                    'buildings_with_issues': 'int',
                    'top_missing_outputs': 'dict',
                    'top_existing_outputs': 'dict'
                }
            }
        }
    
    def convert_to_native_types(self, obj: Any) -> Any:
        """Convert numpy/pandas types to native Python types."""
        if isinstance(obj, dict):
            return {k: self.convert_to_native_types(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self.convert_to_native_types(item) for item in obj]
        elif isinstance(obj, pd.Timestamp):
            return obj.isoformat()
        elif isinstance(obj, datetime):
            return obj.isoformat()
        elif isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            if np.isnan(obj):
                return None
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif pd.isna(obj):
            return None
        else:
            return obj
    
    def check_file_exists(self, file_path: str) -> Tuple[bool, Optional[str]]:
        """Check if file exists, handling wildcards."""
        full_path = self.parsed_data_dir / file_path
        
        if '*' in file_path:
            parent = full_path.parent
            pattern = full_path.name
            if parent.exists():
                matching_files = list(parent.glob(pattern))
                if matching_files:
                    return True, str(matching_files[0])
            return False, None
        
        return full_path.exists(), str(full_path) if full_path.exists() else None
    
    def check_json_file(self, file_path: str, expected_fields: Dict[str, str]) -> Dict:
        """Check a JSON file's structure."""
        exists, abs_path = self.check_file_exists(file_path)
        
        result = {
            'file': file_path,
            'exists': exists,
            'path': abs_path,
            'fields': {},
            'extra_fields': {}
        }
        
        if exists and abs_path:
            try:
                with open(abs_path, 'r') as f:
                    data = json.load(f)
                
                # For nested JSON, check top-level keys
                if file_path == 'metadata/category_schemas.json':
                    # This has category names as keys
                    result['categories'] = list(data.keys())
                    result['category_count'] = len(data)
                    # Check first category structure
                    if data:
                        first_cat = next(iter(data.values()))
                        for field, expected_type in expected_fields.items():
                            if field in first_cat:
                                result['fields'][field] = {
                                    'exists': True,
                                    'type': type(first_cat[field]).__name__
                                }
                            else:
                                result['fields'][field] = {
                                    'exists': False,
                                    'type': None
                                }
                else:
                    # Regular field checking
                    for field, expected_type in expected_fields.items():
                        if field in data:
                            result['fields'][field] = {
                                'exists': True,
                                'type': type(data[field]).__name__,
                                'value': self.convert_to_native_types(data[field]) if not isinstance(data[field], (dict, list)) else f"{expected_type} with {len(data[field])} items"
                            }
                        else:
                            result['fields'][field] = {
                                'exists': False,
                                'type': None
                            }
                    
                    # Check for extra fields
                    for field in data:
                        if field not in expected_fields:
                            result['extra_fields'][field] = {
                                'type': type(data[field]).__name__,
                                'value': self.convert_to_native_types(data[field]) if not isinstance(data[field], (dict, list)) else f"{type(data[field]).__name__} with {len(data[field])} items"
                            }
                            
            except Exception as e:
                result['error'] = str(e)
        
        return result
    
    def check_parquet_file(self, file_path: str, expected_columns: Dict[str, any]) -> Dict:
        """Check a parquet file's structure."""
        exists, abs_path = self.check_file_exists(file_path)
        
        result = {
            'file': file_path,
            'exists': exists,
            'path': abs_path,
            'row_count': 0,
            'columns': {},
            'extra_columns': {}
        }
        
        if exists and abs_path:
            try:
                df = pd.read_parquet(abs_path)
                result['row_count'] = len(df)
                
                # Store discovered columns
                if file_path not in self.discovered_columns:
                    self.discovered_columns[file_path] = {}
                
                # Get actual columns
                actual_columns = {col: self.get_dtype_string(df[col].dtype) for col in df.columns}
                
                # Check expected columns
                for exp_col, exp_type in expected_columns.items():
                    if exp_col in actual_columns:
                        actual_type = actual_columns[exp_col]
                        
                        # Handle multiple expected types
                        if isinstance(exp_type, list):
                            type_match = actual_type in exp_type
                        else:
                            type_match = (actual_type == exp_type) or \
                                       (exp_type == 'str' and actual_type in ['object', 'str'])
                        
                        result['columns'][exp_col] = {
                            'expected_type': exp_type,
                            'actual_type': actual_type,
                            'exists': True,
                            'type_match': type_match,
                            'null_count': int(df[exp_col].isnull().sum()),
                            'unique_count': int(df[exp_col].nunique()),
                            'sample_values': self.get_sample_values(df[exp_col])
                        }
                    else:
                        result['columns'][exp_col] = {
                            'expected_type': exp_type,
                            'actual_type': None,
                            'exists': False,
                            'type_match': False
                        }
                
                # Check for extra columns
                for col in actual_columns:
                    if col not in expected_columns:
                        result['extra_columns'][col] = {
                            'actual_type': actual_columns[col],
                            'null_count': int(df[col].isnull().sum()),
                            'unique_count': int(df[col].nunique()),
                            'sample_values': self.get_sample_values(df[col], 3),
                            'stats': self.get_column_stats(df[col])
                        }
                        self.discovered_columns[file_path][col] = result['extra_columns'][col]
                        
            except Exception as e:
                result['error'] = str(e)
        
        return result
    
    def get_dtype_string(self, dtype) -> str:
        """Convert numpy/pandas dtype to string."""
        dtype_str = str(dtype)
        
        if 'int' in dtype_str:
            return 'int'
        elif 'float' in dtype_str:
            return 'float'
        elif 'object' in dtype_str or 'string' in dtype_str:
            return 'str'
        elif 'datetime' in dtype_str:
            return 'datetime'
        elif 'bool' in dtype_str:
            return 'bool'
        else:
            return dtype_str
    
    def get_sample_values(self, series: pd.Series, n_samples: int = 5) -> List:
        """Get sample values from a series."""
        unique_vals = series.dropna().unique()
        if len(unique_vals) <= n_samples:
            return [self.convert_to_native_types(val) for val in unique_vals]
        else:
            samples = []
            if pd.api.types.is_numeric_dtype(series):
                samples.append(self.convert_to_native_types(series.min()))
                samples.append(self.convert_to_native_types(series.max()))
                remaining = n_samples - 2
                if remaining > 0 and len(series) > 0:
                    random_samples = series.dropna().sample(n=min(remaining, len(series))).tolist()
                    samples.extend([self.convert_to_native_types(val) for val in random_samples[:remaining]])
            else:
                samples = [self.convert_to_native_types(val) for val in unique_vals[:n_samples]]
            return samples
    
    def get_column_stats(self, series: pd.Series) -> Dict:
        """Get statistics for a column."""
        stats = {
            'total_count': int(len(series)),
            'non_null_count': int(series.count()),
            'null_percentage': float((series.isnull().sum() / len(series) * 100) if len(series) > 0 else 0)
        }
        
        if pd.api.types.is_numeric_dtype(series) and series.count() > 0:
            stats.update({
                'mean': self.convert_to_native_types(series.mean()),
                'std': self.convert_to_native_types(series.std()),
                'min': self.convert_to_native_types(series.min()),
                'max': self.convert_to_native_types(series.max()),
                'q25': self.convert_to_native_types(series.quantile(0.25)),
                'q50': self.convert_to_native_types(series.quantile(0.50)),
                'q75': self.convert_to_native_types(series.quantile(0.75))
            })
        elif pd.api.types.is_string_dtype(series) or series.dtype == 'object':
            value_counts = series.value_counts()
            stats['unique_values'] = int(len(value_counts))
            if len(value_counts) > 0:
                stats['most_common'] = {str(k): int(v) for k, v in value_counts.head(5).items()}
        
        return stats
    
    def check_all_files(self) -> pd.DataFrame:
        """Check all expected files."""
        print(f"Checking parser data structure in: {self.parsed_data_dir}\n")
        
        all_results = []
        
        for file_path, expected_info in self.expected_structure.items():
            print(f"Checking: {file_path}")
            
            if expected_info['type'] == 'json':
                result = self.check_json_file(file_path, expected_info.get('fields', {}))
                
                if result['exists']:
                    # Add JSON field results
                    for field_name, field_info in result['fields'].items():
                        all_results.append({
                            'File': file_path.split('/')[-1],
                            'Path': file_path,
                            'File_Type': 'JSON',
                            'File_Exists': '✓',
                            'Rows': 'N/A',
                            'Column': field_name,
                            'Column_Type': 'Expected',
                            'Expected_Type': expected_info['fields'][field_name],
                            'Actual_Type': field_info['type'],
                            'Column_Exists': '✓' if field_info['exists'] else '✗',
                            'Type_Match': '✓' if field_info['exists'] else '✗',
                            'Null_Count': 'N/A',
                            'Unique_Values': 'N/A',
                            'Sample_Values': str(field_info.get('value', 'N/A'))[:50],
                            'Status': 'OK' if field_info['exists'] else 'ISSUE'
                        })
                    
                    # Add extra fields
                    for field_name, field_info in result.get('extra_fields', {}).items():
                        all_results.append({
                            'File': file_path.split('/')[-1],
                            'Path': file_path,
                            'File_Type': 'JSON',
                            'File_Exists': '✓',
                            'Rows': 'N/A',
                            'Column': field_name,
                            'Column_Type': 'DISCOVERED',
                            'Expected_Type': 'N/A',
                            'Actual_Type': field_info['type'],
                            'Column_Exists': '✓',
                            'Type_Match': 'N/A',
                            'Null_Count': 'N/A',
                            'Unique_Values': 'N/A',
                            'Sample_Values': str(field_info.get('value', 'N/A'))[:50],
                            'Status': 'EXTRA'
                        })
                else:
                    all_results.append({
                        'File': file_path.split('/')[-1],
                        'Path': file_path,
                        'File_Type': 'JSON',
                        'File_Exists': '✗',
                        'Rows': 0,
                        'Column': 'N/A',
                        'Column_Type': 'N/A',
                        'Expected_Type': 'N/A',
                        'Actual_Type': 'N/A',
                        'Column_Exists': 'N/A',
                        'Type_Match': 'N/A',
                        'Null_Count': None,
                        'Unique_Values': None,
                        'Sample_Values': 'N/A',
                        'Status': 'MISSING FILE'
                    })
                    
            else:  # Parquet files
                result = self.check_parquet_file(file_path, expected_info.get('columns', {}))
                
                if result['exists']:
                    # Add expected columns
                    for col_name, col_info in result['columns'].items():
                        all_results.append({
                            'File': file_path.split('/')[-1],
                            'Path': file_path,
                            'File_Type': 'Parquet',
                            'File_Exists': '✓',
                            'Rows': result['row_count'],
                            'Column': col_name,
                            'Column_Type': 'Expected',
                            'Expected_Type': col_info['expected_type'],
                            'Actual_Type': col_info.get('actual_type', 'N/A'),
                            'Column_Exists': '✓' if col_info['exists'] else '✗',
                            'Type_Match': '✓' if col_info.get('type_match', False) else '✗',
                            'Null_Count': col_info.get('null_count', 'N/A'),
                            'Unique_Values': col_info.get('unique_count', 'N/A'),
                            'Sample_Values': str(col_info.get('sample_values', [])[:3]),
                            'Status': 'OK' if col_info['exists'] and col_info.get('type_match', False) else 'ISSUE'
                        })
                    
                    # Add extra columns
                    for col_name, col_info in result['extra_columns'].items():
                        all_results.append({
                            'File': file_path.split('/')[-1],
                            'Path': file_path,
                            'File_Type': 'Parquet',
                            'File_Exists': '✓',
                            'Rows': result['row_count'],
                            'Column': col_name,
                            'Column_Type': 'DISCOVERED',
                            'Expected_Type': 'N/A',
                            'Actual_Type': col_info['actual_type'],
                            'Column_Exists': '✓',
                            'Type_Match': 'N/A',
                            'Null_Count': col_info['null_count'],
                            'Unique_Values': col_info['unique_count'],
                            'Sample_Values': str(col_info['sample_values'][:3]),
                            'Status': 'EXTRA'
                        })
                else:
                    all_results.append({
                        'File': file_path.split('/')[-1],
                        'Path': file_path,
                        'File_Type': 'Parquet',
                        'File_Exists': '✗',
                        'Rows': 0,
                        'Column': 'N/A',
                        'Column_Type': 'N/A',
                        'Expected_Type': 'N/A',
                        'Actual_Type': 'N/A',
                        'Column_Exists': 'N/A',
                        'Type_Match': 'N/A',
                        'Null_Count': None,
                        'Unique_Values': None,
                        'Sample_Values': 'N/A',
                        'Status': 'MISSING FILE'
                    })
        
        return pd.DataFrame(all_results)
    
    def create_summary_report(self, df_results: pd.DataFrame) -> Dict:
        """Create summary report."""
        summary = {
            'check_timestamp': datetime.now().isoformat(),
            'parsed_data_dir': str(self.parsed_data_dir),
            'total_files_expected': len(self.expected_structure),
            'files_found': len(df_results[df_results['File_Exists'] == '✓']['Path'].unique()),
            'files_missing': len(df_results[df_results['File_Exists'] == '✗']['Path'].unique()),
            'total_expected_columns': len(df_results[df_results['Column_Type'] == 'Expected']),
            'expected_columns_found': len(df_results[(df_results['Column_Type'] == 'Expected') & 
                                                    (df_results['Column_Exists'] == '✓')]),
            'total_discovered_columns': len(df_results[df_results['Column_Type'] == 'DISCOVERED']),
            'total_issues': len(df_results[df_results['Status'].isin(['ISSUE', 'MISSING FILE'])]),
            'missing_files': list(df_results[df_results['File_Exists'] == '✗']['Path'].unique()),
            'data_categories': {
                'metadata': False,
                'idf_data': False,
                'sql_results': False,
                'relationships': False,
                'analysis_ready': False,
                'output_validation': False
            },
            'parser_completeness': 0.0
        }
        
        # Check data categories
        files_exist = df_results[df_results['File_Exists'] == '✓']['Path'].unique()
        summary['data_categories']['metadata'] = any('metadata/' in f for f in files_exist)
        summary['data_categories']['idf_data'] = any('idf_data/' in f for f in files_exist)
        summary['data_categories']['sql_results'] = any('sql_results/' in f for f in files_exist)
        summary['data_categories']['relationships'] = any('relationships/' in f for f in files_exist)
        summary['data_categories']['analysis_ready'] = any('analysis_ready/' in f for f in files_exist)
        summary['data_categories']['output_validation'] = any('output_validation/' in f for f in files_exist)
        
        # Calculate completeness
        categories_complete = sum(summary['data_categories'].values())
        summary['parser_completeness'] = (categories_complete / len(summary['data_categories'])) * 100
        
        return summary
    
    def save_results(self, df_results: pd.DataFrame, summary: Dict):
        """Save results to files."""
        output_dir = self.parsed_data_dir / 'parser_data_check'
        output_dir.mkdir(exist_ok=True)
        
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save detailed results
        df_results.to_csv(output_dir / f'parser_check_details_{timestamp}.csv', index=False)
        df_results.to_excel(output_dir / f'parser_check_details_{timestamp}.xlsx', index=False)
        
        # Save summary
        with open(output_dir / f'parser_check_summary_{timestamp}.json', 'w') as f:
            json.dump(summary, f, indent=2)
        
        # Save discovered columns
        if self.discovered_columns:
            discovered_native = self.convert_to_native_types(self.discovered_columns)
            with open(output_dir / f'discovered_columns_{timestamp}.json', 'w') as f:
                json.dump(discovered_native, f, indent=2)
        
        # Create markdown report
        self.create_markdown_report(output_dir / f'parser_report_{timestamp}.md', df_results, summary)
        
        print(f"\nResults saved to: {output_dir}")

    def create_markdown_report(self, output_path: Path, df_results: pd.DataFrame, summary: Dict):
        """Create markdown report."""
        with open(output_path, 'w') as f:
            f.write("# Parser Data Structure Check Report\n\n")
            f.write(f"**Generated:** {summary['check_timestamp']}\n")
            f.write(f"**Directory:** `{summary['parsed_data_dir']}`\n\n")
            
            f.write("## Summary\n\n")
            f.write(f"- **Files Expected:** {summary['total_files_expected']}\n")
            f.write(f"- **Files Found:** {summary['files_found']}\n")
            f.write(f"- **Files Missing:** {summary['files_missing']}\n")
            f.write(f"- **Parser Completeness:** {summary['parser_completeness']:.1f}%\n\n")
            
            f.write("## Data Categories Status\n\n")
            for category, status in summary['data_categories'].items():
                f.write(f"- **{category}:** {'✓' if status else '✗'}\n")
            f.write("\n")
            
            if summary['missing_files']:
                f.write("## Missing Files\n\n")
                for file in summary['missing_files']:
                    f.write(f"- `{file}`\n")
                f.write("\n")

# Main function
def check_parser_data_structure(parsed_data_dir: str):
    """Check parser data structure."""
    checker = ParserDataChecker(parsed_data_dir)
    
    # Run checks
    df_results = checker.check_all_files()
    summary = checker.create_summary_report(df_results)
    
    # Print summary
    print("\n" + "="*80)
    print("PARSER DATA STRUCTURE CHECK SUMMARY")
    print("="*80)
    print(f"Directory: {parsed_data_dir}")
    print(f"Files found: {summary['files_found']}/{summary['total_files_expected']}")
    print(f"Parser completeness: {summary['parser_completeness']:.1f}%")
    print("\nData Categories:")
    for category, status in summary['data_categories'].items():
        print(f"  - {category}: {'✓' if status else '✗'}")
    
    # Save results
    checker.save_results(df_results, summary)
    
    return df_results, summary

# Example usage
if __name__ == "__main__":
    # Example path - update with your actual parsed data directory
    parsed_dir = r"D:\Documents\daily\E_Plus_2040_py\output\82e2b83f-5013-4270-8266-a37a67dbd4ff\parsed_modified_results"
    df_results, summary = check_parser_data_structure(parsed_dir)

Checking parser data structure in: D:\Documents\daily\E_Plus_2040_py\output\82e2b83f-5013-4270-8266-a37a67dbd4ff\parsed_modified_results

Checking: metadata/project_manifest.json
Checking: metadata/building_registry.parquet
Checking: metadata/category_schemas.json
Checking: metadata/output_documentation.json
Checking: idf_data/by_category/simulation_control.parquet
Checking: idf_data/by_category/site_location.parquet
Checking: idf_data/by_category/geometry_zones.parquet
Checking: idf_data/by_category/geometry_surfaces.parquet
Checking: idf_data/by_category/materials_constructions.parquet
Checking: idf_data/by_category/materials_materials.parquet
Checking: idf_data/by_category/hvac_equipment.parquet
Checking: idf_data/by_category/hvac_thermostats.parquet
Checking: idf_data/by_category/outputs_all.parquet
Checking: idf_data/by_category/ventilation.parquet
Checking: idf_data/by_category/infiltration.parquet
Checking: idf_data/by_category/lighting.parquet
Checking: idf_data/by_category/equ

## v2

## v3

In [15]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any, Set
import json
from datetime import datetime
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

class EnhancedParserDataChecker:
    def __init__(self, base_parsed_dir: str, modified_parsed_dir: str = None):
        self.base_parsed_dir = Path(base_parsed_dir)
        self.modified_parsed_dir = Path(modified_parsed_dir) if modified_parsed_dir else None
        self.results = {}
        self.discovered_columns = {}
        
        # Color codes for terminal output
        self.COLORS = {
            'GREEN': '\033[92m',
            'RED': '\033[91m',
            'YELLOW': '\033[93m',
            'BLUE': '\033[94m',
            'PURPLE': '\033[95m',
            'CYAN': '\033[96m',
            'BOLD': '\033[1m',
            'END': '\033[0m'
        }
        
        # Expected structure (same as before but reorganized)
        self.expected_structure = self._get_expected_structure()
        
    def _get_expected_structure(self) -> Dict:
        """Get expected file structure organized by category."""
        return {
            'metadata': {
                'files': {
                    'metadata/project_manifest.json': {
                        'type': 'json',
                        'description': 'Project overview and configuration',
                        'critical': True,
                        'fields': {
                            'project_id': 'str',
                            'created': 'str',
                            'total_buildings': 'int',
                            'categories_tracked': 'list',
                            'data_version': 'str'
                        }
                    },
                    'metadata/building_registry.parquet': {
                        'type': 'parquet',
                        'description': 'Central registry of all buildings',
                        'critical': True,
                        'columns': {
                            'building_id': 'str',
                            'ogc_fid': ['str', 'int'],
                            'idf_path': 'str',
                            'sql_path': 'str',
                            'zone_count': 'int',
                            'variant_id': 'str'
                        }
                    },
                    'metadata/category_schemas.json': {
                        'type': 'json',
                        'description': 'Schema definitions for each category',
                        'critical': False,
                        'fields': {
                            'columns': 'list',
                            'dtypes': 'dict',
                            'row_count': 'int',
                            'building_count': 'int'
                        }
                    }
                }
            },
            'idf_data': {
                'files': {
                    'idf_data/by_category/geometry_zones.parquet': {
                        'type': 'parquet',
                        'description': 'Zone geometry and properties',
                        'critical': True,
                        'columns': {
                            'building_id': 'str',
                            'object_name': 'str',
                            'zone_name': 'str',
                            'volume': ['str', 'float'],
                            'floor_area': ['str', 'float'],
                            'ceiling_height': ['str', 'float']
                        }
                    },
                    'idf_data/by_category/hvac_equipment.parquet': {
                        'type': 'parquet',
                        'description': 'HVAC equipment specifications',
                        'critical': True,
                        'columns': {
                            'building_id': 'str',
                            'object_name': 'str',
                            'zone_name': 'str',
                            'maximum_heating_supply_air_temperature': ['str', 'float'],
                            'minimum_cooling_supply_air_temperature': ['str', 'float']
                        }
                    },
                    'idf_data/by_category/lighting.parquet': {
                        'type': 'parquet',
                        'description': 'Lighting loads and schedules',
                        'critical': True,
                        'columns': {
                            'building_id': 'str',
                            'zone_name': 'str',
                            'watts_per_zone_floor_area': ['str', 'float'],
                            'fraction_radiant': ['str', 'float']
                        }
                    },
                    'idf_data/by_category/outputs_all.parquet': {
                        'type': 'parquet',
                        'description': 'Consolidated output definitions',
                        'critical': True,
                        'columns': {
                            'building_id': 'str',
                            'output_type': 'str',
                            'name': 'str',
                            'reporting_frequency': 'str'
                        }
                    }
                }
            },
            'sql_results': {
                'files': {
                    'sql_results/timeseries/hourly/*_2020.parquet': {
                        'type': 'parquet',
                        'description': 'Hourly time series data',
                        'critical': True,
                        'columns': {
                            'building_id': 'str',
                            'DateTime': 'datetime',
                            'Zone': 'str',
                            'Variable': 'str',
                            'Value': 'float'
                        }
                    },
                    'sql_results/summary_metrics/building_metrics.parquet': {
                        'type': 'parquet',
                        'description': 'Building-level summary metrics',
                        'critical': True,
                        'columns': {
                            'building_id': 'str',
                            'total_floor_area': 'float',
                            'total_volume': 'float',
                            'zone_count': 'int'
                        }
                    }
                }
            },
            'relationships': {
                'files': {
                    'relationships/zone_mappings.parquet': {
                        'type': 'parquet',
                        'description': 'IDF to SQL zone name mappings',
                        'critical': True,
                        'columns': {
                            'building_id': 'str',
                            'idf_zone_name': 'str',
                            'sql_zone_name': 'str'
                        }
                    }
                }
            },
            'output_validation': {
                'files': {
                    'sql_results/output_validation/validation_results.parquet': {
                        'type': 'parquet',
                        'description': 'Output completeness validation',
                        'critical': False,
                        'columns': {
                            'building_id': 'str',
                            'total_requested': 'int',
                            'found': 'int',
                            'coverage': 'float'
                        }
                    }
                }
            }
        }
    
    def check_file_exists(self, base_path: Path, file_path: str) -> Tuple[bool, Optional[str]]:
        """Check if file exists, handling wildcards."""
        full_path = base_path / file_path
        
        if '*' in file_path:
            parent = full_path.parent
            pattern = full_path.name
            if parent.exists():
                matching_files = list(parent.glob(pattern))
                if matching_files:
                    return True, str(matching_files[0])
            return False, None
        
        return full_path.exists(), str(full_path) if full_path.exists() else None
    
    def check_directory(self, dir_path: Path, dir_name: str) -> Dict:
        """Check a single directory comprehensively."""
        results = {
            'directory': dir_name,
            'path': str(dir_path),
            'exists': dir_path.exists(),
            'categories': {},
            'summary': {
                'total_expected': 0,
                'found': 0,
                'missing': 0,
                'critical_missing': 0,
                'extra_files': 0
            }
        }
        
        if not dir_path.exists():
            return results
        
        # Check each category
        for category, cat_info in self.expected_structure.items():
            cat_results = {
                'files': {},
                'found': 0,
                'missing': 0,
                'critical_missing': 0
            }
            
            for file_path, file_info in cat_info['files'].items():
                exists, abs_path = self.check_file_exists(dir_path, file_path)
                
                file_result = {
                    'exists': exists,
                    'path': abs_path,
                    'type': file_info['type'],
                    'description': file_info['description'],
                    'critical': file_info.get('critical', False),
                    'status': 'OK' if exists else ('CRITICAL' if file_info.get('critical', False) else 'MISSING')
                }
                
                if exists:
                    cat_results['found'] += 1
                    results['summary']['found'] += 1
                    
                    # Check file contents
                    if file_info['type'] == 'parquet':
                        file_result.update(self._check_parquet_contents(abs_path, file_info.get('columns', {})))
                    elif file_info['type'] == 'json':
                        file_result.update(self._check_json_contents(abs_path, file_info.get('fields', {})))
                else:
                    cat_results['missing'] += 1
                    results['summary']['missing'] += 1
                    if file_info.get('critical', False):
                        cat_results['critical_missing'] += 1
                        results['summary']['critical_missing'] += 1
                
                cat_results['files'][file_path] = file_result
                results['summary']['total_expected'] += 1
            
            results['categories'][category] = cat_results
        
        # Check for extra files
        results['extra_files'] = self._find_extra_files(dir_path)
        results['summary']['extra_files'] = len(results['extra_files'])
        
        return results
    
    def _check_parquet_contents(self, file_path: str, expected_columns: Dict) -> Dict:
        """Check parquet file contents."""
        try:
            df = pd.read_parquet(file_path)
            
            result = {
                'row_count': len(df),
                'actual_columns': list(df.columns),
                'column_check': {},
                'extra_columns': [],
                'missing_columns': []
            }
            
            # Check expected columns
            for col, dtype in expected_columns.items():
                if col in df.columns:
                    actual_dtype = self._get_dtype_string(df[col].dtype)
                    type_match = self._check_type_match(actual_dtype, dtype)
                    
                    result['column_check'][col] = {
                        'exists': True,
                        'expected_type': dtype,
                        'actual_type': actual_dtype,
                        'type_match': type_match,
                        'null_count': int(df[col].isnull().sum()),
                        'unique_count': int(df[col].nunique())
                    }
                else:
                    result['missing_columns'].append(col)
                    result['column_check'][col] = {
                        'exists': False,
                        'expected_type': dtype
                    }
            
            # Find extra columns
            for col in df.columns:
                if col not in expected_columns:
                    result['extra_columns'].append(col)
            
            # Check for variant tracking in modified results
            if 'variant_id' in df.columns:
                result['has_variant_tracking'] = True
                result['variant_ids'] = df['variant_id'].unique().tolist()
            
            return result
            
        except Exception as e:
            return {'error': str(e)}
    
    def _check_json_contents(self, file_path: str, expected_fields: Dict) -> Dict:
        """Check JSON file contents."""
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
            
            result = {
                'field_check': {},
                'extra_fields': [],
                'missing_fields': []
            }
            
            # Check expected fields
            for field, dtype in expected_fields.items():
                if field in data:
                    result['field_check'][field] = {
                        'exists': True,
                        'expected_type': dtype,
                        'actual_type': type(data[field]).__name__
                    }
                else:
                    result['missing_fields'].append(field)
                    result['field_check'][field] = {
                        'exists': False,
                        'expected_type': dtype
                    }
            
            # Find extra fields
            for field in data:
                if field not in expected_fields:
                    result['extra_fields'].append(field)
            
            return result
            
        except Exception as e:
            return {'error': str(e)}
    
    def _find_extra_files(self, dir_path: Path) -> List[str]:
        """Find files not in expected structure."""
        extra_files = []
        
        # Get all expected file patterns
        expected_patterns = set()
        for category in self.expected_structure.values():
            for file_path in category['files'].keys():
                if '*' in file_path:
                    # Handle wildcards
                    parts = file_path.split('/')
                    pattern = parts[-1]
                    parent = '/'.join(parts[:-1])
                    expected_patterns.add((parent, pattern))
                else:
                    expected_patterns.add((file_path, None))
        
        # Walk directory and find extras
        for root, dirs, files in os.walk(dir_path):
            rel_root = Path(root).relative_to(dir_path)
            
            for file in files:
                if file.endswith(('.parquet', '.json')):
                    rel_path = str(rel_root / file).replace('\\', '/')
                    
                    # Check if this matches any expected pattern
                    is_expected = False
                    for expected, pattern in expected_patterns:
                        if pattern:  # Wildcard pattern
                            if str(rel_root).replace('\\', '/') == expected.split('/')[0]:
                                import fnmatch
                                if fnmatch.fnmatch(file, pattern):
                                    is_expected = True
                                    break
                        else:  # Exact match
                            if rel_path == expected:
                                is_expected = True
                                break
                    
                    if not is_expected:
                        extra_files.append(rel_path)
        
        return extra_files
    
    def _get_dtype_string(self, dtype) -> str:
        """Convert numpy/pandas dtype to string."""
        dtype_str = str(dtype)
        
        if 'int' in dtype_str:
            return 'int'
        elif 'float' in dtype_str:
            return 'float'
        elif 'object' in dtype_str or 'string' in dtype_str:
            return 'str'
        elif 'datetime' in dtype_str:
            return 'datetime'
        elif 'bool' in dtype_str:
            return 'bool'
        else:
            return dtype_str
    
    def _check_type_match(self, actual_type: str, expected_type) -> bool:
        """Check if actual type matches expected type(s)."""
        if isinstance(expected_type, list):
            return actual_type in expected_type
        else:
            return actual_type == expected_type or \
                   (expected_type == 'str' and actual_type in ['object', 'str'])
    
    def compare_base_and_modified(self) -> Dict:
        """Compare base and modified results."""
        comparison = {
            'base': self.check_directory(self.base_parsed_dir, 'base'),
            'modified': self.check_directory(self.modified_parsed_dir, 'modified') if self.modified_parsed_dir else None,
            'differences': {}
        }
        
        if comparison['modified']:
            # Find differences
            for category in self.expected_structure:
                cat_diff = {
                    'base_only': [],
                    'modified_only': [],
                    'both_missing': [],
                    'content_differences': {}
                }
                
                base_files = comparison['base']['categories'].get(category, {}).get('files', {})
                mod_files = comparison['modified']['categories'].get(category, {}).get('files', {})
                
                for file_path in set(list(base_files.keys()) + list(mod_files.keys())):
                    base_exists = base_files.get(file_path, {}).get('exists', False)
                    mod_exists = mod_files.get(file_path, {}).get('exists', False)
                    
                    if base_exists and not mod_exists:
                        cat_diff['base_only'].append(file_path)
                    elif not base_exists and mod_exists:
                        cat_diff['modified_only'].append(file_path)
                    elif not base_exists and not mod_exists:
                        cat_diff['both_missing'].append(file_path)
                    elif base_exists and mod_exists:
                        # Check content differences
                        base_content = base_files[file_path]
                        mod_content = mod_files[file_path]
                        
                        if 'row_count' in base_content and 'row_count' in mod_content:
                            if base_content['row_count'] != mod_content['row_count']:
                                cat_diff['content_differences'][file_path] = {
                                    'base_rows': base_content['row_count'],
                                    'modified_rows': mod_content['row_count']
                                }
                
                comparison['differences'][category] = cat_diff
        
        return comparison
    
    def create_visual_summary(self, results: Dict):
        """Create a visual summary of the results."""
        print("\n" + "="*100)
        print(f"{self.COLORS['BOLD']}PARSER DATA STRUCTURE CHECK - VISUAL SUMMARY{self.COLORS['END']}")
        print("="*100)
        
        # Overall summary
        if 'base' in results:
            self._print_directory_summary(results['base'], 'BASE PARSED DATA')
        
        if results.get('modified'):
            print("\n" + "-"*100 + "\n")
            self._print_directory_summary(results['modified'], 'MODIFIED PARSED DATA')
        
        # Comparison summary if both exist
        if results.get('modified') and results.get('differences'):
            print("\n" + "-"*100)
            print(f"\n{self.COLORS['BOLD']}COMPARISON SUMMARY{self.COLORS['END']}")
            print("-"*50)
            
            for category, diffs in results['differences'].items():
                if any([diffs['base_only'], diffs['modified_only'], diffs['both_missing']]):
                    print(f"\n{self.COLORS['CYAN']}{category.upper()}{self.COLORS['END']}")
                    
                    if diffs['base_only']:
                        print(f"  {self.COLORS['YELLOW']}Base only:{self.COLORS['END']} {len(diffs['base_only'])} files")
                    if diffs['modified_only']:
                        print(f"  {self.COLORS['BLUE']}Modified only:{self.COLORS['END']} {len(diffs['modified_only'])} files")
                    if diffs['both_missing']:
                        print(f"  {self.COLORS['RED']}Both missing:{self.COLORS['END']} {len(diffs['both_missing'])} files")
    
    def _print_directory_summary(self, dir_results: Dict, title: str):
        """Print summary for a single directory."""
        print(f"\n{self.COLORS['BOLD']}{title}{self.COLORS['END']}")
        print(f"Path: {dir_results['path']}")
        
        if not dir_results['exists']:
            print(f"{self.COLORS['RED']}DIRECTORY DOES NOT EXIST{self.COLORS['END']}")
            return
        
        summary = dir_results['summary']
        
        # Overall status bar
        total = summary['total_expected']
        found = summary['found']
        missing = summary['missing']
        critical = summary['critical_missing']
        
        print(f"\nOverall: {found}/{total} files found")
        
        # Visual progress bar
        bar_length = 50
        found_length = int((found / total) * bar_length) if total > 0 else 0
        critical_length = int((critical / total) * bar_length) if total > 0 else 0
        
        bar = f"[{self.COLORS['GREEN']}{'█' * found_length}{self.COLORS['END']}"
        bar += f"{self.COLORS['RED']}{'▓' * critical_length}{self.COLORS['END']}"
        bar += f"{self.COLORS['YELLOW']}{'░' * (bar_length - found_length - critical_length)}{self.COLORS['END']}]"
        
        print(bar)
        print(f"Legend: {self.COLORS['GREEN']}█ Found{self.COLORS['END']} | "
              f"{self.COLORS['RED']}▓ Critical Missing{self.COLORS['END']} | "
              f"{self.COLORS['YELLOW']}░ Optional Missing{self.COLORS['END']}")
        
        # Category breakdown
        print("\nCategory Status:")
        for category, cat_results in dir_results['categories'].items():
            cat_found = cat_results['found']
            cat_total = len(cat_results['files'])
            cat_critical = cat_results['critical_missing']
            
            # Status icon
            if cat_critical > 0:
                status = f"{self.COLORS['RED']}✗{self.COLORS['END']}"
            elif cat_found == cat_total:
                status = f"{self.COLORS['GREEN']}✓{self.COLORS['END']}"
            else:
                status = f"{self.COLORS['YELLOW']}⚠{self.COLORS['END']}"
            
            print(f"  {status} {category:<20} {cat_found}/{cat_total} files")
            
            # Show critical missing files
            if cat_critical > 0:
                for file_path, file_info in cat_results['files'].items():
                    if file_info['status'] == 'CRITICAL':
                        print(f"      {self.COLORS['RED']}↳ Missing: {file_path.split('/')[-1]}{self.COLORS['END']}")
        
        # Extra files
        if dir_results['summary']['extra_files'] > 0:
            print(f"\n{self.COLORS['PURPLE']}Extra files found: {dir_results['summary']['extra_files']}{self.COLORS['END']}")
            for extra in dir_results['extra_files'][:5]:
                print(f"  + {extra}")
            if len(dir_results['extra_files']) > 5:
                print(f"  ... and {len(dir_results['extra_files']) - 5} more")
    
    def create_detailed_report(self, results: Dict) -> pd.DataFrame:
        """Create detailed report as DataFrame."""
        rows = []
        
        for dataset in ['base', 'modified']:
            if dataset in results and results[dataset]['exists']:
                dir_results = results[dataset]
                
                for category, cat_results in dir_results['categories'].items():
                    for file_path, file_info in cat_results['files'].items():
                        row = {
                            'Dataset': dataset.upper(),
                            'Category': category,
                            'File': file_path.split('/')[-1],
                            'Path': file_path,
                            'Type': file_info['type'],
                            'Critical': '✓' if file_info.get('critical', False) else '',
                            'Exists': '✓' if file_info['exists'] else '✗',
                            'Status': file_info['status'],
                            'Description': file_info['description']
                        }
                        
                        if file_info['exists']:
                            if 'row_count' in file_info:
                                row['Rows'] = file_info['row_count']
                            if 'missing_columns' in file_info:
                                row['Missing_Columns'] = len(file_info['missing_columns'])
                            if 'extra_columns' in file_info:
                                row['Extra_Columns'] = len(file_info['extra_columns'])
                            if 'has_variant_tracking' in file_info:
                                row['Has_Variants'] = '✓'
                                row['Variant_Count'] = len(file_info.get('variant_ids', []))
                        
                        rows.append(row)
        
        return pd.DataFrame(rows)
    
    def analyze_variant_tracking(self, results: Dict) -> Dict:
        """Analyze variant tracking in modified results."""
        variant_analysis = {
            'has_variant_tracking': False,
            'files_with_variants': [],
            'unique_variants': set(),
            'building_variant_mapping': defaultdict(set)
        }
        
        if not results.get('modified') or not results['modified']['exists']:
            return variant_analysis
        
        mod_results = results['modified']
        
        for category, cat_results in mod_results['categories'].items():
            for file_path, file_info in cat_results['files'].items():
                if file_info.get('exists') and file_info.get('has_variant_tracking'):
                    variant_analysis['has_variant_tracking'] = True
                    variant_analysis['files_with_variants'].append(file_path)
                    
                    if 'variant_ids' in file_info:
                        variant_analysis['unique_variants'].update(file_info['variant_ids'])
                    
                    # Try to load file to get building-variant mapping
                    if file_info.get('path') and file_info['type'] == 'parquet':
                        try:
                            df = pd.read_parquet(file_info['path'])
                            if 'building_id' in df.columns and 'variant_id' in df.columns:
                                for _, row in df[['building_id', 'variant_id']].drop_duplicates().iterrows():
                                    variant_analysis['building_variant_mapping'][row['building_id']].add(row['variant_id'])
                        except:
                            pass
        
        variant_analysis['unique_variants'] = list(variant_analysis['unique_variants'])
        variant_analysis['building_variant_mapping'] = dict(variant_analysis['building_variant_mapping'])
        
        return variant_analysis
    
    def save_results(self, results: Dict, output_dir: Path = None):
        """Save all results."""
        if output_dir is None:
            output_dir = self.base_parsed_dir / 'data_structure_check'
        
        output_dir.mkdir(exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save detailed report
        df_report = self.create_detailed_report(results)
        df_report.to_csv(output_dir / f'structure_check_{timestamp}.csv', index=False)
        df_report.to_excel(output_dir / f'structure_check_{timestamp}.xlsx', index=False)
        
        # Save JSON results
        with open(output_dir / f'structure_check_results_{timestamp}.json', 'w') as f:
            json.dump(self._make_json_serializable(results), f, indent=2)
        
        # Save variant analysis if applicable
        if results.get('modified'):
            variant_analysis = self.analyze_variant_tracking(results)
            with open(output_dir / f'variant_analysis_{timestamp}.json', 'w') as f:
                json.dump(self._make_json_serializable(variant_analysis), f, indent=2)
        
        # Create markdown report
        self.create_markdown_report(output_dir / f'report_{timestamp}.md', results)
        
        print(f"\n{self.COLORS['GREEN']}Results saved to: {output_dir}{self.COLORS['END']}")
        
        return output_dir
    
    def create_markdown_report(self, output_path: Path, results: Dict):
        """Create comprehensive markdown report."""
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("# Parser Data Structure Check Report\n\n")
            f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            
            # Base results
            if 'base' in results:
                f.write("## Base Parsed Data\n\n")
                self._write_directory_markdown(f, results['base'])
            
            # Modified results
            if results.get('modified'):
                f.write("\n## Modified Parsed Data\n\n")
                self._write_directory_markdown(f, results['modified'])
                
                # Variant analysis
                variant_analysis = self.analyze_variant_tracking(results)
                if variant_analysis['has_variant_tracking']:
                    f.write("\n### Variant Tracking Analysis\n\n")
                    f.write(f"- **Variant tracking enabled:** ✓\n")
                    f.write(f"- **Unique variants:** {len(variant_analysis['unique_variants'])}\n")
                    f.write(f"- **Files with variants:** {len(variant_analysis['files_with_variants'])}\n")
                    
                    if variant_analysis['unique_variants']:
                        f.write("\n**Variants found:**\n")
                        for variant in sorted(variant_analysis['unique_variants']):
                            f.write(f"- {variant}\n")
            
            # Comparison
            if results.get('differences'):
                f.write("\n## Comparison Summary\n\n")
                for category, diffs in results['differences'].items():
                    if any([diffs['base_only'], diffs['modified_only'], diffs['both_missing']]):
                        f.write(f"\n### {category}\n\n")
                        
                        if diffs['base_only']:
                            f.write("**Files only in base:**\n")
                            for file in diffs['base_only']:
                                f.write(f"- {file}\n")
                        
                        if diffs['modified_only']:
                            f.write("\n**Files only in modified:**\n")
                            for file in diffs['modified_only']:
                                f.write(f"- {file}\n")
                        
                        if diffs['both_missing']:
                            f.write("\n**Files missing in both:**\n")
                            for file in diffs['both_missing']:
                                f.write(f"- {file}\n")
    
    def _write_directory_markdown(self, f, dir_results: Dict):
        """Write directory results to markdown."""
        f.write(f"**Path:** `{dir_results['path']}`\n\n")
        
        if not dir_results['exists']:
            f.write("⚠️ **Directory does not exist**\n\n")
            return
        
        summary = dir_results['summary']
        f.write(f"**Summary:** {summary['found']}/{summary['total_expected']} files found\n\n")
        
        if summary['critical_missing'] > 0:
            f.write(f"⚠️ **Critical files missing:** {summary['critical_missing']}\n\n")
        
        # Category table
        f.write("| Category | Status | Found | Total | Critical Missing |\n")
        f.write("|----------|--------|-------|-------|------------------|\n")
        
        for category, cat_results in dir_results['categories'].items():
            found = cat_results['found']
            total = len(cat_results['files'])
            critical = cat_results['critical_missing']
            
            if critical > 0:
                status = "❌"
            elif found == total:
                status = "✅"
            else:
                status = "⚠️"
            
            f.write(f"| {category} | {status} | {found} | {total} | {critical} |\n")
    
    def _make_json_serializable(self, obj):
        """Convert non-serializable objects for JSON."""
        if isinstance(obj, set):
            return list(obj)
        elif isinstance(obj, defaultdict):
            return dict(obj)
        elif isinstance(obj, Path):
            return str(obj)
        elif isinstance(obj, dict):
            return {k: self._make_json_serializable(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self._make_json_serializable(item) for item in obj]
        else:
            return obj

# Main function
def check_parser_data_enhanced(base_parsed_dir: str, modified_parsed_dir: str = None):
    """Run enhanced parser data structure check."""
    
    checker = EnhancedParserDataChecker(base_parsed_dir, modified_parsed_dir)
    
    # Run comprehensive check
    if modified_parsed_dir:
        print(f"\nChecking both base and modified parsed data...")
        results = checker.compare_base_and_modified()
    else:
        print(f"\nChecking base parsed data only...")
        results = {'base': checker.check_directory(checker.base_parsed_dir, 'base')}
    
    # Create visual summary
    checker.create_visual_summary(results)
    
    # Analyze variants if modified data exists
    if results.get('modified'):
        variant_analysis = checker.analyze_variant_tracking(results)
        if variant_analysis['has_variant_tracking']:
            print(f"\n{checker.COLORS['BOLD']}VARIANT TRACKING ANALYSIS{checker.COLORS['END']}")
            print("-"*50)
            print(f"Unique variants found: {len(variant_analysis['unique_variants'])}")
            print(f"Files with variant tracking: {len(variant_analysis['files_with_variants'])}")
            
            if variant_analysis['building_variant_mapping']:
                print(f"\nBuildings with variants: {len(variant_analysis['building_variant_mapping'])}")
                for building_id, variants in list(variant_analysis['building_variant_mapping'].items())[:5]:
                    print(f"  {building_id}: {', '.join(variants)}")
                if len(variant_analysis['building_variant_mapping']) > 5:
                    print(f"  ... and {len(variant_analysis['building_variant_mapping']) - 5} more buildings")
    
    # Save results
    output_dir = checker.save_results(results)
    
    # Create detailed report
    df_report = checker.create_detailed_report(results)
    
    return results, df_report

# Usage example
if __name__ == "__main__":
    # Your paths
    base_dir = r"D:\Documents\daily\E_Plus_2040_py\output\82e2b83f-5013-4270-8266-a37a67dbd4ff\parsed_data"
    modified_dir = r"D:\Documents\daily\E_Plus_2040_py\output\82e2b83f-5013-4270-8266-a37a67dbd4ff\parsed_modified_results"
    
    # Run the check
    results, df_report = check_parser_data_enhanced(base_dir, modified_dir)
    
    # Show critical issues
    print("\n" + "="*100)
    print("CRITICAL ISSUES SUMMARY")
    print("="*100)
    
    critical_df = df_report[df_report['Status'] == 'CRITICAL']
    if not critical_df.empty:
        print(critical_df[['Dataset', 'Category', 'File', 'Description']].to_string(index=False))
    else:
        print("No critical issues found!")


Checking both base and modified parsed data...

[1mPARSER DATA STRUCTURE CHECK - VISUAL SUMMARY[0m

[1mBASE PARSED DATA[0m
Path: D:\Documents\daily\E_Plus_2040_py\output\82e2b83f-5013-4270-8266-a37a67dbd4ff\parsed_data

Overall: 10/11 files found
[[92m█████████████████████████████████████████████[0m[91m▓▓▓▓[0m[93m░[0m]
Legend: [92m█ Found[0m | [91m▓ Critical Missing[0m | [93m░ Optional Missing[0m

Category Status:
  [92m✓[0m metadata             3/3 files
  [92m✓[0m idf_data             4/4 files
  [91m✗[0m sql_results          1/2 files
      [91m↳ Missing: *_2020.parquet[0m
  [92m✓[0m relationships        1/1 files
  [92m✓[0m output_validation    1/1 files

[95mExtra files found: 35[0m
  + parsing_summary.json
  + analysis_ready/feature_sets/extraction_statistics.parquet
  + analysis_ready/feature_sets/missing_variables_detail.parquet
  + analysis_ready/output_analysis/coverage_summary.json
  + idf_data/by_building/4136733_snapshot.parquet
  ... and 30 