In [2]:
#!/usr/bin/env python3
"""
Diagnostic script to investigate sensitivity analysis data structure issues
"""

import pandas as pd
import numpy as np
from pathlib import Path
import json

def diagnose_sensitivity_data(job_output_dir):
    """Print a structural dump of the data feeding the sensitivity analysis.

    Six sections are written to stdout in order: the newest
    modification-tracking parquet, the base daily aggregates, the modified
    daily aggregates, a mock per-building delta table (including the
    ``building_id`` equality filter under investigation), the zone mappings,
    and a fixed summary of the sensitivity configuration.  Files and
    directories that do not exist are skipped without a message.

    Args:
        job_output_dir: Job output directory, as a string or ``Path``.

    Returns:
        None.
    """
    job_output_dir = Path(job_output_dir)
    daily_categories = ['hvac', 'energy', 'electricity', 'zones']

    print(f"Analyzing job output directory: {job_output_dir}")
    print("="*80)

    # --- Section 1: modification tracking -------------------------------
    print("\n1. MODIFICATION TRACKING DATA:")
    detail_files = list((job_output_dir / "modified_idfs").glob("modifications_detail_*.parquet"))
    if detail_files:
        # Only the most recently written detail file is inspected.
        newest_detail = max(detail_files, key=lambda candidate: candidate.stat().st_mtime)
        print(f"   Loading: {newest_detail}")
        modifications = pd.read_parquet(newest_detail)
        print(f"   Shape: {modifications.shape}")
        print(f"   Columns: {list(modifications.columns)}")
        print(f"   Building IDs: {sorted(modifications['building_id'].unique())}")
        print(f"   Data types:\n{modifications.dtypes}")
        print("\n   Sample (first 3 rows):")
        print(modifications.head(3))

    # --- Section 2: base simulation results -----------------------------
    print("\n\n2. BASE SIMULATION RESULTS:")
    base_path = job_output_dir / "parsed_data/sql_results/timeseries/aggregated/daily"
    if base_path.exists():
        for category in daily_categories:
            daily_file = base_path / f"{category}_daily.parquet"
            if not daily_file.exists():
                continue
            print(f"\n   {category}_daily.parquet:")
            daily = pd.read_parquet(daily_file)
            print(f"   Shape: {daily.shape}")
            # Wide frames only show their first ten column names.
            if len(daily.columns) > 10:
                print(f"   Columns: {list(daily.columns)[:10]}...")
            else:
                print(f"   Columns: {list(daily.columns)}")
            print(f"   Index: {daily.index.names}, dtype: {daily.index.dtype}")
            if 'building_id' in daily.columns:
                print(f"   Building IDs: {sorted(daily['building_id'].unique())}")
                print(f"   building_id dtype: {daily['building_id'].dtype}")
            print("   First few rows:")
            print(daily.head(2))

    # --- Section 3: modified simulation results -------------------------
    print("\n\n3. MODIFIED SIMULATION RESULTS:")
    mod_path = job_output_dir / "parsed_modified_results/sql_results/timeseries/aggregated/daily"
    if mod_path.exists():
        for category in daily_categories:
            daily_file = mod_path / f"{category}_daily.parquet"
            if not daily_file.exists():
                continue
            print(f"\n   {category}_daily.parquet:")
            daily = pd.read_parquet(daily_file)
            print(f"   Shape: {daily.shape}")
            print(f"   Index: {daily.index.names}, dtype: {daily.index.dtype}")
            if 'building_id' in daily.columns:
                print(f"   Building IDs: {sorted(daily['building_id'].unique())}")
                print(f"   building_id dtype: {daily['building_id'].dtype}")

    # --- Section 4: mock delta calculation ------------------------------
    print("\n\n4. SIMULATING OUTPUT DELTA CALCULATION:")
    try:
        base_hvac_path = base_path / "hvac_daily.parquet"
        mod_hvac_path = mod_path / "hvac_daily.parquet"

        if base_hvac_path.exists() and mod_hvac_path.exists():
            base_hvac = pd.read_parquet(base_hvac_path)
            mod_hvac = pd.read_parquet(mod_hvac_path)

            print(f"   Base HVAC shape: {base_hvac.shape}")
            print(f"   Modified HVAC shape: {mod_hvac.shape}")

            both_have_ids = (
                'building_id' in base_hvac.columns
                and 'building_id' in mod_hvac.columns
            )
            if both_have_ids:
                # One representative row per building on each side.
                base_by_building = base_hvac.groupby('building_id').first()
                mod_by_building = mod_hvac.groupby('building_id').first()

                print("\n   After grouping:")
                print(f"   Base grouped index: {base_by_building.index.name}, dtype: {base_by_building.index.dtype}")
                print(f"   Mod grouped index: {mod_by_building.index.name}, dtype: {mod_by_building.index.dtype}")

                # Placeholder delta for every building present in both runs.
                delta_records = [
                    {'building_id': bid, 'test_delta': 100.0}
                    for bid in base_by_building.index
                    if bid in mod_by_building.index
                ]

                if delta_records:
                    delta_df = pd.DataFrame(delta_records)
                    print("\n   Sample delta DataFrame:")
                    print(f"   Shape: {delta_df.shape}")
                    print(f"   Columns: {list(delta_df.columns)}")
                    print(f"   Index: {delta_df.index.name}, dtype: {delta_df.index.dtype}")
                    print(f"   building_id dtype: {delta_df['building_id'].dtype}")
                    print("   Data:")
                    print(delta_df)

                    # Reproduce the equality filter suspected of failing.
                    test_building_id = delta_df['building_id'].iloc[0]
                    print(f"\n   Testing comparison with building_id = {test_building_id} (type: {type(test_building_id)})")
                    try:
                        mask = delta_df['building_id'] == test_building_id
                        print(f"   Comparison successful! Mask shape: {mask.shape}")
                        filtered = delta_df[mask]
                        print(f"   Filtered shape: {filtered.shape}")
                    except Exception as e:
                        print(f"   ERROR during comparison: {e}")
                        print(f"   Delta df index: {delta_df.index}")
                        print(f"   Building_id series index: {delta_df['building_id'].index}")

    except Exception as e:
        print(f"   Error during delta simulation: {e}")
        import traceback
        traceback.print_exc()

    # --- Section 5: zone mappings ---------------------------------------
    print("\n\n5. ZONE MAPPINGS:")
    zone_map_path = job_output_dir / "parsed_data/relationships/zone_mappings.parquet"
    if zone_map_path.exists():
        zone_df = pd.read_parquet(zone_map_path)
        print(f"   Shape: {zone_df.shape}")
        print(f"   Columns: {list(zone_df.columns)}")
        print("   Sample:")
        print(zone_df.head(3))

    # --- Section 6: configuration (hard-coded text, not read from disk) --
    print("\n\n6. SENSITIVITY CONFIGURATION:")
    for config_line in (
        "   Multi-level analysis: Enabled",
        "   Cross-level analysis: Enabled",
        "   Output variables: Heating, Cooling, Electricity, Zone Air Temp, Zone Heating/Cooling",
        "   Method: elasticity",
        "   Aggregation: daily, sum",
    ):
        print(config_line)

# Script entry point: run the diagnostic against one hard-coded job directory.
if __name__ == "__main__":
    # Replace with your actual job output directory
    # (raw string so the Windows backslashes are not treated as escapes)
    job_dir = r"D:\Documents\daily\E_Plus_2040_py\output\bfbc7c27-d9df-4e32-b8fe-b627ed32189e"
    diagnose_sensitivity_data(job_dir)

Analyzing job output directory: D:\Documents\daily\E_Plus_2040_py\output\bfbc7c27-d9df-4e32-b8fe-b627ed32189e

1. MODIFICATION TRACKING DATA:
   Loading: D:\Documents\daily\E_Plus_2040_py\output\bfbc7c27-d9df-4e32-b8fe-b627ed32189e\modified_idfs\modifications_detail_20250623_174326.parquet
   Shape: (363, 10)
   Columns: ['building_id', 'variant_id', 'category', 'object_type', 'object_name', 'field_name', 'original_value', 'new_value', 'change_type', 'timestamp']
   Building IDs: ['4136733', '4136737', '4136738']
   Data types:
building_id       object
variant_id        object
category          object
object_type       object
object_name       object
field_name        object
original_value    object
new_value         object
change_type       object
timestamp         object
dtype: object

   Sample (first 3 rows):
  building_id variant_id  category object_type       object_name field_name  \
0     4136733  variant_0  lighting      LIGHTS  Lights_ALL_ZONES              
1     4136733  va

In [3]:
import os
import json
import pandas as pd
from pathlib import Path
import glob

def explore_directory_contents(base_path="output/2dae68d1-4b75-4c75-9471-7b6d931af865"):
    """Print an inventory and a content sample for each known output directory.

    For every expected sub-directory of ``base_path`` this prints the file
    count, the files grouped by extension (up to three names per extension)
    and a directory-specific preview: the first CSV for the sensitivity,
    parsed and validation directories, the first JSON for the validation
    directory, and the first 50 lines of the first IDF (blank and ``!``
    comment lines omitted) for the IDF directories.

    Directory entries are sorted by name, so the "first" file of each kind
    is the same on every run.

    Args:
        base_path: Job output directory that holds the sub-directories.

    Returns:
        None. Everything is written to stdout.
    """
    directories = {
        "modified_idfs": "Modified IDF files",
        "Modified_Sim_Results": "Modified simulation results",
        "output_IDFs": "Output IDF files",
        "parsed_data": "Parsed data from original simulations",
        "parsed_modified_results": "Parsed modified simulation results",
        "sensitivity_results": "Sensitivity analysis results",
        "Sim_Results": "Original simulation results",
        "validation_results": "Validation results"
    }

    for dir_name, description in directories.items():
        dir_path = Path(base_path) / dir_name
        print(f"\n{'='*60}")
        print(f"Directory: {dir_name}")
        print(f"Description: {description}")
        print(f"Path: {dir_path}")
        print(f"{'='*60}")

        if not dir_path.exists():
            print("Directory does not exist!")
            continue

        # Count regular files only; sub-directories are reported separately
        # so the file count matches the per-extension breakdown below.
        entries = sorted(dir_path.iterdir())
        files = [entry for entry in entries if entry.is_file()]
        subdir_count = sum(1 for entry in entries if entry.is_dir())
        print(f"\nNumber of files: {len(files)}")
        if subdir_count:
            print(f"Number of subdirectories: {subdir_count}")

        # Group file names by lower-cased extension
        file_types = {}
        for file in files:
            file_types.setdefault(file.suffix.lower(), []).append(file.name)

        print("\nFile types found:")
        for ext, file_list in file_types.items():
            print(f"  {ext or 'no extension'}: {len(file_list)} files")
            # Show first 3 files as examples
            for fname in file_list[:3]:
                print(f"    - {fname}")
            if len(file_list) > 3:
                print(f"    ... and {len(file_list) - 3} more")

        # Try to read and display sample content
        if dir_name == "sensitivity_results":
            print("\n--- SENSITIVITY RESULTS SAMPLE ---")
            csv_files = sorted(dir_path.glob("*.csv"))
            if csv_files:
                df = pd.read_csv(csv_files[0])
                print(f"\nFile: {csv_files[0].name}")
                print(f"Shape: {df.shape}")
                print(f"\nColumns: {list(df.columns)}")
                print("\nFirst 5 rows:")
                print(df.head())

                # Show unique values for key columns
                if 'parameter' in df.columns:
                    unique_params = df['parameter'].unique()
                    print(f"\nUnique parameters ({len(unique_params)}):")
                    for param in sorted(unique_params)[:10]:
                        print(f"  - {param}")
                    if len(unique_params) > 10:
                        print(f"  ... and {len(unique_params) - 10} more")

                if 'output_variable' in df.columns:
                    print("\nUnique output variables:")
                    for var in df['output_variable'].unique():
                        print(f"  - {var}")

        elif dir_name in ("parsed_data", "parsed_modified_results"):
            print(f"\n--- {dir_name.upper()} SAMPLE ---")
            csv_files = sorted(dir_path.glob("*.csv"))
            if csv_files:
                df = pd.read_csv(csv_files[0])
                print(f"\nFile: {csv_files[0].name}")
                print(f"Shape: {df.shape}")
                print(f"\nColumns: {list(df.columns)}")
                print("\nFirst 5 rows:")
                print(df.head())

                # Check if it has time data
                time_cols = [col for col in df.columns if 'time' in col.lower() or 'date' in col.lower()]
                if time_cols:
                    print(f"\nTime columns found: {time_cols}")

        elif dir_name in ("modified_idfs", "output_IDFs"):
            print(f"\n--- {dir_name.upper()} SAMPLE ---")
            idf_files = sorted(dir_path.glob("*.idf"))
            if idf_files:
                print(f"\nReading first 50 lines of: {idf_files[0].name}")
                # errors='replace' keeps the preview alive on files that are
                # not valid UTF-8; zip(range(50), ...) stops after 50 lines
                # instead of loading the whole file.
                with open(idf_files[0], 'r', encoding='utf-8', errors='replace') as f:
                    for i, line in zip(range(50), f):
                        stripped = line.strip()
                        if stripped and not stripped.startswith('!'):
                            print(f"{i:3d}: {line.rstrip()}")

        elif dir_name == "validation_results":
            print("\n--- VALIDATION RESULTS SAMPLE ---")
            # Try JSON files first
            json_files = sorted(dir_path.glob("*.json"))
            if json_files:
                with open(json_files[0], 'r', encoding='utf-8') as f:
                    data = json.load(f)
                print(f"\nFile: {json_files[0].name}")
                print("JSON structure:")
                print(json.dumps(data, indent=2)[:500] + "...")

            # Try CSV files
            csv_files = sorted(dir_path.glob("*.csv"))
            if csv_files:
                df = pd.read_csv(csv_files[0])
                print(f"\nFile: {csv_files[0].name}")
                print(f"Shape: {df.shape}")
                print(df.head())

def _format_score(value):
    """Return ``value`` in ``.2e`` notation, or ``str(value)`` if it is not numeric."""
    try:
        return f"{value:.2e}"
    except (TypeError, ValueError):
        # Raised for the 'N/A' placeholder, None and other non-numeric values.
        return str(value)


def analyze_parameter_structure(base_path="output/2dae68d1-4b75-4c75-9471-7b6d931af865"):
    """Group the parameters found in the sensitivity CSVs by keyword.

    Reads every ``*.csv`` in ``<base_path>/sensitivity_results``, collects
    the distinct values of the ``parameter`` column and keeps the first row
    seen for each as an example.  Each parameter goes into the first group
    whose keyword appears in its lower-cased name (materials, schedules,
    equipment, constructions, zones, otherwise ``other``).  Up to five
    parameters per group are printed with their example values.

    Example columns that are absent (``zone_name``, ``sensitivity_score``,
    ``p_value``) are shown as ``N/A``.  Rows without a parameter are ignored.

    Args:
        base_path: Job output directory containing ``sensitivity_results``.

    Returns:
        None. Results are written to stdout.
    """
    sensitivity_path = Path(base_path) / "sensitivity_results"
    if not sensitivity_path.exists():
        print("Sensitivity results directory not found!")
        return

    print("\n" + "="*60)
    print("PARAMETER STRUCTURE ANALYSIS")
    print("="*60)

    # Read all CSV files (sorted so the chosen example rows are reproducible)
    all_params = set()
    param_examples = {}

    for csv_file in sorted(sensitivity_path.glob("*.csv")):
        df = pd.read_csv(csv_file)
        if 'parameter' not in df.columns:
            continue
        # One pass over the first row per parameter, instead of one boolean
        # filter of the whole frame for every distinct parameter.
        first_rows = df.dropna(subset=['parameter']).drop_duplicates('parameter')
        for _, row in first_rows.iterrows():
            param = row['parameter']
            all_params.add(param)
            param_examples.setdefault(param, row.to_dict())

    # Analyze parameter patterns
    print(f"\nTotal unique parameters: {len(all_params)}")

    # Group parameters by pattern
    param_groups = {
        'materials': [],
        'schedules': [],
        'equipment': [],
        'constructions': [],
        'zones': [],
        'other': []
    }
    equipment_keywords = ('coil', 'fan', 'pump', 'chiller', 'boiler')

    # key=str / str(param) keep non-string parameter values from raising.
    for param in sorted(all_params, key=str):
        param_lower = str(param).lower()
        if 'material' in param_lower:
            param_groups['materials'].append(param)
        elif 'schedule' in param_lower:
            param_groups['schedules'].append(param)
        elif any(equip in param_lower for equip in equipment_keywords):
            param_groups['equipment'].append(param)
        elif 'construction' in param_lower:
            param_groups['constructions'].append(param)
        elif 'zone' in param_lower:
            param_groups['zones'].append(param)
        else:
            param_groups['other'].append(param)

    # Display grouped parameters
    for group, params in param_groups.items():
        if not params:
            continue
        print(f"\n{group.upper()} ({len(params)} parameters):")
        for param in params[:5]:
            print(f"  - {param}")
            example = param_examples.get(param)
            if example is not None:
                score_text = _format_score(example.get('sensitivity_score', 'N/A'))
                print(f"    Example: zone={example.get('zone_name', 'N/A')}, "
                      f"score={score_text}, "
                      f"p_value={example.get('p_value', 'N/A')}")
        if len(params) > 5:
            print(f"  ... and {len(params) - 5} more")

def analyze_time_structure(base_path="output/2dae68d1-4b75-4c75-9471-7b6d931af865"):
    """Report time-related columns in the first CSV under ``parsed_data``.

    Loads at most the first 1000 rows of the first ``*.csv`` returned by the
    directory glob, lists the columns whose lower-cased name contains one of
    ``time``, ``date``, ``month``, ``hour`` or ``day`` (with ten sample
    values and the distinct-value count for each), then reports how many
    ``float64``/``int64`` columns were loaded and the number of rows read.
    Only the banner is printed when the directory or a CSV is missing.

    Args:
        base_path: Job output directory containing ``parsed_data``.

    Returns:
        None.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TIME STRUCTURE ANALYSIS")
    print(rule)

    parsed_path = Path(base_path) / "parsed_data"
    if not parsed_path.exists():
        return

    candidates = list(parsed_path.glob("*.csv"))
    if not candidates:
        return

    sample_file = candidates[0]
    sample = pd.read_csv(sample_file, nrows=1000)  # cap the read at 1000 rows
    print(f"\nAnalyzing time structure in: {sample_file.name}")

    # Columns whose name hints at a time dimension
    keywords = ['time', 'date', 'month', 'hour', 'day']
    time_cols = []
    for col in sample.columns:
        lowered = col.lower()
        if any(keyword in lowered for keyword in keywords):
            time_cols.append(col)

    if time_cols:
        print(f"\nTime columns found: {time_cols}")
        for col in time_cols:
            series = sample[col]
            print(f"\n{col}:")
            print(f"  Sample values: {series.head(10).tolist()}")
            print(f"  Unique values: {series.nunique()}")

    # Numeric columns are candidate time series
    numeric_count = len(sample.select_dtypes(include=['float64', 'int64']).columns)
    if numeric_count > 0:
        print(f"\nNumeric columns that might be time series: {numeric_count}")
        print(f"Data shape suggests {len(sample)} time steps")

# Script entry point for this cell: runs the three explorers in sequence
# against one hard-coded job directory and frames the output with banners.
if __name__ == "__main__":
    # Update this path to match your actual output directory
    base_path = "output/2dae68d1-4b75-4c75-9471-7b6d931af865"
    
    print("ENERGYPLUS SENSITIVITY ANALYSIS OUTPUT EXPLORER")
    print("=" * 60)
    
    # Explore all directories
    explore_directory_contents(base_path)
    
    # Analyze parameter structure
    analyze_parameter_structure(base_path)
    
    # Analyze time structure
    analyze_time_structure(base_path)
    
    # Closing banner
    print("\n" + "="*60)
    print("ANALYSIS COMPLETE")
    print("="*60)

ENERGYPLUS SENSITIVITY ANALYSIS OUTPUT EXPLORER

Directory: modified_idfs
Description: Modified IDF files
Path: output\2dae68d1-4b75-4c75-9471-7b6d931af865\modified_idfs

Number of files: 12

File types found:
  .idf: 9 files
    - building_4136733_variant_0.idf
    - building_4136733_variant_1.idf
    - building_4136733_variant_2.idf
    ... and 6 more
  .parquet: 2 files
    - modifications_detail_20250623_182931.parquet
    - modifications_summary_20250623_182931.parquet
  .json: 1 files
    - modification_report_20250623_182931.json

--- MODIFIED_IDFS SAMPLE ---

Reading first 50 lines of: building_4136733_variant_0.idf
  7: Version,
  8:     22.2;  !- Version Identifier, Version Identifier
 10: SimulationControl,
 11:     Yes,  !- Do Zone Sizing Calculation, Do Zone Sizing Calculation
 12:     Yes,  !- Do System Sizing Calculation, Do System Sizing Calculation
 13:     Yes,  !- Do Plant Sizing Calculation, Do Plant Sizing Calculation
 14:     No,  !- Run Simulation for Sizing Peri

In [4]:
import os
import json
import pandas as pd
import sqlite3
from pathlib import Path
import glob
from collections import defaultdict

def analyze_parameter_structure(base_path):
    """Analyze how parameters are currently structured.

    Two things are printed:

    1. For every JSON file under ``<base_path>/parsed_data`` whose name
       contains ``param`` or ``extracted``: a short preview of each
       top-level key containing ``material``, ``construction`` or
       ``schedule`` (first match wins, up to three entries per key).
    2. The ``(category, parameter)`` pairs found in the first ten items of
       up to five JSON files under ``<base_path>/sensitivity_results``,
       grouped by category with up to five parameters each.

    File lists and category groups are sorted so repeated runs print the
    same thing.  Files that cannot be read or parsed are reported and
    skipped rather than aborting the analysis.

    Args:
        base_path: Job output directory.

    Returns:
        None. Results are written to stdout.
    """
    print("\n" + "="*80)
    print("PARAMETER STRUCTURE ANALYSIS")
    print("="*80)

    # Look for parameter files in parsed_data
    parsed_root = os.path.join(base_path, "parsed_data")
    param_files = glob.glob(os.path.join(parsed_root, "**/*param*.json"), recursive=True)
    param_files += glob.glob(os.path.join(parsed_root, "**/*extracted*.json"), recursive=True)
    # A file name can match both patterns; analyse each file only once.
    param_files = sorted(set(param_files))

    for param_file in param_files:
        print(f"\nAnalyzing: {param_file}")
        try:
            with open(param_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and decoding errors.
            print(f"  Error: {e}")
            continue

        if not isinstance(data, dict):
            continue

        for key, value in data.items():
            key_lower = key.lower()
            if 'material' in key_lower:
                print("\n  Material parameters found:")
                if isinstance(value, dict):
                    for mat_name, mat_props in list(value.items())[:3]:
                        summary = list(mat_props.keys()) if isinstance(mat_props, dict) else type(mat_props)
                        print(f"    {mat_name}: {summary}")

            elif 'construction' in key_lower:
                print("\n  Construction parameters found:")
                if isinstance(value, dict):
                    for const_name, const_props in list(value.items())[:3]:
                        print(f"    {const_name}: {const_props}")

            elif 'schedule' in key_lower:
                print("\n  Schedule parameters found:")
                print(f"    Type: {type(value)}, Length: {len(value) if hasattr(value, '__len__') else 'N/A'}")

    # Look for sensitivity results to see current parameter categorization
    sensitivity_files = sorted(
        glob.glob(os.path.join(base_path, "sensitivity_results", "**/*.json"), recursive=True)
    )

    print("\n\nCurrent Parameter Categories in Sensitivity Results:")
    unique_params = set()

    for sens_file in sensitivity_files[:5]:
        try:
            with open(sens_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if isinstance(data, list):
                for item in data[:10]:
                    if isinstance(item, dict) and 'parameter' in item:
                        # set.add raises TypeError when a value is unhashable
                        unique_params.add((item.get('category', 'unknown'), item['parameter']))
        except (OSError, ValueError, TypeError) as e:
            # Still best-effort, but say which file was skipped and why.
            print(f"  Skipping {sens_file}: {e}")

    # Group by category (sorted on string form so mixed types cannot raise)
    by_category = defaultdict(list)
    for category, param in sorted(unique_params, key=lambda pair: (str(pair[0]), str(pair[1]))):
        by_category[category].append(param)

    for category, params in by_category.items():
        print(f"\n  {category}:")
        for param in params[:5]:
            print(f"    {param}")

def analyze_time_resolution(base_path):
    """Analyze available time resolution in simulation results.

    Inspects the first 100 rows of up to five CSV files found under
    ``Sim_Results`` and ``Modified_Sim_Results``.  For each file it lists
    the columns whose lower-cased name contains a time keyword, prints the
    value range of numeric ones, and for the others attempts a datetime
    parse and reports the most common step between consecutive rows.

    The loaded frame is never modified, so the ``Date/Time`` samples printed
    afterwards are the raw values from the file.

    Args:
        base_path: Job output directory.

    Returns:
        dict mapping result-file path to the most common time step of that
        file.  When several columns of one file parse, the last one wins.
        Files without a parseable time column are absent.
    """
    print("\n" + "="*80)
    print("TIME RESOLUTION ANALYSIS")
    print("="*80)

    # Look for simulation result files
    result_files = glob.glob(os.path.join(base_path, "Sim_Results", "**/*.csv"), recursive=True)
    result_files += glob.glob(os.path.join(base_path, "Modified_Sim_Results", "**/*.csv"), recursive=True)

    time_keywords = ('time', 'date', 'hour', 'month', 'day', 'minute')
    time_resolutions = {}

    for result_file in result_files[:5]:
        print(f"\nAnalyzing: {os.path.basename(result_file)}")
        try:
            # Read first few rows to understand structure
            df = pd.read_csv(result_file, nrows=100)

            # Look for time columns
            time_cols = [col for col in df.columns if any(t in col.lower() for t in time_keywords)]

            if time_cols:
                print(f"  Time columns found: {time_cols}")

                for col in time_cols:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        print(f"    {col}: numeric, range {df[col].min()} - {df[col].max()}")
                        continue

                    print(f"    {col}: {df[col].dtype}, samples: {df[col].head(3).tolist()}")

                    # Try to parse as datetime; failure only means this
                    # column cannot provide a resolution.
                    try:
                        parsed = pd.to_datetime(df[col])
                        time_diff = parsed.diff().dropna()
                    except (ValueError, TypeError, OverflowError) as exc:
                        print(f"      Not parseable as datetime ({type(exc).__name__})")
                        continue

                    if len(time_diff) > 0:
                        common_diff = time_diff.mode()[0]
                        print(f"      Time resolution: {common_diff}")
                        time_resolutions[result_file] = common_diff

            # Check for EnergyPlus standard columns
            if 'Date/Time' in df.columns:
                print("  EnergyPlus Date/Time column found")
                sample = df['Date/Time'].head(5).tolist()
                print(f"    Samples: {sample}")

            # Show the first ten columns plus any zone/time column, and count
            # what was really printed so the "more" figure is exact.
            print(f"\n  All columns ({len(df.columns)}):")
            shown = 0
            for i, col in enumerate(df.columns):
                if i < 10 or 'zone' in col.lower() or 'time' in col.lower():
                    print(f"    {col}")
                    shown += 1
            hidden = len(df.columns) - shown
            if hidden > 0:
                print(f"    ... and {hidden} more columns")

        except Exception as e:
            # Best-effort per file: report and move on to the next one.
            print(f"  Error: {e}")

    return time_resolutions

def analyze_idf_parameters(base_path):
    """Analyze how parameters are modified in IDFs.

    Takes the first original IDF in ``<base_path>/output_IDFs`` (by sorted
    name) and looks through the first five IDFs in
    ``<base_path>/modified_idfs`` for one whose text differs from it.  For
    the first differing file it prints the size difference, any change in
    the number of ``Material`` objects, and the first object type among
    ``Material``, ``Construction`` and ``Schedule`` whose extracted objects
    differ.

    Files are compared by content, so an edit that leaves the length
    unchanged (for example ``0.5`` to ``0.7``) is still detected.

    NOTE(review): the original is compared with whichever modified files
    sort first; no pairing by building id is attempted - confirm this is
    the intended comparison.

    Args:
        base_path: Job output directory.

    Returns:
        None. Results are written to stdout.
    """
    print("\n" + "="*80)
    print("IDF PARAMETER MODIFICATION ANALYSIS")
    print("="*80)

    # Compare original and modified IDFs (sorted for reproducible output)
    original_idfs = sorted(glob.glob(os.path.join(base_path, "output_IDFs", "*.idf")))
    modified_idfs = sorted(glob.glob(os.path.join(base_path, "modified_idfs", "*.idf")))

    if not (original_idfs and modified_idfs):
        print("\nNo original and/or modified IDFs found - nothing to compare")
        return

    print(f"\nFound {len(original_idfs)} original and {len(modified_idfs)} modified IDFs")

    orig_file = original_idfs[0]
    print(f"\nComparing: {os.path.basename(orig_file)}")

    try:
        # errors='replace' keeps one stray byte from aborting the comparison.
        with open(orig_file, 'r', encoding='utf-8', errors='replace') as f:
            orig_content = f.read()

        # Find a modified version
        for mod_file in modified_idfs[:5]:
            with open(mod_file, 'r', encoding='utf-8', errors='replace') as f:
                mod_content = f.read()

            if mod_content == orig_content:
                continue

            print(f"\n  Modified file: {os.path.basename(mod_file)}")
            print(f"  Size difference: {len(mod_content) - len(orig_content)} characters")

            # Look for specific changes in materials
            orig_materials = extract_idf_objects(orig_content, 'Material')
            mod_materials = extract_idf_objects(mod_content, 'Material')

            if len(orig_materials) != len(mod_materials):
                print(f"  Material count: {len(orig_materials)} -> {len(mod_materials)}")

            # Report the first object type whose objects differ
            for obj_type in ['Material', 'Construction', 'Schedule']:
                orig_objs = extract_idf_objects(orig_content, obj_type)
                mod_objs = extract_idf_objects(mod_content, obj_type)
                if orig_objs != mod_objs:
                    print(f"\n  {obj_type} changes detected")
                    break

            # Only the first differing file is reported.
            break

    except OSError as e:
        print(f"  Error comparing: {e}")

def extract_idf_objects(content, object_type):
    """Extract objects of a specific type from IDF content.

    An object starts on a line beginning with ``<object_type>,`` (matched
    case-sensitively after stripping whitespace) and runs up to and
    including the first line whose code part ends with ``;``.  The code
    part is the text before any ``!`` comment, so a terminator followed by
    a field comment (``0.5;  !- Conductivity``) closes the object, and an
    object written entirely on its header line is closed immediately.

    Args:
        content: Full text of an IDF file.
        object_type: Object class name without the trailing comma, e.g.
            ``'Material'``.  Subclasses such as ``Material:NoMass`` do not
            match.

    Returns:
        list of str, one per object, each the object's whitespace-stripped
        lines (comments kept) joined with newlines.  An object that is
        never terminated is dropped.
    """
    header = object_type + ','
    objects = []
    current_object = None  # lines of the object being collected, if any

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if line.startswith(header):
            # A new header always starts a fresh object.
            current_object = [line]
        elif current_object is not None:
            current_object.append(line)
        else:
            continue

        # Ignore the trailing comment when looking for the terminator.
        code = line.split('!', 1)[0].rstrip()
        if code.endswith(';'):
            objects.append('\n'.join(current_object))
            current_object = None

    return objects

def summarize_findings(base_path):
    """Print the fixed checklist of findings and planned enhancements.

    The text is static; ``base_path`` is accepted so the call matches the
    other analysis steps but is not read.

    Args:
        base_path: Job output directory (unused).

    Returns:
        None.
    """
    rule = "=" * 80
    report = [
        "\n" + rule,
        "SUMMARY OF FINDINGS",
        rule,
        # Section 1
        "\n1. CURRENT PARAMETER STRUCTURE:",
        "   - Parameters appear to be grouped by high-level categories",
        "   - Need to extract specific sub-parameters like:",
        "     * Material properties: conductivity, density, specific heat",
        "     * Construction layers: individual layer properties",
        "     * Schedule values: time-specific values",
        # Section 2
        "\n2. TIME RESOLUTION AVAILABLE:",
        "   - Check simulation output files for hourly/sub-hourly data",
        "   - EnergyPlus typically provides hourly or sub-hourly resolution",
        "   - Need to identify time columns for filtering",
        # Section 3
        "\n3. REQUIRED ENHANCEMENTS:",
        "   a) Parameter Extraction:",
        "      - Parse IDF objects to extract individual properties",
        "      - Create mapping of high-level to sub-parameters",
        "      - Store parameter hierarchy in database",
        "   ",
        "   b) Time-Slice Analysis:",
        "      - Add time filtering to sensitivity calculation",
        "      - Define time periods (peak hours, months, day types)",
        "      - Store time-specific sensitivity results",
        # Section 4
        "\n4. FILES TO UPDATE:",
        "   - parameter_extractor.py: Add sub-parameter extraction",
        "   - sensitivity_analysis.py: Add time filtering",
        "   - database schema: Add time dimension to results",
        "   - API endpoints: Add time-slice query parameters",
    ]
    for text in report:
        print(text)

def main():
    """Run each analysis step against the hard-coded job output directory.

    The steps run in order: parameter structure, time resolution, IDF
    parameter comparison, summary.  Return values of the steps are not used.
    """
    base_path = "output/2dae68d1-4b75-4c75-9471-7b6d931af865"

    analysis_steps = (
        analyze_parameter_structure,
        analyze_time_resolution,
        analyze_idf_parameters,
        summarize_findings,
    )
    for step in analysis_steps:
        step(base_path)

    rule = "=" * 80
    print("\n" + rule)
    print("ANALYSIS COMPLETE - Please run and share the output!")
    print(rule)

# Script entry point: run all analyses only when executed directly.
if __name__ == "__main__":
    main()


PARAMETER STRUCTURE ANALYSIS


Current Parameter Categories in Sensitivity Results:

TIME RESOLUTION ANALYSIS

Analyzing: simulation_bldg0_4136733.csv
  Time columns found: ['Date/Time', 'Environment:Site Outdoor Air Drybulb Temperature [C](Monthly)', 'Environment:Site Outdoor Air Relative Humidity [%](Monthly)', 'Environment:Site Diffuse Solar Radiation Rate per Area [W/m2](Monthly)', 'ZONE1_FRONTPERIMETER:Zone Mean Radiant Temperature [C](Monthly)', 'ZONE1_RIGHTPERIMETER:Zone Mean Radiant Temperature [C](Monthly)', 'ZONE1_REARPERIMETER:Zone Mean Radiant Temperature [C](Monthly)', 'ZONE1_LEFTPERIMETER:Zone Mean Radiant Temperature [C](Monthly)', 'ZONE1_CORE:Zone Mean Radiant Temperature [C](Monthly)', 'ZONE2_FRONTPERIMETER:Zone Mean Radiant Temperature [C](Monthly)', 'ZONE2_RIGHTPERIMETER:Zone Mean Radiant Temperature [C](Monthly)', 'ZONE2_REARPERIMETER:Zone Mean Radiant Temperature [C](Monthly)', 'ZONE2_LEFTPERIMETER:Zone Mean Radiant Temperature [C](Monthly)', 'ZONE2_CORE:Zone Mean

  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])


In [6]:
import os
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq

def explore_parquet_files(root_dir):
    """
    Walk a directory tree and print a short report for each parquet file.

    For every file whose name ends in ".parquet" (case-insensitive) the
    report shows the path relative to the root, the shape, the column names
    and the values of the first row. A file that cannot be read is reported
    with its error message and the scan continues.

    Args:
        root_dir (str): Root directory path to start searching from
    """
    separator = "-" * 80
    base = Path(root_dir)

    # Nothing to scan when the root is missing
    if not base.exists():
        print(f"Error: Directory '{root_dir}' does not exist.")
        return

    print(f"Scanning directory: {root_dir}")
    print(separator)

    found = 0
    for folder, _subdirs, names in os.walk(root_dir):
        for name in names:
            if not name.lower().endswith('.parquet'):
                continue

            found += 1
            location = Path(folder) / name

            # Show the full path if it cannot be expressed relative to the root
            try:
                shown_path = location.relative_to(base)
            except ValueError:
                shown_path = location

            print(f"\nFile #{found}")
            print(f"Relative Path: {shown_path}")

            try:
                frame = pd.read_parquet(location, engine='pyarrow')
                row_total = len(frame)

                print(f"Shape: {frame.shape} (rows: {row_total}, columns: {len(frame.columns)})")
                print(f"Columns: {list(frame.columns)}")

                if row_total > 0:
                    print("\nFirst row data:")
                    for col, value in frame.iloc[0].items():
                        print(f"  {col}: {value}")
                else:
                    print("  (File is empty)")

            except Exception as e:
                print(f"  Error reading file: {str(e)}")

            print(separator)

    if found:
        print(f"\nTotal parquet files found: {found}")
    else:
        print("No parquet files found in the specified directory.")


def explore_parquet_files_detailed(root_dir, max_rows_preview=5):
    """
    Print size, parquet metadata, column dtypes and a row preview per file.

    All parquet files under the root are collected first, then reported one
    by one. An error while processing a file is printed and the loop moves
    on to the next file.

    Args:
        root_dir (str): Root directory path to start searching from
        max_rows_preview (int): Number of rows to preview (default: 5)
    """
    base = Path(root_dir)

    if not base.exists():
        print(f"Error: Directory '{root_dir}' does not exist.")
        return

    # Gather every matching path in walk order before printing anything
    matches = []
    for folder, _subdirs, names in os.walk(root_dir):
        matches.extend(
            Path(folder) / name
            for name in names
            if name.lower().endswith('.parquet')
        )
    parquet_files = [(path, path.relative_to(base)) for path in matches]

    total = len(parquet_files)
    if total == 0:
        print("No parquet files found.")
        return

    print(f"Found {total} parquet file(s)")
    print("=" * 80)

    for position, (location, shown_path) in enumerate(parquet_files, start=1):
        print(f"\n[{position}/{total}] {shown_path}")
        print("-" * 40)

        try:
            # Size on disk, converted from bytes to MB
            size_mb = location.stat().st_size / (1024 * 1024)
            print(f"File size: {size_mb:.2f} MB")

            # Row-group and row counts come from the parquet metadata
            meta = pq.ParquetFile(location).metadata

            print(f"Number of row groups: {meta.num_row_groups}")
            print(f"Total rows: {meta.num_rows}")

            # The file is loaded through pandas; only the leading rows are kept
            preview = pd.read_parquet(location, engine='pyarrow').head(max_rows_preview)

            print(f"\nColumn info:")
            for col in preview.columns:
                print(f"  - {col}: {preview[col].dtype}")

            shown_rows = min(len(preview), max_rows_preview)
            print(f"\nFirst {shown_rows} row(s):")
            print(preview.to_string())

        except Exception as e:
            print(f"Error processing file: {str(e)}")


# Main execution: the code below runs only when this is the main module.
if __name__ == "__main__":
    # Hard-coded job output directory to scan; the raw string keeps the
    # Windows backslashes literal.
    directory_path = r"D:\Documents\daily\E_Plus_2040_py\output\2dae68d1-4b75-4c75-9471-7b6d931af865"
    
    # Run the basic explorer (prints a short report per parquet file)
    print("=== BASIC PARQUET FILE EXPLORER ===\n")
    explore_parquet_files(directory_path)
    
    # Uncomment both lines below to additionally run the detailed explorer
    # after the basic one.
    # print("\n\n=== DETAILED PARQUET FILE EXPLORER ===\n")
    # explore_parquet_files_detailed(directory_path, max_rows_preview=3)

=== BASIC PARQUET FILE EXPLORER ===

Scanning directory: D:\Documents\daily\E_Plus_2040_py\output\2dae68d1-4b75-4c75-9471-7b6d931af865
--------------------------------------------------------------------------------

File #1
Relative Path: modified_idfs\modifications_detail_20250623_182931.parquet
Shape: (363, 10) (rows: 363, columns: 10)
Columns: ['building_id', 'variant_id', 'category', 'object_type', 'object_name', 'field_name', 'original_value', 'new_value', 'change_type', 'timestamp']

First row data:
  building_id: 4136733
  variant_id: variant_0
  category: lighting
  object_type: LIGHTS
  object_name: Lights_ALL_ZONES
  field_name: 
  original_value: 0.1
  new_value: 0.2
  change_type: absolute
  timestamp: 2025-06-23T18:29:31.991154
--------------------------------------------------------------------------------

File #2
Relative Path: modified_idfs\modifications_summary_20250623_182931.parquet
Shape: (9, 7) (rows: 9, columns: 7)
Columns: ['building_id', 'variant_id', 'success

In [9]:
import os
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq

def explore_parquet_files(root_dir):
    """
    Scan a directory tree and print shape and first-row details per parquet file.

    Each first-row value is printed together with the dtype of its column.
    Files that fail to load are reported with the error text and skipped.

    Args:
        root_dir (str): Root directory path to start searching from
    """
    root_path = Path(root_dir)

    # Bail out early when the root is missing
    if not root_path.exists():
        print(f"Error: Directory '{root_dir}' does not exist.")
        return

    def iter_parquet_paths():
        """Yield the path of every *.parquet file below root_dir, lazily."""
        for dirpath, _dirnames, filenames in os.walk(root_dir):
            for filename in filenames:
                if filename.lower().endswith('.parquet'):
                    yield Path(dirpath) / filename

    rule = "-" * 80
    print(f"Scanning directory: {root_dir}")
    print(rule)

    # Stays 0 when the generator yields nothing
    file_count = 0
    for file_count, full_path in enumerate(iter_parquet_paths(), start=1):
        # Prefer the path relative to the root; keep the full path otherwise
        try:
            relative_path = full_path.relative_to(root_path)
        except ValueError:
            relative_path = full_path

        print(f"\nFile #{file_count}")
        print(f"Relative Path: {relative_path}")

        try:
            df = pd.read_parquet(full_path, engine='pyarrow')

            print(f"Shape: {df.shape} (rows: {len(df)}, columns: {len(df.columns)})")

            if len(df) == 0:
                print("  (File is empty)")
            else:
                print("\nFirst row data:")
                for col, value in df.iloc[0].items():
                    print(f"  {col} ({df[col].dtype}): {value}")

        except Exception as e:
            print(f"  Error reading file: {str(e)}")

        print(rule)

    if file_count == 0:
        print("No parquet files found in the specified directory.")
        return

    print(f"\nTotal parquet files found: {file_count}")


def explore_parquet_files_detailed(root_dir, max_rows_preview=5):
    """
    Detailed report: file size, row-group/row counts, dtypes and a preview.

    Args:
        root_dir (str): Root directory path to start searching from
        max_rows_preview (int): Number of rows to preview (default: 5)
    """
    root_path = Path(root_dir)
    bytes_per_mb = 1024 * 1024

    if not root_path.exists():
        print(f"Error: Directory '{root_dir}' does not exist.")
        return

    # First pass: record (full path, path relative to root) for each match
    parquet_files = []
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        parent = Path(dirpath)
        for filename in filenames:
            if not filename.lower().endswith('.parquet'):
                continue
            candidate = parent / filename
            parquet_files.append((candidate, candidate.relative_to(root_path)))

    if not parquet_files:
        print("No parquet files found.")
        return

    count = len(parquet_files)
    print(f"Found {count} parquet file(s)")
    print("=" * 80)

    # Second pass: describe each file; a failure only affects that file
    for idx, entry in enumerate(parquet_files, 1):
        full_path, relative_path = entry
        print(f"\n[{idx}/{count}] {relative_path}")
        print("-" * 40)

        try:
            file_size = full_path.stat().st_size / bytes_per_mb
            print(f"File size: {file_size:.2f} MB")

            # Counts are taken from the parquet metadata
            metadata = pq.ParquetFile(full_path).metadata
            row_groups, row_total = metadata.num_row_groups, metadata.num_rows

            print(f"Number of row groups: {row_groups}")
            print(f"Total rows: {row_total}")

            # Load through pandas, then trim to the requested preview length
            loaded = pd.read_parquet(full_path, engine='pyarrow')
            df_preview = loaded.head(max_rows_preview)

            print(f"\nColumn info:")
            for col in df_preview.columns:
                dtype = df_preview[col].dtype
                print(f"  - {col}: {dtype}")

            print(f"\nFirst {min(len(df_preview), max_rows_preview)} row(s):")
            print(df_preview.to_string())

        except Exception as e:
            print(f"Error processing file: {str(e)}")


# Main execution: the code below runs only when this is the main module.
if __name__ == "__main__":
    # Hard-coded job output directory to scan; the raw string keeps the
    # Windows backslashes literal.
    directory_path = r"D:\Documents\daily\E_Plus_2040_py\output\2dae68d1-4b75-4c75-9471-7b6d931af865"

    # Set to True to also run the detailed explorer after the basic one.
    # A single flag keeps the section header and the call in sync, so the
    # header is never printed without the report that belongs under it.
    run_detailed = False

    # Run the basic explorer
    print("=== BASIC PARQUET FILE EXPLORER ===\n")
    explore_parquet_files(directory_path)

    if run_detailed:
        print("\n\n=== DETAILED PARQUET FILE EXPLORER ===\n")
        explore_parquet_files_detailed(directory_path, max_rows_preview=3)

=== BASIC PARQUET FILE EXPLORER ===

Scanning directory: D:\Documents\daily\E_Plus_2040_py\output\2dae68d1-4b75-4c75-9471-7b6d931af865
--------------------------------------------------------------------------------

File #1
Relative Path: modified_idfs\modifications_detail_20250623_182931.parquet
Shape: (363, 10) (rows: 363, columns: 10)

First row data:
  building_id (object): 4136733
  variant_id (object): variant_0
  category (object): lighting
  object_type (object): LIGHTS
  object_name (object): Lights_ALL_ZONES
  field_name (object): 
  original_value (object): 0.1
  new_value (object): 0.2
  change_type (object): absolute
  timestamp (object): 2025-06-23T18:29:31.991154
--------------------------------------------------------------------------------

File #2
Relative Path: modified_idfs\modifications_summary_20250623_182931.parquet
Shape: (9, 7) (rows: 9, columns: 7)

First row data:
  building_id (object): 4136733
  variant_id (object): variant_0
  success (bool): True
  outp