## Test new parserr changes

In [None]:
# %% [markdown]
# # EnergyPlus Selective Parsing Test
# This notebook tests the new selective parsing functionality

# %% Import required modules
import os
import sys
import json
import pandas as pd
from pathlib import Path
import importlib.util
import re
from datetime import datetime
import time
import traceback
import warnings
import logging
from typing import Union, List, Dict, Optional, Any, Tuple, Set  # Add typing imports


# Add the project directory to Python path.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel session.
project_dir = r"D:\Documents\daily\E_Plus_2040_py"
if project_dir not in sys.path:
    sys.path.insert(0, project_dir)

# Import the parsing modules
from parserr.energyplus_analyzer_main import EnergyPlusAnalyzer
from parserr.helpers import prepare_selective_file_pairs, get_parsed_data_info

# %% Set up paths
job_id = "cd66cc37-c691-4635-83a0-686a3716c8d5"

# All paths used by the cells below hang off <project_dir>/output/<job_id>.
job_output_dir = os.path.join(project_dir, "output", job_id)
idf_dir, sql_dir = (
    os.path.join(job_output_dir, subfolder)
    for subfolder in ("output_IDFs", "Sim_Results")
)

# Quick sanity report so a wrong job_id is obvious before any parsing starts.
print(f"Job output directory: {job_output_dir}")
print(f"IDF directory exists: {os.path.exists(idf_dir)}")
print(f"SQL directory exists: {os.path.exists(sql_dir)}")

# %% List available files
# Bind idf_files unconditionally: later cells test `if idf_files:` and would
# raise NameError if the IDF directory were missing and the name never set.
idf_files = []

print("\nAvailable IDF files:")
if os.path.exists(idf_dir):
    idf_files = [f for f in os.listdir(idf_dir) if f.endswith('.idf')]
    for f in idf_files[:5]:  # Show first 5
        print(f"  - {f}")
    if len(idf_files) > 5:
        print(f"  ... and {len(idf_files) - 5} more")

print("\nAvailable SQL files:")
if os.path.exists(sql_dir):
    # Walk the whole results tree and show up to 5 SQL files per folder,
    # labelled with the folder path relative to sql_dir.
    for root, dirs, files in os.walk(sql_dir):
        sql_files = [f for f in files if f.endswith('.sql')]
        if sql_files:
            print(f"  In {os.path.relpath(root, sql_dir)}:")
            for f in sql_files[:5]:
                print(f"    - {f}")

# %% Test 1: Parse everything (simplest config)
print("\n" + "="*60)
print("TEST 1: Parse Everything")
print("="*60)

# Minimal configuration; kept for reference, the call below passes the
# equivalent values explicitly.
config_all = {"perform_parsing": True, "parse_mode": "all"}

# Create analyzer writing into its own per-test output folder
test1_output_dir = os.path.join(job_output_dir, "parsed_data_test1")
analyzer = EnergyPlusAnalyzer(test1_output_dir)

# Prepare file pairs: no building filter, both IDF and SQL requested
test1_pair_request = {
    "job_output_dir": job_output_dir,
    "parse_mode": "all",
    "parse_types": {"idf": True, "sql": True},
    "building_selection": {},
    "idf_map_csv": os.path.join(job_output_dir, "extracted_idf_buildings.csv"),
}
file_pairs = prepare_selective_file_pairs(**test1_pair_request)

print(f"Found {len(file_pairs)} file pairs")

# Uncomment to run (this will parse everything)
# analyzer.analyze_project_selective(file_pairs)

# %% Test 2: Parse single building
print("\n" + "="*60)
print("TEST 2: Parse Single Building")
print("="*60)

# Use the first listed IDF file and pull the numeric building ID out of a
# "building_<digits>" fragment in its name. `re` is already imported at the
# top of the file, so no local import is needed here.
first_idf = idf_files[0] if idf_files else None
match = re.search(r'building_(\d+)', first_idf) if first_idf else None

if first_idf is None:
    # Make the skip visible instead of silently doing nothing.
    print("No IDF files available; skipping single-building test")
elif match is None:
    print(f"Could not extract a building ID from '{first_idf}'; skipping single-building test")
else:
    building_id = int(match.group(1))
    print(f"Parsing building ID: {building_id}")

    config_single = {
        "parse_mode": "selective",
        "building_selection": {
            "mode": "specific",
            "building_ids": [building_id]
        }
    }

    analyzer2 = EnergyPlusAnalyzer(os.path.join(job_output_dir, "parsed_data_test2"))

    file_pairs = prepare_selective_file_pairs(
        job_output_dir=job_output_dir,
        parse_mode="selective",
        parse_types={"idf": True, "sql": True},
        building_selection=config_single["building_selection"],
        idf_map_csv=os.path.join(job_output_dir, "extracted_idf_buildings.csv")
    )

    print(f"Found {len(file_pairs)} file pairs for building {building_id}")

    # Uncomment to run
    # analyzer2.analyze_project_selective(file_pairs)

# %% Test 3: Parse only geometry
print("\n" + "="*60)
print("TEST 3: Parse Only Geometry")
print("="*60)

# IDF content settings: "categories_only" mode with the single category "geometry".
geometry_idf_content = {"mode": "categories_only", "categories": ["geometry"]}

# SQL content settings: variables chosen by category, again only "geometry".
geometry_sql_content = {
    "mode": "selective",
    "variables": {"mode": "categories", "categories": ["geometry"]},
}

config_geometry = {
    "idf_content": geometry_idf_content,
    "sql_content": geometry_sql_content,
}

test3_output_dir = os.path.join(job_output_dir, "parsed_data_test3")
analyzer3 = EnergyPlusAnalyzer(test3_output_dir)

# File selection is unrestricted here; the content configs above are only
# used by the (commented-out) analyze call at the end of this cell.
test3_pair_request = {
    "job_output_dir": job_output_dir,
    "parse_mode": "all",
    "parse_types": {"idf": True, "sql": True},
    "building_selection": {},
    "idf_map_csv": os.path.join(job_output_dir, "extracted_idf_buildings.csv"),
}
file_pairs = prepare_selective_file_pairs(**test3_pair_request)

print(f"Will parse geometry data from {len(file_pairs)} file pairs")

# Uncomment to run
# analyzer3.analyze_project_selective(
#     file_pairs,
#     idf_content_config=config_geometry["idf_content"],
#     sql_content_config=config_geometry["sql_content"]
# )

# %% Test 4: Parse specific files directly
print("\n" + "="*60)
print("TEST 4: Parse Specific Files")
print("="*60)

# Select first 2 IDF files (slicing an empty list simply yields no files)
specific_idf_files = [os.path.join(idf_dir, f) for f in idf_files[:2]]

# Find the first SQL file. The generator is consumed lazily by next(), so the
# directory walk stops at the first match; the previous nested loop only broke
# out of the inner loop and kept walking every remaining folder.
first_sql_file = next(
    (
        os.path.join(root, f)
        for root, _dirs, files in os.walk(sql_dir)
        for f in files
        if f.endswith('.sql')
    ),
    None,
)
sql_files_found = [first_sql_file] if first_sql_file is not None else []

specific_sql_files = sql_files_found[:1]

print(f"Specific IDF files: {[os.path.basename(f) for f in specific_idf_files]}")
print(f"Specific SQL files: {[os.path.basename(f) for f in specific_sql_files]}")

config_specific = {
    "parse_mode": "specific_files",
    "building_selection": {
        "specific_files": {
            "idf": specific_idf_files,
            "sql": specific_sql_files
        }
    }
}

analyzer4 = EnergyPlusAnalyzer(os.path.join(job_output_dir, "parsed_data_test4"))

# No idf_map_csv is passed in this mode; the explicit file lists are supplied
# through building_selection instead.
file_pairs = prepare_selective_file_pairs(
    job_output_dir=job_output_dir,
    parse_mode="specific_files",
    parse_types={"idf": True, "sql": True},
    building_selection=config_specific["building_selection"]
)

print(f"Found {len(file_pairs)} file pairs from specific files")

# Uncomment to run
# analyzer4.analyze_project_selective(file_pairs)

# %% Test 5: Parse only energy variables
print("\n" + "="*60)
print("TEST 5: Parse Only Energy Variables")
print("="*60)

# SQL variable selection by wildcard pattern, limited to two reporting frequencies.
energy_sql_content = {
    "mode": "selective",
    "variables": {
        "mode": "pattern",
        "variable_patterns": ["*Energy*", "*Power*"],
    },
    "frequency_filter": ["Hourly", "Daily"],
}

config_energy = {
    "parse_types": {
        "idf": False,  # Skip IDF parsing
        "sql": True,
    },
    "sql_content": energy_sql_content,
}

test5_output_dir = os.path.join(job_output_dir, "parsed_data_test5")
analyzer5 = EnergyPlusAnalyzer(test5_output_dir)

# parse_types comes from the config so the IDF side is switched off here.
test5_pair_request = {
    "job_output_dir": job_output_dir,
    "parse_mode": "all",
    "parse_types": config_energy["parse_types"],
    "building_selection": {},
    "idf_map_csv": os.path.join(job_output_dir, "extracted_idf_buildings.csv"),
}
file_pairs = prepare_selective_file_pairs(**test5_pair_request)

print(f"Will parse energy variables from {len(file_pairs)} SQL files")

# Uncomment to run
# analyzer5.analyze_project_selective(
#     file_pairs,
#     sql_content_config=config_energy["sql_content"]
# )

# %% Test 6: Full example with all options
print("\n" + "="*60)
print("TEST 6: Full Configuration Example")
print("="*60)

# The full configuration is assembled from one sub-dictionary per concern so
# each part can be read on its own; the resulting structure and key order
# match a single nested literal.
full_building_selection = {
    "mode": "range",
    "building_range": {"start": 0, "end": 3},
}

full_idf_content = {
    "mode": "selective",
    "categories": ["geometry", "hvac", "lighting"],
    "exclude_objects": ["SCHEDULE:COMPACT"],
}

full_sql_content = {
    "mode": "selective",
    "variables": {
        "mode": "categories",
        "categories": ["energy", "comfort"],
    },
    "time_filter": {
        "months": [1, 7]  # January and July only
    },
    "frequency_filter": ["Daily"],
    "components": {
        "timeseries": True,
        "schedules": False,
        "summary_metrics": True,
    },
}

full_config = {
    "parsing": {
        "perform_parsing": True,
        "parse_mode": "selective",
        "parse_types": {"idf": True, "sql": True},
        "building_selection": full_building_selection,
        "idf_content": full_idf_content,
        "sql_content": full_sql_content,
        "output_options": {
            "save_format": "parquet",
            "create_summary": True,
        },
        "validation": {
            "validate_before_parsing": True,
            "continue_on_error": True,
        },
    }
}

# This shows how it would be used in the full system: echo a few key settings
print("Full configuration loaded:")
print(f"- Parse mode: {full_config['parsing']['parse_mode']}")
print(f"- Building range: {full_config['parsing']['building_selection']['building_range']}")
print(f"- IDF categories: {full_config['parsing']['idf_content']['categories']}")
print(f"- SQL categories: {full_config['parsing']['sql_content']['variables']['categories']}")

# %% Check results
print("\n" + "="*60)
print("CHECKING PARSED DATA")
print("="*60)

# Output folders written by tests 1-5 (each relative to job_output_dir)
test_dirs = [
    "parsed_data_test1", "parsed_data_test2", "parsed_data_test3",
    "parsed_data_test4", "parsed_data_test5",
]

for test_dir in test_dirs:
    test_path = os.path.join(job_output_dir, test_dir)
    if not os.path.exists(test_path):
        # Nothing on disk for this test; move on to the next folder.
        continue

    print(f"\n{test_dir}:")
    parsed_info = get_parsed_data_info(test_path)
    print(f"  Categories parsed: {len(parsed_info['categories'])}")
    print(f"  Total buildings: {parsed_info['total_buildings']}")
    print(f"  Total files: {parsed_info['total_files']}")

    # Show at most the first five categories with their row counts
    if not parsed_info['categories']:
        continue
    print("  Categories:")
    first_categories = list(parsed_info['categories'].items())[:5]
    for cat, info in first_categories:
        print(f"    - {cat}: {info['rows']} rows")

# %% Utility function to run any config
def run_parsing_with_config(config_dict, output_name):
    """
    Run a selective parse described by a configuration dictionary.

    Args:
        config_dict: Either a dictionary with a top-level "parsing" key that
            holds the parsing settings, or the settings dictionary itself.
        output_name: Folder name joined onto the module-level
            ``job_output_dir`` and passed to ``EnergyPlusAnalyzer``.

    Returns:
        The ``EnergyPlusAnalyzer`` instance, after
        ``analyze_project_selective`` has been called on it.
    """
    analyzer = EnergyPlusAnalyzer(os.path.join(job_output_dir, output_name))

    # Accept both {"parsing": {...}} and a bare settings dictionary.
    settings = config_dict["parsing"] if "parsing" in config_dict else config_dict

    # File selection: missing keys fall back to "parse everything".
    pair_request = {
        "job_output_dir": job_output_dir,
        "parse_mode": settings.get("parse_mode", "all"),
        "parse_types": settings.get("parse_types", {"idf": True, "sql": True}),
        "building_selection": settings.get("building_selection", {}),
        "idf_map_csv": os.path.join(job_output_dir, "extracted_idf_buildings.csv"),
    }
    file_pairs = prepare_selective_file_pairs(**pair_request)

    print(f"Found {len(file_pairs)} file pairs")

    # Map analyzer keyword -> settings key; absent settings are passed as None.
    option_keys = {
        "idf_content_config": "idf_content",
        "sql_content_config": "sql_content",
        "output_options": "output_options",
        "validation_options": "validation",
    }
    analyzer.analyze_project_selective(
        file_pairs,
        **{keyword: settings.get(key) for keyword, key in option_keys.items()},
    )

    return analyzer

# Example usage: run the Test 6 configuration for real. Unlike the cells
# above, this call is not commented out, so executing it starts the parse.
analyzer = run_parsing_with_config(
    config_dict=full_config,
    output_name="parsed_data_full_test",
)