## Parquet Explorer

In [2]:
import os
import pandas as pd
import random
from pathlib import Path
from datetime import datetime

# ---------------------------------------------------------------------------
# Sample a few random rows from every parquet file below `root_dir` and write
# them, together with a per-file summary, to a timestamped text report in the
# current working directory. A short preview of the report is printed at the end.
# ---------------------------------------------------------------------------

# Set the root directory path
root_dir = r"D:\Documents\daily\E_Plus_2040_py\output\49caf4d5-5dc1-411e-911b-462cf44bfb51\parsed_data\sql_results"

# Seed for the row sampling so that re-running the cell reproduces the same
# report. Set to None to draw a fresh sample on every run.
RANDOM_SEED = 42
MIN_SAMPLE_ROWS = 3      # lower bound of rows sampled per file
MAX_SAMPLE_ROWS = 4      # upper bound of rows sampled per file
PREVIEW_LINE_COUNT = 20  # number of report lines echoed at the end of the cell
SEPARATOR = '=' * 80

rng = random.Random(RANDOM_SEED)

# Initialize lists to store data
all_samples = []   # report text fragments, joined with newlines when written
file_info = []     # one dict per successfully sampled file
empty_files = []   # relative paths of files that contain zero rows
failed_files = []  # relative paths of files that raised while being read/sampled

# os.walk ignores scandir errors by default, so a wrong path would otherwise
# produce an empty report without any hint as to why.
if not os.path.isdir(root_dir):
    print(f"⚠ Root directory not found: {root_dir}")

# Walk through all directories and subdirectories
for dirpath, dirnames, filenames in os.walk(root_dir):
    # Sorting in place fixes the (top-down) traversal order, so the report
    # lists files in the same order on every run.
    dirnames.sort()
    for filename in sorted(filenames):
        if not filename.lower().endswith('.parquet'):
            continue

        file_path = os.path.join(dirpath, filename)
        relative_path = os.path.relpath(file_path, root_dir)

        try:
            # Read the parquet file
            df = pd.read_parquet(file_path)
            total_rows = len(df)

            if total_rows == 0:
                empty_files.append(relative_path)
                print(f"⚠ Skipped: {relative_path} (empty file)")
                continue

            # Randomly sample 3-4 rows, but never more than the file holds
            sample_size = min(rng.randint(MIN_SAMPLE_ROWS, MAX_SAMPLE_ROWS), total_rows)
            # Derive the per-file pandas seed from `rng` so one seed controls everything
            sampled_df = df.sample(n=sample_size, random_state=rng.randrange(2**32))
            sample_str = sampled_df.to_string()
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan,
            # but the failure is recorded in both the console and the report.
            failed_files.append(relative_path)
            print(f"✗ Error reading {relative_path}: {str(e)}")
            all_samples.extend([
                f"\n{SEPARATOR}",
                f"File: {relative_path}",
                f"ERROR: {str(e)}",
                f"{SEPARATOR}\n",
            ])
            continue

        # Section for this file: banner, file information, then the sampled rows
        all_samples.extend([
            f"\n{SEPARATOR}",
            f"File: {relative_path}",
            f"Total rows: {total_rows}, Sampled: {sample_size} rows",
            f"{SEPARATOR}\n",
            sample_str,
        ])

        # Store file info for summary
        file_info.append({
            'file': relative_path,
            'total_rows': total_rows,
            'sampled_rows': sample_size
        })

        print(f"✓ Processed: {relative_path} ({sample_size} rows sampled from {total_rows})")

# Save results to text file
# A single timestamp keeps the file name and the report header consistent.
generated_at = datetime.now()
output_filename = f"parquet_samples_{generated_at.strftime('%Y%m%d_%H%M%S')}.txt"
output_path = os.path.join(os.getcwd(), output_filename)

with open(output_path, 'w', encoding='utf-8') as f:
    # Write header
    f.write("Parquet Files Random Sample Report\n")
    f.write(f"Generated on: {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Root directory: {root_dir}\n")
    f.write(f"Total files processed: {len(file_info)}\n")
    f.write(f"Empty files skipped: {len(empty_files)}\n")
    f.write(f"Files with read errors: {len(failed_files)}\n")
    f.write(f"\n{SEPARATOR}\n")

    # Write all samples
    f.write('\n'.join(all_samples))

    # Write summary at the end
    f.write(f"\n\n{SEPARATOR}")
    f.write("\nSUMMARY")
    f.write(f"\n{SEPARATOR}\n")
    for info in file_info:
        f.write(f"{info['file']}: {info['sampled_rows']} rows sampled from {info['total_rows']} total\n")
    for skipped_path in empty_files:
        f.write(f"{skipped_path}: skipped (empty file)\n")
    for failed_path in failed_files:
        f.write(f"{failed_path}: ERROR (see details above)\n")

print(f"\n✅ Done! Results saved to: {output_path}")
print(f"📊 Total files processed: {len(file_info)}")

# Optional: Display first few lines of the output file
print("\n📄 Preview of output file:")
print("-" * 50)
lines = []
with open(output_path, 'r', encoding='utf-8') as f:
    # Stop reading as soon as enough lines are collected instead of loading
    # the whole report into memory.
    for line in f:
        if len(lines) >= PREVIEW_LINE_COUNT:
            break
        lines.append(line)
for line in lines:
    print(line.rstrip())

✓ Processed: schedules\all_schedules.parquet (4 rows sampled from 39)
✓ Processed: summary_metrics\building_metrics.parquet (3 rows sampled from 3)
✓ Processed: summary_metrics\zone_metrics.parquet (3 rows sampled from 3)
✓ Processed: timeseries\aggregated\daily\hvac_daily.parquet (3 rows sampled from 30375)
✓ Processed: timeseries\aggregated\daily\ventilation_daily.parquet (4 rows sampled from 46116)
✓ Processed: timeseries\aggregated\daily\zones_daily.parquet (3 rows sampled from 156282)
✓ Processed: timeseries\aggregated\monthly\hvac_monthly.parquet (4 rows sampled from 1079)
✓ Processed: timeseries\aggregated\monthly\ventilation_monthly.parquet (4 rows sampled from 1638)
✓ Processed: timeseries\aggregated\monthly\zones_monthly.parquet (3 rows sampled from 5551)
✓ Processed: timeseries\hourly\hvac_2013.parquet (3 rows sampled from 701895)
✓ Processed: timeseries\hourly\ventilation_2013.parquet (4 rows sampled from 1103760)
✓ Processed: timeseries\hourly\zones_2013.parquet (4 rows sa

In [None]:
# Check what's in the parsed_data directory
import os
from pathlib import Path
import pandas as pd

parsed_dir = Path(r"D:\Documents\daily\E_Plus_2040_py\output\da237aa0-dc6b-428b-b2f4-06d4e2905ea3\parsed_data")  # Update this path


def depth_below(base, current):
    """Return how many directory levels `current` lies below `base`.

    Args:
        base: Directory the depth is measured from (str or Path).
        current: `base` itself or a path inside it (str or Path).

    Returns:
        0 when `current` is `base`, 1 for a direct child, and so on.

    Raises:
        ValueError: If `current` is not located inside `base`.
    """
    # Counting path components is robust even when the text of `base`
    # happens to occur again further down in `current`.
    return len(Path(current).relative_to(base).parts)


def print_frame_overview(frame):
    """Print the column names, the shape and the first rows of `frame`."""
    print(f"Columns: {list(frame.columns)}")
    print(f"Shape: {frame.shape}")
    print(frame.head())


# List all parquet files
print("=== PARSED DATA STRUCTURE ===")
# os.walk ignores scandir errors by default, so report a wrong path explicitly.
if not parsed_dir.is_dir():
    print(f"Directory not found: {parsed_dir}")
for root, dirs, files in os.walk(parsed_dir):
    dirs.sort()  # in-place sort gives a stable top-down traversal order
    level = depth_below(parsed_dir, root)
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 2 * (level + 1)
    for file in sorted(files):
        if file.endswith('.parquet'):
            print(f"{subindent}{file}")

# Sample category data
print("\n=== SAMPLE CATEGORY DATA ===")
category_file = parsed_dir / "idf_data/by_category/outputs_all.parquet"
if category_file.exists():
    df = pd.read_parquet(category_file)
    print_frame_overview(df)
else:
    print(f"Not found: {category_file}")

# Sample timeseries data
print("\n=== SAMPLE TIMESERIES DATA ===")
ts_dir = parsed_dir / "sql_results/timeseries/hourly"
# glob order is not guaranteed; sorting makes "the first file" well defined.
ts_files = sorted(ts_dir.glob("*.parquet"))
if ts_files:
    df = pd.read_parquet(ts_files[0])
    print(f"File: {ts_files[0].name}")
    print_frame_overview(df)
else:
    print(f"No parquet files found in: {ts_dir}")

Creating test data in: test_validation_data

Generating EnergyPlus format data...
  Building 4136737
  Building 4136738
Saved: test_validation_data\measured_data_energyplus_format.csv (7288 rows)
Saved: test_validation_data\measured_data_energyplus_format.parquet

Creating validation configuration...
Created validation config: test_validation_data\validation_config_v2.json

DATA GENERATION SUMMARY

Sample data (first 10 rows):
   BuildingID                             VariableName    DateTime  \
0     4136737          Electricity:Facility [J](Daily)  2013-01-01   
1     4136737                     Electricity:Facility  2013-01-01   
2     4136737        Heating:EnergyTransfer [J](Daily)  2013-01-01   
3     4136737  Zone Air System Sensible Heating Energy  2013-01-01   
4     4136737        Cooling:EnergyTransfer [J](Daily)  2013-01-01   
5     4136737  Zone Air System Sensible Cooling Energy  2013-01-01   
6     4136737          Electricity:Facility [J](Daily)  2013-01-02   
7     413