## 1. Setup and Initialization

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os
warnings.filterwarnings('ignore')

# Set visualization style.
# The matplotlib style sheet is applied first and the seaborn 'husl' palette afterwards.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic (not plain Python): only valid inside a notebook / IPython session.
%matplotlib inline

# Display settings: no cap on printed columns, at most 100 printed rows, and
# floats rendered with two decimal places in pandas output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Confirmation message stamped with the time of this run
# (datetime.now() without a tz argument, i.e. a naive local timestamp).
print("Libraries loaded successfully")
print(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

In [None]:
# Relative locations of the raw dataset files used throughout the notebook
BASE_PATH = r'../data/raw/building-data-genome-project-2/data'
METADATA_PATH = os.path.join(BASE_PATH, 'metadata/metadata.csv')
WEATHER_PATH = os.path.join(BASE_PATH, 'weather/weather.csv')
METERS_PATH = os.path.join(BASE_PATH, 'meters/raw')

# Report, one line per location, whether it is present on disk
print("Checking data paths...")
for label, location in (
    ("Base path", BASE_PATH),
    ("Metadata", METADATA_PATH),
    ("Weather", WEATHER_PATH),
    ("Meters path", METERS_PATH),
):
    print(f"{label} exists: {os.path.exists(location)}")

## 2. Building Metadata Analysis

In [None]:
# Read the building metadata table and report its dimensions and columns
print("=" * 70, "LOADING BUILDING METADATA", "=" * 70, sep="\n")

metadata = pd.read_csv(METADATA_PATH)

n_rows, n_cols = metadata.shape
print(f"\nMetadata loaded: {n_rows} buildings, {n_cols} features")
print(f"\nColumns: {metadata.columns.tolist()}")
# Last expression of the cell: the notebook renders the first ten rows
metadata.head(10)

In [None]:
# Metadata overview: row count, per-column dtype / non-null report, memory footprint
print("=" * 70)
print("METADATA SUMMARY")
print("=" * 70)

print(f"\nTotal buildings: {len(metadata)}")
print("\nData types:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
metadata.info()
# deep=True also accounts for the contents of object (string) columns
print(f"\nMemory usage: {metadata.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
# Per-column gap report for the metadata table, plus a chart of the worst columns
print("=" * 70, "MISSING VALUES IN METADATA", "=" * 70, sep="\n")

# Count the nulls once and derive both the absolute and the relative figure from it
null_counts = metadata.isnull().sum()
missing_meta = pd.DataFrame({
    'Column': metadata.columns,
    'Missing': null_counts,
    'Percentage': (null_counts / len(metadata) * 100).round(2),
}).sort_values('Missing', ascending=False)

columns_with_gaps = missing_meta[missing_meta['Missing'] > 0]
print(columns_with_gaps.to_string(index=False))

# Horizontal bars for at most 15 columns; skipped entirely when nothing is missing
if missing_meta['Missing'].sum() > 0:
    top_missing = columns_with_gaps.head(15)
    plt.figure(figsize=(14, 6))
    plt.barh(top_missing['Column'], top_missing['Percentage'], color='coral', edgecolor='black')
    plt.xlabel('Missing Percentage (%)', fontsize=12)
    plt.ylabel('Column', fontsize=12)
    plt.title('Top Missing Values in Metadata', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

In [None]:
# Frequency of each primary space usage, shown as a ranked bar chart and a pie chart
print("=" * 70, "BUILDING TYPES ANALYSIS", "=" * 70, sep="\n")

if 'primaryspaceusage' in metadata.columns:
    building_types = metadata['primaryspaceusage'].value_counts()
    top_ten_types = building_types.head(10)
    top_fifteen_types = building_types.head(15)

    print(f"\nNumber of building types: {len(building_types)}")
    print("\nTop 10 building types:")
    print(top_ten_types)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: 15 most frequent types; the y-axis is inverted so the largest is on top
    top_fifteen_types.plot(kind='barh', ax=ax1, color='skyblue', edgecolor='black')
    ax1.set_title('Top 15 Building Types', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Count')
    ax1.set_ylabel('Building Type')
    ax1.invert_yaxis()

    # Right panel: shares among the 10 most frequent types only
    top_ten_types.plot(kind='pie', ax=ax2, autopct='%1.1f%%', startangle=90)
    ax2.set_title('Top 10 Building Types Distribution', fontsize=14, fontweight='bold')
    ax2.set_ylabel('')

    plt.tight_layout()
    plt.show()

In [None]:
# Number of buildings recorded for every site, as a table and as a bar chart
print("=" * 70, "SITES ANALYSIS", "=" * 70, sep="\n")

if 'site_id' in metadata.columns:
    site_counts = metadata['site_id'].value_counts()
    n_sites = len(site_counts)

    print(f"\nNumber of sites: {n_sites}")
    print("\nBuildings per site:")
    print(site_counts)

    # One bar per site, ordered by descending building count (value_counts order)
    plt.figure(figsize=(14, 6))
    site_counts.plot(kind='bar', color='lightcoral', edgecolor='black', alpha=0.8)
    plt.title('Number of Buildings per Site', fontsize=14, fontweight='bold')
    plt.xlabel('Site ID', fontsize=12)
    plt.ylabel('Number of Buildings', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# Distribution of the 'sqm' column (rows without a value are ignored)
print("=" * 70, "BUILDING SIZE DISTRIBUTION", "=" * 70, sep="\n")

if 'sqm' in metadata.columns:
    sqm_data = metadata['sqm'].dropna()
    # Central tendency, computed once and reused for the guide lines and legend
    sqm_mean = sqm_data.mean()
    sqm_median = sqm_data.median()

    print(f"\nBuildings with size data: {len(sqm_data)}")
    print("\nSize statistics (sqm):")
    print(sqm_data.describe())

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: histogram with mean / median guide lines
    ax1.hist(sqm_data, bins=50, color='lightgreen', edgecolor='black', alpha=0.7)
    ax1.axvline(sqm_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {sqm_mean:.0f}')
    ax1.axvline(sqm_median, color='blue', linestyle='--', linewidth=2, label=f'Median: {sqm_median:.0f}')
    ax1.set_title('Building Size Distribution', fontweight='bold')
    ax1.set_xlabel('Size (sqm)')
    ax1.set_ylabel('Frequency')
    ax1.legend()

    # Right panel: box plot of the same values
    box_style = dict(facecolor='lightgreen', alpha=0.7)
    median_style = dict(color='red', linewidth=2)
    ax2.boxplot(sqm_data, vert=True, patch_artist=True,
                boxprops=box_style, medianprops=median_style)
    ax2.set_title('Building Size Box Plot', fontweight='bold')
    ax2.set_ylabel('Size (sqm)')
    ax2.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# How many buildings report each meter type according to the metadata flags
print("=" * 70, "ENERGY METERS AVAILABILITY", "=" * 70, sep="\n")

meters = ['electricity', 'gas', 'hotwater', 'chilledwater', 'steam', 'water', 'irrigation', 'solar']
available_meters = [m for m in meters if m in metadata.columns]

# A building counts as having a meter when its metadata cell equals 'Yes'
meter_counts = {name: (metadata[name] == 'Yes').sum() for name in available_meters}
for meter, count in meter_counts.items():
    print(f"{meter.capitalize()}: {count} buildings ({count/len(metadata)*100:.1f}%)")

# Bar chart of the counts, in the order of the `meters` list
plt.figure(figsize=(12, 6))
plt.bar(meter_counts.keys(), meter_counts.values(), color='steelblue', edgecolor='black', alpha=0.8)
plt.title('Energy Meter Availability Across All Buildings', fontsize=14, fontweight='bold')
plt.xlabel('Meter Type', fontsize=12)
plt.ylabel('Number of Buildings', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## 3. Electricity Consumption Analysis

In [None]:
# Read the raw electricity table: first column becomes a parsed datetime index,
# every remaining column is one building
print("=" * 70, "LOADING ELECTRICITY DATA", "=" * 70, sep="\n")

electricity_path = os.path.join(METERS_PATH, 'electricity.csv')
electricity = pd.read_csv(electricity_path, index_col=0, parse_dates=True)

n_timestamps, n_meter_cols = electricity.shape
first_stamp = electricity.index.min()
last_stamp = electricity.index.max()
size_mb = electricity.memory_usage(deep=True).sum() / 1024**2

print("\nElectricity data loaded")
print(f"Shape: {n_timestamps:,} timestamps × {n_meter_cols} buildings")
print(f"Date range: {first_stamp} to {last_stamp}")
print(f"Total data points: {n_timestamps * n_meter_cols:,}")
print(f"Memory usage: {size_mb:.2f} MB")

In [None]:
# Electricity data quality: share of missing readings per building, bucketed
# into four quality tiers and shown as a histogram plus a tier bar chart.
print("=" * 70)
print("ELECTRICITY DATA QUALITY")
print("=" * 70)

# Percentage of NaN readings in each building column
missing_pct = (electricity.isna().sum() / len(electricity)) * 100

print("\nMissing data statistics:")
print(f"  Mean: {missing_pct.mean():.2f}%")
print(f"  Median: {missing_pct.median():.2f}%")
print(f"  Min: {missing_pct.min():.2f}%")
print(f"  Max: {missing_pct.max():.2f}%")

# Tiers are half-open intervals [lower, upper); exactly 50 % falls into "poor",
# which is why that tier is labelled ">=50%" rather than ">50%".
excellent = (missing_pct < 5).sum()
good = ((missing_pct >= 5) & (missing_pct < 20)).sum()
fair = ((missing_pct >= 20) & (missing_pct < 50)).sum()
poor = (missing_pct >= 50).sum()

print("\nData quality breakdown:")
print(f"  Excellent (<5% missing): {excellent} buildings")
print(f"  Good (5-20% missing): {good} buildings")
print(f"  Fair (20-50% missing): {fair} buildings")
print(f"  Poor (>=50% missing): {poor} buildings")

# Visualize
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Histogram of missing percentages with mean / median guide lines
ax1.hist(missing_pct, bins=50, color='salmon', edgecolor='black', alpha=0.7)
ax1.axvline(missing_pct.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {missing_pct.mean():.1f}%')
ax1.axvline(missing_pct.median(), color='blue', linestyle='--', linewidth=2, label=f'Median: {missing_pct.median():.1f}%')
ax1.set_title('Distribution of Missing Data', fontweight='bold', fontsize=14)
ax1.set_xlabel('Missing Percentage (%)')
ax1.set_ylabel('Number of Buildings')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)

# Bar chart of quality categories
categories = ['Excellent\n(<5%)', 'Good\n(5-20%)', 'Fair\n(20-50%)', 'Poor\n(>=50%)']
counts = [excellent, good, fair, poor]
colors_cat = ['green', 'yellowgreen', 'orange', 'red']
bars = ax2.bar(categories, counts, color=colors_cat, edgecolor='black', alpha=0.8)
ax2.set_title('Data Quality Categories', fontweight='bold', fontsize=14)
ax2.set_ylabel('Number of Buildings')
ax2.grid(axis='y', alpha=0.3)

# Annotate each bar with its count. bar_label pads in points, so the labels sit
# just above the bars whatever the y-axis scale; a fixed data-unit offset would
# drift far from the bars when the counts are small.
ax2.bar_label(bars, labels=[str(v) for v in counts], padding=3, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Rank the buildings whose electricity series is mostly complete
print("=" * 70, "BEST QUALITY BUILDINGS FOR ANALYSIS", "=" * 70, sep="\n")

# Keep buildings below the 20 % missing threshold, most complete first
below_threshold = missing_pct < 20
good_buildings = missing_pct[below_threshold].sort_values()

print(f"\nBuildings with <20% missing data: {len(good_buildings)}")
print("\nTop 20 buildings (by data quality):")
print(good_buildings.head(20))

In [None]:
# Descriptive statistics of the electricity table and the extreme consumers
print("=" * 70, "ELECTRICITY CONSUMPTION STATISTICS", "=" * 70, sep="\n")

print("\nOverall consumption statistics (kWh):")
print(electricity.describe())

# Per-building mean, largest first
mean_consumption = electricity.mean().sort_values(ascending=False)

# Head and tail of the ranking, printed with the same layout
for heading, ranking_slice in (
    ("Top 10 consumers (mean kWh)", mean_consumption.head(10)),
    ("Lowest 10 consumers (mean kWh)", mean_consumption.tail(10)),
):
    print(f"\n{heading}:")
    print(ranking_slice)

In [None]:
# Distribution of the per-building mean consumption: histogram and box plot
print("=" * 70, "CONSUMPTION PATTERNS VISUALIZATION", "=" * 70, sep="\n")

mean_by_building = electricity.mean()
# Buildings without any reading have a NaN mean and are left out of the plots
plotted_means = mean_by_building.dropna()
overall_mean = mean_by_building.mean()
overall_median = mean_by_building.median()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: histogram with mean / median guide lines
ax1.hist(plotted_means, bins=50, color='teal', edgecolor='black', alpha=0.7)
ax1.axvline(overall_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {overall_mean:.0f}')
ax1.axvline(overall_median, color='blue', linestyle='--', linewidth=2, label=f'Median: {overall_median:.0f}')
ax1.set_title('Mean Consumption Distribution', fontweight='bold', fontsize=14)
ax1.set_xlabel('Mean Consumption (kWh)')
ax1.set_ylabel('Number of Buildings')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)

# Right panel: box plot of the same values
box_style = dict(facecolor='teal', alpha=0.7)
median_style = dict(color='red', linewidth=2)
ax2.boxplot(plotted_means, vert=True, patch_artist=True,
            boxprops=box_style, medianprops=median_style)
ax2.set_title('Mean Consumption Box Plot', fontweight='bold', fontsize=14)
ax2.set_ylabel('Mean Consumption (kWh)')
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Sample building time series: one stacked panel per building for the (up to)
# five buildings with the least missing data.
print("=" * 70)
print("SAMPLE BUILDING TIME SERIES")
print("=" * 70)

# good_buildings is sorted by ascending missing share, so head(5) is the best five
sample_buildings = good_buildings.head(5).index.tolist()

print(f"\nSampling buildings: {sample_buildings}")

if not sample_buildings:
    # plt.subplots() rejects a zero-row grid, so skip plotting instead of raising
    print("No buildings meet the data quality threshold; skipping time series plots.")
else:
    # squeeze=False always yields a 2-D array of axes, so a single building
    # needs no special case; take the only column as a flat sequence.
    fig, axes = plt.subplots(len(sample_buildings), 1,
                             figsize=(16, 3 * len(sample_buildings)), squeeze=False)
    axes = axes[:, 0]

    for ax, building in zip(axes, sample_buildings):
        data = electricity[building].dropna()
        ax.plot(data.index, data.values, linewidth=0.5, alpha=0.7)
        ax.set_title(f'{building} - Electricity Consumption', fontweight='bold')
        ax.set_ylabel('kWh')
        ax.grid(alpha=0.3)

        # Summary statistics in the top-left corner (axes-relative coordinates)
        stats_text = f"Mean: {data.mean():.1f} | Std: {data.std():.1f} | Min: {data.min():.1f} | Max: {data.max():.1f}"
        ax.text(0.02, 0.95, stats_text, transform=ax.transAxes,
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
                verticalalignment='top')

    plt.tight_layout()
    plt.show()

## 4. Temporal Patterns Analysis

In [None]:
# Overall temporal patterns: average load by hour of day, per calendar day and
# per calendar month. Each series is the per-building mean averaged across
# buildings (axis=1).
print("=" * 70)
print("TEMPORAL PATTERNS - HOURLY, DAILY, MONTHLY")
print("=" * 70)

# Calculate mean consumption over time
hourly_mean = electricity.groupby(electricity.index.hour).mean().mean(axis=1)
daily_mean = electricity.resample('D').mean().mean(axis=1)
# NOTE(review): recent pandas releases prefer the 'ME' alias for month-end and
# warn on 'M'; left unchanged for compatibility -- confirm against the pinned version.
monthly_mean = electricity.resample('M').mean().mean(axis=1)

# Create temporal plots
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(16, 12))

# Hourly pattern (x axis: hour of day, 0-23)
ax1.plot(hourly_mean.index, hourly_mean.values, marker='o', linewidth=2, markersize=8, color='blue')
ax1.set_title('Average Hourly Consumption Pattern', fontweight='bold', fontsize=14)
ax1.set_xlabel('Hour of Day')
ax1.set_ylabel('Mean Consumption (kWh)')
ax1.set_xticks(range(24))
ax1.grid(alpha=0.3)
ax1.fill_between(hourly_mean.index, hourly_mean.values, alpha=0.3, color='blue')

# Daily pattern
ax2.plot(daily_mean.index, daily_mean.values, linewidth=1.5, color='green', alpha=0.8)
ax2.set_title('Daily Average Consumption', fontweight='bold', fontsize=14)
ax2.set_xlabel('Date')
ax2.set_ylabel('Mean Consumption (kWh)')
ax2.grid(alpha=0.3)

# Monthly pattern
ax3.plot(monthly_mean.index, monthly_mean.values, marker='o', linewidth=2, markersize=10, color='red')
ax3.set_title('Monthly Average Consumption', fontweight='bold', fontsize=14)
ax3.set_xlabel('Month')
ax3.set_ylabel('Mean Consumption (kWh)')
ax3.grid(alpha=0.3)
# The shaded area must use the same datetime x values as the line above.
# Integer positions (0..n-1) on a date axis are read as days since the epoch,
# which misplaces the fill and stretches the x-axis range.
ax3.fill_between(monthly_mean.index, monthly_mean.values, alpha=0.3, color='red')

plt.tight_layout()
plt.show()

print(f"\nPeak hour: {hourly_mean.idxmax()}:00")
print(f"Lowest hour: {hourly_mean.idxmin()}:00")
print(f"Peak month: {monthly_mean.idxmax().strftime('%B %Y')}")
print(f"Lowest month: {monthly_mean.idxmin().strftime('%B %Y')}")

In [None]:
# Average consumption for each weekday (index 0 = Monday ... 6 = Sunday)
print("=" * 70, "DAY OF WEEK PATTERNS", "=" * 70, sep="\n")

# Per-building weekday means, then averaged across buildings
dow_mean = electricity.groupby(electricity.index.dayofweek).mean().mean(axis=1)
dow_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

print("\nMean consumption by day of week:")
for idx, day in enumerate(dow_names):
    print(f"  {day}: {dow_mean[idx]:.2f} kWh")

# One distinct colour per weekday
weekday_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DFE6E9', '#74B9FF']

plt.figure(figsize=(12, 6))
plt.bar(dow_names, dow_mean.values, color=weekday_colors,
        edgecolor='black', alpha=0.8)
plt.title('Average Consumption by Day of Week', fontweight='bold', fontsize=14)
plt.xlabel('Day of Week')
plt.ylabel('Mean Consumption (kWh)')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

# Value labels sit 1 % of the tallest bar above each bar
for i, v in enumerate(dow_mean.values):
    plt.text(i, v + dow_mean.max()*0.01, f'{v:.1f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Weather Data Analysis

In [None]:
# Read the weather table and show its shape, columns and first rows
print("=" * 70, "LOADING WEATHER DATA", "=" * 70, sep="\n")

weather = pd.read_csv(WEATHER_PATH)

print("\nWeather data loaded")
print(f"Shape: {weather.shape}")
print(f"\nColumns: {weather.columns.tolist()}")
print("\nFirst few rows:")
# Last expression of the cell: the notebook renders the first ten rows
weather.head(10)

In [None]:
# Observation counts overall and per site, followed by descriptive statistics
print("=" * 70, "WEATHER DATA SUMMARY", "=" * 70, sep="\n")

n_observations = len(weather)
print(f"\nWeather observations: {n_observations:,}")

# The heading is always printed; the table only when the column exists
print("\nSites with weather data:")
if 'site_id' in weather.columns:
    print(weather['site_id'].value_counts())

print("\nWeather statistics:")
# Last expression of the cell: the notebook renders the describe() table
weather.describe()

In [None]:
# Weather variables analysis: histograms of the first six numeric weather
# columns on a fixed 2x3 grid.
print("=" * 70)
print("WEATHER VARIABLES DISTRIBUTION")
print("=" * 70)

# Select numeric weather columns; site_id is an identifier, not a measurement
weather_vars = weather.select_dtypes(include=[np.number]).columns.tolist()
if 'site_id' in weather_vars:
    weather_vars.remove('site_id')

print(f"\nWeather variables: {weather_vars}")

# Visualize distributions (at most six panels fit on the grid)
plot_vars = weather_vars[:6]
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for ax, var in zip(axes, plot_vars):
    data = weather[var].dropna()
    if data.empty:
        # Nothing to draw for an all-NaN column: hide the panel instead of
        # leaving an empty frame in the figure
        ax.set_visible(False)
        continue

    ax.hist(data, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'{var} Distribution', fontweight='bold')
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')
    ax.grid(axis='y', alpha=0.3)

    # Summary statistics in the top-right corner (axes-relative coordinates)
    stats_text = f"Mean: {data.mean():.1f}\nStd: {data.std():.1f}"
    ax.text(0.98, 0.97, stats_text, transform=ax.transAxes,
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
            verticalalignment='top', horizontalalignment='right')

# Hide grid cells that have no variable assigned (fewer than six numeric columns)
for ax in axes[len(plot_vars):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 6. Energy-Weather Correlation Analysis

In [None]:
# Section banner for the energy/weather correlation step.
# NOTE: this cell only prints status messages; no correlation is computed here.
banner = "=" * 70
print(banner)
print("ENERGY-WEATHER CORRELATION ANALYSIS")
print(banner)

for message in (
    "\n⏳ Preparing data for correlation analysis...",
    "This analysis helps identify weather impacts on energy consumption",
):
    print(message)

## 7. Multi-Meter Comparison

In [None]:
# Compare different meter types: for every CSV in METERS_PATH report how many
# building columns and timestamp rows it holds, then chart the building counts.
print("=" * 70)
print("MULTI-METER COMPARISON")
print("=" * 70)

# os.listdir() returns entries in arbitrary order; sort so the printed report
# and the bar chart are identical from run to run.
meter_files = sorted(f for f in os.listdir(METERS_PATH) if f.endswith('.csv'))
print(f"\nAvailable meter types: {meter_files}")

meter_summary = {}
for meter_file in meter_files:
    # splitext() strips only the trailing extension, whereas str.replace()
    # would also remove a '.csv' occurring in the middle of the name.
    meter_name = os.path.splitext(meter_file)[0]
    meter_path = os.path.join(METERS_PATH, meter_file)

    # Parse the header only: that is enough to count the building columns
    # (the first column is used as the index and is therefore not counted).
    meter_data = pd.read_csv(meter_path, index_col=0, nrows=0)

    # Count the data rows by streaming the file rather than loading it into pandas
    with open(meter_path, 'r') as f:
        row_count = sum(1 for _ in f) - 1  # Subtract header

    meter_summary[meter_name] = {
        'buildings': len(meter_data.columns),
        'timestamps': row_count
    }
    print(f"\n{meter_name.capitalize()}:")
    print(f"  Buildings: {meter_summary[meter_name]['buildings']}")
    print(f"  Timestamps: {meter_summary[meter_name]['timestamps']:,}")

# Visualize meter coverage
meter_buildings = [info['buildings'] for info in meter_summary.values()]

plt.figure(figsize=(12, 6))
plt.bar(meter_summary.keys(), meter_buildings, color='mediumpurple', edgecolor='black', alpha=0.8)
plt.title('Number of Buildings per Meter Type', fontweight='bold', fontsize=14)
plt.xlabel('Meter Type')
plt.ylabel('Number of Buildings')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

# Value labels sit 1 % of the tallest bar above each bar
for i, v in enumerate(meter_buildings):
    plt.text(i, v + max(meter_buildings)*0.01, str(v), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

## 8. Transfer Learning Candidate Selection

In [None]:
# List groups of well-covered buildings that share a usage type or a site
print("=" * 70, "TRANSFER LEARNING CANDIDATE ANALYSIS", "=" * 70, sep="\n")

# Column labels of the electricity table whose missing share is below 20 %
good_quality_buildings = missing_pct[missing_pct < 20].index.tolist()

print(f"\nBuildings with good data quality (<20% missing): {len(good_quality_buildings)}")

# Everything below needs the metadata rows of those buildings
if 'building_id' in metadata.columns:
    is_good = metadata['building_id'].isin(good_quality_buildings)
    good_buildings_meta = metadata[is_good]

    print("\nBuilding types in good quality subset:")
    if 'primaryspaceusage' in good_buildings_meta.columns:
        type_counts = good_buildings_meta['primaryspaceusage'].value_counts()
        print(type_counts)

        print("\nTRANSFER LEARNING OPPORTUNITIES:")

        # Usage types represented by at least two buildings
        same_type = type_counts[type_counts >= 2]
        if len(same_type) > 0:
            print("\n  Same-Type Transfer (within building type):")
            for building_type, count in same_type.items():
                of_this_type = good_buildings_meta['primaryspaceusage'] == building_type
                buildings = good_buildings_meta[of_this_type]['building_id'].tolist()
                print(f"     {building_type}: {count} buildings")
                # Show at most five ids; '...' marks a truncated list
                print(f"       Buildings: {buildings[:5]}" + ("..." if len(buildings) > 5 else ""))

        # More than one usage type present: the most frequent one is the suggested source
        if len(type_counts) > 1:
            most_common_type = type_counts.index[0]
            print("\n  Cross-Type Transfer (between building types):")
            print(f"     {len(type_counts)} different building types available")
            print(f"     Most common: {most_common_type} ({type_counts.iloc[0]} buildings)")
            print(f"     Can transfer from {most_common_type} to other types")

    # Sites represented by at least two buildings
    if 'site_id' in good_buildings_meta.columns:
        site_counts = good_buildings_meta['site_id'].value_counts()
        same_site = site_counts[site_counts >= 2]

        if len(same_site) > 0:
            print("\n  Same-Site Transfer (shared climate):")
            for site, count in same_site.items():
                at_this_site = good_buildings_meta['site_id'] == site
                buildings = good_buildings_meta[at_this_site]['building_id'].tolist()
                print(f"     Site {site}: {count} buildings")
                print(f"       Buildings: {buildings[:5]}" + ("..." if len(buildings) > 5 else ""))

## 9. Data Quality Recommendations

In [None]:
# Print the dataset overview followed by the numbered recommendation lists
print("=" * 70, "DATA QUALITY RECOMMENDATIONS", "=" * 70, sep="\n")

n_good_quality = len(good_quality_buildings)

print("\nDATASET OVERVIEW:")
print(f"   Total buildings in metadata: {len(metadata)}")
print(f"   Buildings with electricity data: {electricity.shape[1]}")
print(f"   Buildings with good quality (<20% missing): {n_good_quality}")

# Heading -> bullet texts; dict order is the print order
recommendations = {
    "1. DATA SELECTION": [
        f"Focus on {n_good_quality} buildings with <20% missing data",
        "Prioritize buildings with complete metadata (size, type, year)",
        "Consider buildings from multiple sites for weather diversity",
    ],
    "2. DATA PREPROCESSING": [
        "Handle missing values (interpolation for short gaps <3 hours)",
        "Remove outliers (values >10x the 95th percentile)",
        "Normalize consumption by building size if available",
        "Address extended zero periods (>72 hours)",
    ],
    "3. FEATURE ENGINEERING": [
        "Add temporal features (hour, day of week, month)",
        "Include cyclical encoding for time (sin/cos transformations)",
        "Merge weather data by site and timestamp",
        "Create lag features for past consumption",
    ],
    "4. TRANSFER LEARNING STRATEGY": [
        "Start with same-type transfer (e.g., Education → Education)",
        "Test same-site transfer to leverage climate similarity",
        "Experiment with cross-type transfer for generalization",
        "Use buildings with most data as source models",
    ],
    "5. VALIDATION": [
        "Use temporal split (not random) to preserve time series structure",
        "Reserve recent data for testing",
        "Compare against baseline models trained from scratch",
        "Evaluate on multiple target buildings to assess generalization",
    ],
}

print("\nRECOMMENDATIONS:")
for heading, bullets in recommendations.items():
    print(f"\n{heading}:")
    for bullet in bullets:
        print(f"   - {bullet}")

## 10. Export Summary Report

In [None]:
# Collect the headline figures of the analysis and write two CSV files
print("=" * 70, "CREATING SUMMARY REPORT", "=" * 70, sep="\n")

ts_count, bldg_count = electricity.shape
has_usage_column = 'primaryspaceusage' in metadata.columns
has_site_column = 'site_id' in metadata.columns

# Key order defines the column order of the one-row CSV
summary = {
    'Analysis Date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'Total Buildings (Metadata)': len(metadata),
    'Buildings with Electricity': bldg_count,
    'Electricity Timestamps': ts_count,
    'Date Range Start': str(electricity.index.min()),
    'Date Range End': str(electricity.index.max()),
    'Buildings Good Quality (<20% missing)': len(good_quality_buildings),
    'Buildings Excellent Quality (<5% missing)': excellent,
    'Mean Missing Percentage': f"{missing_pct.mean():.2f}%",
    'Building Types': metadata['primaryspaceusage'].nunique() if has_usage_column else 'N/A',
    'Sites': metadata['site_id'].nunique() if has_site_column else 'N/A',
    'Peak Hour': f"{hourly_mean.idxmax()}:00",
    'Lowest Hour': f"{hourly_mean.idxmin()}:00",
}

print("\nANALYSIS SUMMARY:")
for key, value in summary.items():
    print(f"   {key}: {value}")

# One-row summary table; the output directory is created when absent
os.makedirs('../results', exist_ok=True)
report_path = '../results/raw_data_analysis_summary.csv'
summary_df = pd.DataFrame([summary])
summary_df.to_csv(report_path, index=False)
print(f"\nSummary report saved to: {report_path}")

# Ids of the good-quality buildings with their missing share, most complete first
good_buildings_df = pd.DataFrame({
    'building_id': good_quality_buildings,
    'missing_percentage': [missing_pct[b] for b in good_quality_buildings],
})
good_buildings_df = good_buildings_df.sort_values('missing_percentage')
good_buildings_path = '../results/good_quality_buildings.csv'
good_buildings_df.to_csv(good_buildings_path, index=False)
print(f"Good quality buildings list saved to: {good_buildings_path}")

print("\n" + "=" * 70)
print("ANALYSIS COMPLETE!")
print("=" * 70)