## Process SMART-DS load and ResStock weather data and prepare input data for regression model training

This notebook does the following:
1. Import weather and load data
2. Aggregate load data at the configured level (regional, feeder, or building)
3. Merge weather and load data and save the result as a dictionary in a joblib file

### Import Packages

In [1]:
import numpy as np
import pyarrow.parquet as pq
import joblib
import scipy
import pandas as pd
import time
import os
import re
import glob
from datetime import datetime, timedelta
import yaml
import pprint

from src import figure_ops
from src import input_ops
from src import model_ops

### Load config file with scenarios and parameters 

In [2]:
# Locate and parse the YAML configuration that drives this notebook
config_file_name = 'config1'
config_path = f"config/{config_file_name}.yaml"
config = input_ops.load_config(config_path)

# Directory under which the processed input data is written
input_data_training_path = config['input_data_training_path']

# Scenario selection: which years, cities/regions and building types to process
smart_ds_years = config['smart_ds_years']
CITY_REGIONS_TO_RUN = config['CITY_REGIONS_TO_RUN']
building_types = config['building_types']

# Level at which the load data is aggregated
aggregation_level = config['aggregation_level']

# Month window passed to the weather and load import helpers
start_month = config['start_month']
end_month = config['end_month']

### Import, process and save input data

In [1]:
# Import, process and save load and weather data (run only once per dataset, e.g., run again to add new regions or years)
start_time = time.time()

# # Define dictionary to store results
input_data_dict = {}

# Loop through all city, region, year, and building type combinations
for smart_ds_year in smart_ds_years:
    for city, regions in CITY_REGIONS_TO_RUN.items():
        for region in regions:
            ### Import and process resstock weather data ###
            # Load weather file based on city-reigon that matches ResStock county (FIPS) 
            match city:
                case 'SFO':
                    weather_file = f'G0600750_{smart_ds_year}.csv'  # San-Francisco county
                case 'AUS':
                    weather_file = f'G4800150_{smart_ds_year}.csv'  # Austin county
                case _:
                    weather_file = f'G3700810_{smart_ds_year}.csv'  # Greensboro Guilford county
            weather_data_path = f'main_folder/OpenDSS/raw_data/resstock_weather/amy{smart_ds_year}/{city}/{weather_file}'
            weather_df = input_ops.import_resstock_weather_data(weather_data_path, start_month, end_month)
            
            ### Import and process smart-ds load data ###
            parquet_data_path = f'main_folder/SMART-DS/v1.0/{smart_ds_year}/{city}/{region}/load_data' # path to parquet files
            region_directory = f"main_folder/SMART-DS/v1.0/{smart_ds_year}/{city}/{region}/scenarios/base_timeseries/opendss/" # path to where feeder folders are
            # Create a list of load_models (regional / feeders / buildings)   
            match aggregation_level:
                case 'regional':
                    load_models = ['regional']
                case 'feeder':
                    # load_models = find_feeder_folders(region_directory) # list of feeders in the format of '/substation/feeder/'
                    load_models = input_ops.make_feeder_list(region_directory) # list of feeders
                case _: # building
                    load_models = sorted([f for f in os.listdir(parquet_data_path) if f.endswith(".parquet")]) # list of buildings in the format of 'building_name.parquet' 
            print(f'......Creating data frame for {smart_ds_year} {city} {region} ......')
            for load_model in load_models:
                for building_type in building_types:
                    print(f'......Creating data frame for {smart_ds_year} {city} {region} {load_model} {building_type}......')
                    match aggregation_level:
                        case 'regional':
                            loads_dss_paths = input_ops.find_folders_with_file(region_directory, "Loads.dss")
                            load_df = input_ops.aggregate_parquet_data(parquet_data_path, loads_dss_paths, building_type, aggregation_level, start_month, end_month)  # 'res' for residential | 'com' for commercial
                        case 'feeder':
                            feeder_path_name = input_ops.add_feeder_upper_folder(load_model)
                            feeder_path = f'main_folder/SMART-DS/v1.0/{smart_ds_year}/{city}/{region}/scenarios/base_timeseries/opendss/{feeder_path_name}'
                            loads_dss_paths = [feeder_path]
                            load_df = input_ops.aggregate_parquet_data(parquet_data_path, loads_dss_paths, building_type, aggregation_level, start_month, end_month)  # 'res' for residential | 'com' for commercial
                        case _: # building
                            load_prefix = load_model.split('_')[0]
                            if load_prefix not in {'com', 'res'}:
                                continue
                            load_df = input_ops.get_parquet_load_data(parquet_data_path, load_model,start_month,end_month)
                    if load_df.empty:
                        print(f"load_df is empty, perhaps parquet files were not found in the given folder with the specified prefix.")
                        continue 

                    # Reset index in load df (since 'building_id' is set as an index and not date time as in weather df)
                    load_df = load_df.reset_index()
                    # Merge data
                    input_df = input_ops.merge_load_weather(load_df, weather_df)
                    input_df.rename(columns={'month_x': 'month'}, inplace=True)
                    # Store in dictionary with (year, city, region, load_model, building_type) as the key
                    input_data_dict[(smart_ds_year, city, region, load_model, building_type)] = input_df
            # Save dictionary of current region as joblib files
            if aggregation_level == 'building':
                input_data_region_dir = f'{input_data_training_path}/{city}/{region}/'
                if not os.path.exists(input_data_region_dir):
                    os.makedirs(input_data_region_dir, exist_ok=True)
                print(f'saving joblib for {city} {region}')
                joblib.dump(input_data_dict, os.path.join(input_data_region_dir, f"input_data_dict.joblib"))                  
                     
# Save dictionary of all regions as joblib files
if aggregation_level == 'regional' or aggregation_level == 'feeder':
    print('saving joblib for all regions')
    os.makedirs(input_data_training_path, exist_ok=True)     # Ensure the directory exists
    joblib.dump(input_data_dict, os.path.join(input_data_training_path, "input_data_dict.joblib")) # Save the file

end_time = time.time(); print(f"Runtime for loading data: {(end_time - start_time) / 60:.2f} minutes")