## Create ResStock end-use load dataframes at the desired aggregation level

This notebook does the following:
1. Import data (from ResStock Parquet files in SMART-DS)
2. Keep the data at the ResStock building level, or aggregate it at the SMART-DS feeder / regional level (each building type is added as many times as it appears in the phase loads of the feeder/region)
3. Add a cooling sum column (`cooling_kw_sum` = `cooling_kw` + `fans_kw` + `refrigeration_kw`)
4. Save the resulting dictionary of dataframes as a joblib file

### Import Packages

In [1]:
import numpy as np
import pyarrow.parquet as pq
import joblib
import scipy
import pandas as pd
import time
import os
import re
import glob
from datetime import datetime, timedelta
import yaml
import pprint

from src import figure_ops
from src import input_ops
from src import model_ops

### Load config file

In [None]:
# Base name (without extension) of the YAML file to read from the config/ folder.
config_file_name = 'config1'
config_path = f"config/{config_file_name}.yaml"

# Load the configuration through the project helper and echo it, keeping the
# keys in their stored order (sort_dicts=False) rather than sorted alphabetically.
config = input_ops.load_config(config_path)
pprint.pprint(config, sort_dicts=False)

### Import, process and save input data

In [1]:
# Import, process and save load and weather data (run only once per dataset, e.g., run again if want to add new regions or years)
start_time = time.time()

# # Define dictionary to store results
smartds_load_dict = {}
smart_ds_year = config['smart_ds_years'][0]

smart_ds_load_path = config['smart_ds_load_path'] + f"/{smart_ds_year}"
CITY_REGIONS_TO_RUN = config['CITY_REGIONS_TO_RUN']
aggregation_level = config['aggregation_level']
building_types = config['building_types']
start_month = config['start_month']
end_month = config['end_month']

columns_to_sum = ['cooling_kw', 'fans_kw', 'refrigeration_kw']

# Loop through all city, region, year, and building type combinations
for city, regions in CITY_REGIONS_TO_RUN.items():
    for region in regions:
        ### Import and process smart-ds load data ###
        parquet_data_path = f'main_folder/SMART-DS/v1.0/{smart_ds_year}/{city}/{region}/load_data' # path to parquet files
        region_directory = f"main_folder/SMART-DS/v1.0/{smart_ds_year}/{city}/{region}/scenarios/base_timeseries/opendss/" # path to where feeder folders are
        # Create a list of load_models (regional / feeders / buildings)   
        match aggregation_level:
            case 'regional':
                load_models = ['regional']
            case 'feeder':
                # load_models = find_feeder_folders(region_directory) # list of feeders in the format of '/substation/feeder/'
                load_models = input_ops.make_feeder_list(region_directory) # list of feeders
                print(f'...Loaded feeders in {smart_ds_year} {city} {region}: {load_models}')
            case _: # building
                load_models = sorted([f for f in os.listdir(parquet_data_path) if f.endswith(".parquet")]) # list of buildings in the format of 'building_name.parquet' 
                # print(f'...Loaded buildings in {smart_ds_year} {city} {region}: {load_models}')
        print(f'......Creating data frame for {smart_ds_year} {city} {region} ......')
        for load_model in load_models:
            for building_type in building_types:
                if aggregation_level == 'regional' or aggregation_level == 'feeder':
                    print(f'......Creating data frame for {smart_ds_year} {city} {region} {load_model} {building_type}......')
                    feeder_path_name = input_ops.add_feeder_upper_folder(load_model)
                    feeder_path = f'main_folder/SMART-DS/v1.0/{smart_ds_year}/{city}/{region}/scenarios/base_timeseries/opendss/{feeder_path_name}/Loads.dss'
                    load_df = input_ops.aggregate_parquet_data(parquet_data_path, feeder_path, building_type, aggregation_level, start_month, end_month)  # 'res' for residential | 'com' for commercial
                    if load_df.empty:
                        print(f"Skipped {feeder_path} since it didn't have parquet files found in the given folder with the specified prefix.")
                        continue 
                else:
                    load_prefix = load_model.split('_')[0]
                    if load_prefix not in {'com', 'res'}:
                        continue
                    load_df = input_ops.get_parquet_load_data(parquet_data_path, load_model,start_month,end_month)
                
                # Set index to date_time and remove month which gets strage values (temporary solution) 
                load_df = load_df.reset_index()
                load_df.index = load_df["date_time"] # Make date_time the index
                input_df = load_df 
                input_df = input_df.drop('month', axis=1)
                
                # Add Cooling sum
                # Ensure all columns exist in the dataframe before summing
                if all(col in input_df.columns for col in columns_to_sum):
                    # Check if 'cooling_kw_sum' already exists
                    if 'cooling_kw_sum' not in input_df.columns:
                        input_df['cooling_kw_sum'] = input_df[columns_to_sum].sum(axis=1)
                
                # Store in dictionary with (year, city, region, load_model, building_type) as the key
                smartds_load_dict[(smart_ds_year, city, region, load_model, building_type)] = input_df
        # Save dictionary of current region as joblib files
        if aggregation_level == 'building':
            input_data_region_dir = f'{smart_ds_load_path}/{city}/{region}/'
            if not os.path.exists(input_data_region_dir):
                os.makedirs(input_data_region_dir, exist_ok=True)
            print(f'saving joblib for {city} {region}')
            joblib.dump(smartds_load_dict, os.path.join(input_data_region_dir, f"smartds_load_dict.joblib"))                  
                     
# Save dictionary of all regions as joblib files
if aggregation_level == 'regional' or aggregation_level == 'feeder':
    print('saving joblib for all regions')
    input_data_region_dir = f'{smart_ds_load_path}/all_regions/'
    os.makedirs(input_data_region_dir, exist_ok=True)     # Ensure the directory exists
    joblib.dump(smartds_load_dict, os.path.join(input_data_region_dir, "smartds_load_dict.joblib")) # Save the file

end_time = time.time(); print(f"Runtime for loading data: {(end_time - start_time) / 60:.2f} minutes")