# Create load data for the disaggregation of feeder-level cooling and heating predictions

This notebook does the following:
1. Import load data (from ResStock Parquet files in SMART-DS)
2. Filter the list of Parquet files down to the load profiles referenced in each feeder's `Loads.dss`
3. Create columns: total demand, cooling sum, heating, and non-cooling/heating load
4. Save as a dictionary, grouped by feeder (for disaggregation of feeder-level cooling and heating predictions)

### Import packages

In [1]:
import numpy as np
import pyarrow.parquet as pq
import joblib
import scipy
import pandas as pd
from pandas import DatetimeIndex
import time
import os
import re
import glob
import gc
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from datetime import datetime, timedelta
import seaborn as sns
import pytz

import yaml
import pprint

from src import input_ops
from src import model_ops
from src import aux_ops

### Load config file with scenarios and parameters 

In [2]:
# Load the YAML run configuration. Every parameter below is read from it, so
# this cell must run before any other cell that uses these names.
config_file_name = 'config1'
config_path = f"config/{config_file_name}.yaml"
config = input_ops.load_config(config_path)

# Lists of parameters to use for TGW df creation
TGW_scenario = config['TGW_scenario']
TGW_weather_years = config['TGW_weather_years']
prediction_model_str = config['prediction_model']
aggregation_level = config['aggregation_level']

# 'X_columns_set' holds the NAME of another config key; that key holds the
# actual list of feature columns.
X_columns = config[config['X_columns_set']]
CITY_REGIONS_TO_RUN = config['CITY_REGIONS_TO_RUN']  # iterated below as {city: [regions]}

input_data_dict_name = config['input_data_dict_name']
smart_ds_year = config['smart_ds_years'][0]  # only the first configured year is used
building_types = config["building_types"]

input_data_training_path = config['input_data_training_path']
start_month = config['start_month']
end_month = config['end_month']

## Initialize parameters for saving paths
Y_column = config['Y_column']
input_data_prediction_path = config['input_data_prediction_path']
output_path_prediction_str = config['output_data_prediction_path']

# Year-specific root folder under which the per-region joblib files are saved
smart_ds_load_path = config['smart_ds_load_path'] + f"/{smart_ds_year}"

### Create dictionary of measured building cooling and heating per feeder (from resstock parquet files)

In [2]:
def _read_feeder_load_names(loads_dss_path):
    """Return the load-profile names referenced in a feeder's Loads.dss file.

    Each line is searched for ``yearly=(res|com)_kw_<digits>``; every hit is
    collected as ``"<res|com>_<digits>"`` (e.g. "res_376" or "com_10111").

    Raises whatever ``open`` raises if the file cannot be read.
    """
    profile_pattern = re.compile(r"yearly=(res|com)_kw_(\d+)")
    names = set()
    with open(loads_dss_path, "r") as loads_file:
        for line in loads_file:
            match = profile_pattern.search(line)
            if match:
                names.add(f"{match.group(1)}_{match.group(2)}")
    return names


start_time = time.time()

# The check does not depend on city/region, so fail fast before any work is done.
if aggregation_level != 'feeder':
    raise ValueError("This code supports feeder-level aggregation only - check aggregation_level value in config file")

# Loop through all city, region, feeder, and building type combinations
for city, regions in CITY_REGIONS_TO_RUN.items():
    for region in regions:
        # One dictionary per region:
        #   {(smart_ds_year, city, region, feeder, building_type): {building_id: load_df}}
        measured_buildings_cool_heat_dict = {}

        ### Import and process smart-ds load data ###
        region_root = f'main_folder/SMART-DS/v1.0/{smart_ds_year}/{city}/{region}'
        parquet_data_path = f'{region_root}/load_data'  # path to parquet files
        region_directory = f"{region_root}/scenarios/base_timeseries/opendss/"  # path to where feeder folders are

        # Create a list of load_models (here: the feeders found under region_directory)
        load_models = input_ops.make_feeder_list(region_directory)
        print(f'......Creating data frame for {smart_ds_year} {city} {region} ......')

        # The load_data folder is the same for every feeder and building type in
        # the region, so list it once instead of once per (feeder, building type).
        region_parquet_files = sorted(f for f in os.listdir(parquet_data_path) if f.endswith(".parquet"))

        for load_model in load_models:  # for feeder in feeders list
            feeder_path_name = input_ops.add_feeder_upper_folder(load_model)
            feeder_path = f'{region_directory}{feeder_path_name}'  # region_directory already ends with "/"

            # Profiles referenced by this feeder. The result does not depend on
            # the building type, so Loads.dss is parsed once per feeder.
            valid_names = _read_feeder_load_names(feeder_path + "/Loads.dss")

            for building_type in building_types:
                print(f'......Creating data frames for {smart_ds_year} {city} {region} feeder {load_model} {building_type}......')

                # Keep only this building type's profiles that exist in the feeder;
                # skip to the next building type if none exist.
                parquet_files = [
                    f for f in region_parquet_files
                    if f.startswith(building_type) and os.path.splitext(f)[0] in valid_names
                ]
                if not parquet_files:
                    print(f"Feeder {feeder_path} skipped since no parquet files found in the given folder with the specified prefix.")
                    continue

                # Process every parquet file of this feeder / building type.
                # No extra 'res'/'com' prefix check is needed: valid_names only
                # ever contains names of the form "res_<digits>" or "com_<digits>".
                feeder_loads = {}
                for file in parquet_files:
                    parquet_file_path = os.path.join(parquet_data_path, file)

                    load_df = input_ops.get_parquet_load_data(parquet_data_path, parquet_file_path, start_month, end_month)
                    # NOTE(review): presumably this derives the cooling / heating /
                    # non-cooling-heating columns listed in the notebook intro - confirm in input_ops.
                    load_df = input_ops.convert_columns_to_CH_and_non_CH(load_df)
                    load_df = load_df.set_index('date_time')

                    building_id = os.path.splitext(file)[0]  # e.g. "res_376"
                    feeder_loads[building_id] = load_df

                measured_buildings_cool_heat_dict[(smart_ds_year, city, region, load_model, building_type)] = feeder_loads

        # Save joblib for city-region
        print(f'saving joblib for city {city} region {region}')
        input_data_region_dir = f'{smart_ds_load_path}/{city}/{region}/buildings'
        os.makedirs(input_data_region_dir, exist_ok=True)  # Ensure the directory exists
        joblib.dump(measured_buildings_cool_heat_dict, os.path.join(input_data_region_dir, "measured_buildings_cool_heat_dict.joblib"))

end_time = time.time()
print(f"Runtime for loading data: {(end_time - start_time) / 60:.2f} minutes")

### Load measured buildings cooling heating dict (single region)

In [1]:
# Select the single city / region whose saved dictionary should be inspected
city, region = 'GSO', 'rural'

# Rebuild the save location from the config (first configured SMART-DS year)
smart_ds_year = config['smart_ds_years'][0]
smart_ds_load_path = config['smart_ds_load_path'] + f"/{smart_ds_year}"
input_data_region_dir = f'{smart_ds_load_path}/{city}/{region}/buildings'

# Load the region's dictionary from disk.
# NOTE: joblib.load unpickles the file - only load files you created yourself.
measured_buildings_joblib_path = os.path.join(input_data_region_dir, "measured_buildings_cool_heat_dict.joblib")
measured_buildings_cool_heat_dict = joblib.load(measured_buildings_joblib_path)

# Display the keys of the loaded dictionary
measured_buildings_cool_heat_dict.keys()