# Process the Load Data for the NAERM Project


In [1]:
# Start by importing the packages we need:
import os
import glob
import datetime
import warnings

import pandas as pd
import numpy as np


## Set the Directory Structure

In [2]:
# Identify the TELL data input directory and the general data directory.
# NOTE: despite its name, "data_output_dir" is used later in this notebook both as the
# location of the raw GridView .csv file (input) and as the place the formatted loads are written (output).
tell_data_input_dir =  '/Users/burl878/Documents/Code/code_repos/naerm_heat_wave_loads/data/TELL_Data/'
data_output_dir =  '/Users/burl878/Documents/Code/code_repos/naerm_heat_wave_loads/data/'


## Suppress Future Warnings


In [3]:
# Ignore every warning of category FutureWarning from this point on (applies to the whole session):
warnings.simplefilter(action='ignore', category=FutureWarning)


## Create a Function to Process the 2035 GridView Data Used in Scaling


In [25]:
def process_gridview_data(data_input_dir: str):
    """
    Extract the per-BA total loads from the raw GridView load file.

    Reads '2021_1_Heatwave_Load_stress.csv' from `data_input_dir`, keeps the third-from-last
    row (the row this notebook uses as the annual total demand by BA), strips the 'Load_'
    prefix and '_2030.dat' suffix from the column names, and adds parent totals for the BAs
    that GridView splits into subregions (CISO, IPCO, PACE, and NEVP). Subregion entries are
    renamed to '<parent>_<subregion>' (e.g., 'CISO_CIPB').

    :param data_input_dir: Directory containing the raw GridView .csv file
                           (a trailing path separator is optional).
    :return: DataFrame with one row per BA and the columns 'BA' and 'Total_Load_MWh',
             sorted alphabetically by BA. The index holds the pre-sort row positions.
    :raises ValueError: If the file has fewer than three rows, so there is no total row to read.
    """
    # Subregion columns that are summed into (and renamed under) a parent BA:
    subregions = {'CISO': ['CIPB', 'CIPV', 'CISC', 'CISD', 'VEA'],
                  'IPCO': ['IPFE', 'IPMV', 'IPTV'],
                  'PACE': ['PAID', 'PAUT', 'PAWY'],
                  'NEVP': ['NEVP', 'SPPC']}

    # Read in the raw data .csv file (os.path.join tolerates a missing trailing slash):
    gv_df = pd.read_csv(os.path.join(data_input_dir, '2021_1_Heatwave_Load_stress.csv'))

    # Subset to just the annual total demand by BA (the third-from-last row):
    gv_df = gv_df.iloc[-3:-2].copy()
    if gv_df.empty:
        raise ValueError('The GridView file has fewer than three rows; cannot locate the total load row.')

    # Strip the unnecessary bits from the column names. The text is matched literally
    # (regex=False) so the "." in "_2030.dat" is never treated as a regex wildcard:
    gv_df.columns = gv_df.columns.str.replace("_2030.dat", "", regex=False)
    gv_df.columns = gv_df.columns.str.replace("Load_", "", regex=False)

    # Delete the index column and convert the values to floats:
    gv_df = gv_df.drop(columns="Index").astype('float64')

    # Compute the total loads for CISO, IPCO, PACE, and NEVP before renaming anything.
    # The NEVP total must be held aside because a subregion column is also called 'NEVP'.
    parent_totals = {}
    for parent, children in subregions.items():
        total = gv_df[children[0]]
        for child in children[1:]:
            total = total + gv_df[child]
        parent_totals[parent] = total.round(2)

    # Rename the subregion columns for consistency, then add the parent totals:
    gv_df = gv_df.rename(columns={child: f'{parent}_{child}'
                                  for parent, children in subregions.items()
                                  for child in children})
    for parent, total in parent_totals.items():
        gv_df[parent] = total

    # Turn the single remaining row into a two-column (BA, load) dataframe:
    gv_df = pd.DataFrame({'BA': gv_df.columns,
                          'Total_Load_MWh': gv_df.iloc[0].to_numpy()})

    # Sort the dataframe alphabetically by BA name:
    gv_df = gv_df.sort_values('BA')

    # Return the output dataframe:
    return gv_df


In [26]:
# Process the GridView file stored in the data directory and display the resulting dataframe:
gv_df = process_gridview_data(data_input_dir = data_output_dir)

gv_df


Unnamed: 0,BA,Total_Load_MWh
0,AESO,61600000.0
1,AVA,12500000.0
2,AZPS,30900000.0
3,BANC,17300000.0
4,BCHA,65100000.0
5,BPAT,55800000.0
6,CFE,14600000.0
7,CHPD,1856827.0
43,CISO,218597615.0
8,CISO_CIPB,40000000.0


## Create a Function to Aggregate the Raw TELL MLP Output into a Single Dataframe:


In [27]:
def aggregate_mlp_output_files(tell_data_input_dir: str, year_to_process: str):
    """
    Combine every TELL MLP output file for one year into a single dataframe.

    Looks for files matching '*_mlp_output.csv' in the `year_to_process` subdirectory of
    `tell_data_input_dir`, reads them in sorted filename order, renames the 'Load' column to
    'Hourly_Load_MWh', and stacks them row-wise. Values of -9999 anywhere in a file, and any
    load that is missing or not strictly positive, are replaced with NaN.

    :param tell_data_input_dir: Directory holding one subdirectory of MLP output files per year.
    :param year_to_process: Name of the year subdirectory to read (e.g., '2018').
    :return: DataFrame containing the rows of every file; each file keeps its own row index.
    :raises FileNotFoundError: If no MLP output files are found for the requested year.
    """
    # Create a sorted list of all of the MLP output files for the requested year:
    search_pattern = os.path.join(tell_data_input_dir, year_to_process, '*_mlp_output.csv')
    list_of_files = sorted(glob.glob(search_pattern))

    # Fail with a clear message instead of an unbound-variable error further down:
    if not list_of_files:
        raise FileNotFoundError('No MLP output files found matching: ' + search_pattern)

    # Loop over the list of MLP output files:
    frames = []
    for file_path in list_of_files:

        # Read in the .csv file and replace missing values with nan:
        mlp_data = pd.read_csv(file_path).replace(-9999, np.nan)

        # Rename the "Load" variable:
        mlp_data = mlp_data.rename(columns={'Load': 'Hourly_Load_MWh'})

        # Replace missing or non-positive loads with NaN. Series.where() upcasts an integer
        # column to float when needed, which an in-place .loc assignment of NaN does not do reliably:
        mlp_data['Hourly_Load_MWh'] = mlp_data['Hourly_Load_MWh'].where(mlp_data['Hourly_Load_MWh'] > 0)

        frames.append(mlp_data)

    # Aggregate the output with a single concatenation and return it:
    return pd.concat(frames)


## Create a Function to Scale the TELL Output Based on the GridView 2035 Values:


In [32]:
def scale_tell_loads(data_input_dir: str, tell_data_input_dir: str, year_to_process: str):
    """
    Rescale the hourly TELL loads so each BA's annual total equals the GridView total.

    :param data_input_dir: Directory containing the raw GridView .csv file.
    :param tell_data_input_dir: Directory holding the per-year TELL MLP output files.
    :param year_to_process: Year of TELL output to process, as a string (e.g., '2018').
    :return: DataFrame with the columns 'Hour' (hours since the start of the year, starting
             at 1), 'BA', and 'Load_MWh' (scaled hourly load rounded to 5 decimals).
    """
    # Gather the hourly TELL loads and the GridView annual totals:
    hourly_df = aggregate_mlp_output_files(tell_data_input_dir=tell_data_input_dir,
                                           year_to_process=year_to_process)
    annual_df = process_gridview_data(data_input_dir=data_input_dir)
    annual_df = annual_df.rename(columns={'Total_Load_MWh': 'GV_Total_Load_MWh'})

    # Attach the GridView total to every hourly row that shares a BA name:
    combined_df = hourly_df.merge(annual_df, on=['BA'])

    # Annual TELL total per BA, broadcast back onto each hourly row:
    tell_totals = combined_df.groupby('BA')['Hourly_Load_MWh'].transform('sum')
    combined_df['TELL_Total_Load_MWh'] = tell_totals

    # Scaling factor that forces the annual totals to agree, and the scaled hourly loads:
    combined_df['Scaling_Factor'] = combined_df['GV_Total_Load_MWh'] / combined_df['TELL_Total_Load_MWh']
    combined_df['Hourly_Load_MWh_Scaled'] = combined_df['Hourly_Load_MWh'] * combined_df['Scaling_Factor']

    # Hours elapsed since the first hour of the year (the first hour is hour 1):
    year_start = datetime.datetime(int(year_to_process), 1, 1, 0, 0, 0)
    elapsed = pd.to_datetime(combined_df['Time_UTC']) - year_start
    combined_df['Hour'] = (elapsed / np.timedelta64(1, 'h') + 1).astype(int)

    # Keep only the needed columns and discard rows with any missing value:
    keep_columns = ['Hour', 'BA', 'Hourly_Load_MWh_Scaled']
    result_df = combined_df[keep_columns].copy().dropna(how='any')

    # Give the load column its final name and round it to 5 decimals:
    result_df = result_df.rename(columns={'Hourly_Load_MWh_Scaled': 'Load_MWh'})
    result_df['Load_MWh'] = result_df['Load_MWh'].round(5)

    return result_df


In [33]:
# Scale the 2018 TELL loads against the GridView totals and display the resulting dataframe:
scaled_tell_df = scale_tell_loads(data_input_dir = data_output_dir,
                                  tell_data_input_dir = tell_data_input_dir, 
                                  year_to_process = '2018')

scaled_tell_df


Unnamed: 0,Hour,BA,Load_MWh
0,1,AVA,1793.38515
1,2,AVA,1868.03540
2,3,AVA,1931.12656
3,4,AVA,1979.16283
4,5,AVA,1947.53401
...,...,...,...
245275,8756,WAUW,128.84792
245276,8757,WAUW,127.96418
245277,8758,WAUW,126.84898
245278,8759,WAUW,125.57596


## Create a Function to Format the Output for Ingest to GridView:


In [69]:
def format_scaled_tell_loads(data_input_dir: str, tell_data_input_dir: str, year_to_process: str):
    """
    Format the scaled TELL loads into the wide layout of the raw GridView file and write it to disk.

    Steps:
      1. Scale the TELL loads and pivot them to one 'Load_<BA>_2030.dat' column per BA.
      2. Split the CISO, IPCO, PACE, and NEVP loads into their GridView subregions using each
         subregion's share of the parent's GridView total load.
      3. Insert a leading placeholder row ('Year' in the Index column, '2030' elsewhere).
      4. Copy over the GridView columns that TELL does not model
         (AESO, BCHA, CFE, TH_Malin, TH_Mead, TH_PV).
      5. Write 'TELL_Loads_2021_Based_on_<year_to_process>_Weather.csv' into `data_input_dir`.

    :param data_input_dir: Directory containing the raw GridView .csv file; the output file is
                           written to this same directory.
    :param tell_data_input_dir: Directory holding the per-year TELL MLP output files.
    :param year_to_process: Year of TELL output to process, as a string (e.g., '2018').
    :return: The formatted DataFrame that was written to disk.
    """
    # GridView subregions that each TELL BA is split into:
    subregions = {'CISO': ['CIPB', 'CIPV', 'CISC', 'CISD', 'VEA'],
                  'IPCO': ['IPFE', 'IPMV', 'IPTV'],
                  'NEVP': ['NEVP', 'SPPC'],
                  'PACE': ['PAID', 'PAUT', 'PAWY']}

    # GridView columns that are not modeled by TELL and are copied over unchanged:
    passthrough_columns = ['Load_AESO_2030.dat', 'Load_BCHA_2030.dat', 'Load_CFE_2030.dat',
                           'Load_TH_Malin_2030.dat', 'Load_TH_Mead_2030.dat', 'Load_TH_PV_2030.dat']

    # Scale the TELL loads. Use the directory passed to this function rather than a notebook global:
    scaled_tell_df = scale_tell_loads(data_input_dir = data_input_dir,
                                      tell_data_input_dir = tell_data_input_dir,
                                      year_to_process = year_to_process)

    # Process the GridView file and index the totals by BA name for easy lookup:
    gv_df = process_gridview_data(data_input_dir = data_input_dir)
    gv_totals = gv_df.set_index('BA')['Total_Load_MWh']

    # Reshape the dataframe to one column per BA and turn the hour index back into a column:
    output_df = scaled_tell_df.pivot(index = 'Hour', columns = 'BA', values = 'Load_MWh')
    output_df = output_df.reset_index(drop=False)

    # Add back in the text to the column headers and rename the time variable:
    output_df = output_df.add_suffix("_2030.dat").add_prefix("Load_")
    output_df = output_df.rename(columns={'Load_Hour_2030.dat': 'Index'})

    # Compute the loads for the subregions as (parent hourly load) * (subregion share of parent total).
    # They are collected first because the NEVP subregion reuses its parent's column name.
    subregion_loads = {}
    for parent, children in subregions.items():
        parent_column = f'Load_{parent}_2030.dat'
        for child in children:
            load_fraction = gv_totals[f'{parent}_{child}'] / gv_totals[parent]
            subregion_loads[f'Load_{child}_2030.dat'] = output_df[parent_column] * load_fraction

    # Drop the parent columns, then add the subregion columns:
    output_df = output_df.drop(columns=[f'Load_{parent}_2030.dat' for parent in subregions])
    for column, loads in subregion_loads.items():
        output_df[column] = loads

    # Add a leading row holding the year placeholder. The row is built separately and concatenated
    # so that text is never assigned into numeric columns in place:
    year_row = pd.DataFrame([['2030'] * output_df.shape[1]], columns=output_df.columns, dtype=object)
    year_row['Index'] = 'Year'
    output_df = pd.concat([year_row, output_df.astype(object)], ignore_index=True)

    # Read in the raw GridView .csv file and subset to the first 8761 rows
    # (presumably the year placeholder row plus 8760 hourly rows):
    raw_gv_df = pd.read_csv(os.path.join(data_input_dir, '2021_1_Heatwave_Load_stress.csv'))
    raw_gv_df = raw_gv_df.iloc[0:8761]

    # Merge in the GridView columns that aren't modeled by TELL (aligned on the row index):
    output_df = pd.concat([output_df, raw_gv_df[passthrough_columns]], axis=1)

    # Sort the data by column name, keeping the Index column as column one:
    load_columns = sorted(column for column in output_df.columns if column != 'Index')
    output_df = output_df[['Index'] + load_columns]

    # Set the output filename from the weather year (identical to the original name for '2018'):
    output_filename = f'TELL_Loads_2021_Based_on_{year_to_process}_Weather.csv'

    # Write out the dataframe to a .csv file:
    output_df.to_csv((os.path.join(data_input_dir, output_filename)), sep=',', index=False)

    # Return the output dataframe:
    return output_df


In [70]:
# Format the scaled 2018 TELL loads, write them to the data directory, and display the result:
output_df = format_scaled_tell_loads(data_input_dir = data_output_dir,
                                     tell_data_input_dir = tell_data_input_dir,
                                     year_to_process = '2018')

output_df


Unnamed: 0,Index,Load_AESO_2030.dat,Load_AVA_2030.dat,Load_AZPS_2030.dat,Load_BANC_2030.dat,Load_BCHA_2030.dat,Load_BPAT_2030.dat,Load_CFE_2030.dat,Load_CHPD_2030.dat,Load_CIPB_2030.dat,...,Load_TEPC_2030.dat,Load_TH_Malin_2030.dat,Load_TH_Mead_2030.dat,Load_TH_PV_2030.dat,Load_TIDC_2030.dat,Load_TPWR_2030.dat,Load_VEA_2030.dat,Load_WACM_2030.dat,Load_WALC_2030.dat,Load_WAUW_2030.dat
0,Year,2030,2030,2030,2030,2030,2030,2030,2030,2030,...,2030,2030,2030,2030,2030,2030,2030,2030,2030,2030
1,1,6775,1793.38515,2572.43842,1830.77881,7171,7308.67849,1172,324.65621,4031.826431,...,1500.0119,0,0,0,231.44973,720.98883,60.236999,2797.04952,923.74756,149.1425
2,2,6726,1868.0354,2919.65012,2004.96828,6957,7768.28311,1124,329.37622,4302.659818,...,1557.41924,0,0,0,269.87269,763.22227,64.283351,2829.0536,946.94449,148.78479
3,3,6670,1931.12656,3081.51259,2046.28018,6831,8017.83831,1093,321.64409,4472.092177,...,1545.58306,0,0,0,281.44243,781.85913,66.814734,2863.52379,942.66062,148.29032
4,4,6617,1979.16283,3163.97272,2146.01144,6806,8235.06969,1078,316.91373,4670.776339,...,1527.36207,0,0,0,287.91402,795.77869,69.78315,2815.86659,944.76943,146.25981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8756,8756,7427,1975.98954,3184.2508,1977.39676,10771,8242.46317,1540,324.03515,4287.859798,...,947.51055,0,0,0,308.49343,762.72731,64.062233,2759.09029,939.16066,128.84792
8757,8757,7238,1954.60888,3161.97107,1951.5995,10524,8112.40235,1522,310.8688,4279.094705,...,952.21515,0,0,0,306.32891,752.43411,63.93128,2761.61192,936.70958,127.96418
8758,8758,7089,1941.64523,3144.35852,1939.73417,10183,8032.85695,1445,303.70597,4290.220837,...,994.62191,0,0,0,306.69149,747.37337,64.097508,2753.75363,947.82776,126.84898
8759,8759,6948,1941.29148,3163.47985,1920.80207,9709,7967.84669,1363,298.56157,4335.968566,...,1078.61399,0,0,0,308.91095,743.33286,64.780996,2754.72105,968.26446,125.57596
