# Process the Load Data for the NTP Heat Wave Grid Stress Events


In [128]:
# Start by importing the packages we need:
import os
import glob
import datetime

import pandas as pd
import numpy as np


## Set the Directory Structure

In [4]:
# Identify the data input and image output directories:
tell_data_input_dir =  '/Users/burl878/Documents/Code/code_repos/ntp_heat_wave_loads/data/TELL_Data/'
data_output_dir =  '/Users/burl878/Documents/Code/code_repos/ntp_heat_wave_loads/data/'


## Create a Function to Process the 2035 GridView Data Used in Scaling


In [239]:
def process_gridview_data(data_input_dir: str):
    # Check to see if the processed output file exist and if not then create it:
    if os.path.isfile((os.path.join(data_input_dir, 'Processed_Gridview_2035_Data.csv'))) == False:
       
       # Read in the raw data .csv file:
       gv_df = pd.read_csv((data_input_dir + 'wecc_load_2035.csv'))

       # Subset to just the annual total demand by BA:
       gv_df = gv_df[-3:-2]
       
       # Rename a few columns for consistency:
       gv_df.rename(columns={'Load_CIPB_2030_CEC.dat': 'Load_CIPB_2030.dat', 
                             'Load_CIPV_2030_CEC.dat': 'Load_CIPV_2030.dat',
                             'Load_CISC_2030_CEC.dat': 'Load_CISC_2030.dat',
                             'Load_CISD_2030_CEC.dat': 'Load_CISD_2030.dat'}, inplace=True)

       # Strip the unecessary bits from the column names:
       gv_df.columns = gv_df.columns.str.strip('_2030.dat')
       gv_df.columns = gv_df.columns.str.lstrip('Load_')  
        
       # Delete the index and last column:
       del gv_df["Index"], gv_df["Unnamed: 38"]
    
       # Compute the total loads for CISO, IPCO, NEVP, and PACE:
       gv_df['CISO'] = (gv_df['CIPB'] + gv_df['CIPV'] + gv_df['CISC'] + gv_df['CISD'] + gv_df['VEA']).round(2)
       gv_df['IPCO'] = (gv_df['IPFE'] + gv_df['IPMV'] + gv_df['IPTV']).round(2)
       gv_df['PACE'] = (gv_df['PAID'] + gv_df['PAUT'] + gv_df['PAWY']).round(2)
       gv_df['NEVP_Sum'] = (gv_df['NEVP'] + gv_df['SPPC']).round(2)
           
       # Rename a few columns for consistency:
       gv_df.rename(columns={'CIPB': 'CISO_CIPB', 'CIPV': 'CISO_CIPV', 'CISC': 'CISO_CISC', 'CISD': 'CISO_CISD', 'VEA': 'CISO_VEA',
                             'IPFE': 'IPCO_IPFE', 'IPMV': 'IPCO_IPMV', 'IPTV': 'IPCO_IPTV',
                             'NEVP': 'NEVP_NEVP', 'SPPC': 'NEVP_SPPC',
                             'PAID': 'PACE_PAID', 'PAUT': 'PACE_PAUT', 'PAWY': 'PACE_PAWY'}, inplace=True) 
       gv_df.rename(columns={'NEVP_Sum': 'NEVP'}, inplace=True) 
    
       # Squeeze the dataframe:
       gv_df = gv_df.squeeze().to_frame()
        
       # Rename the columns:
       gv_df.reset_index(inplace=True)
       gv_df = gv_df.rename(columns = {'index':'BA'})
       gv_df.rename(columns={gv_df.columns[1]: "Total_Load_MWh" }, inplace = True)
       
       # Sort the dataframe alphabetically by BA name:
       gv_df = gv_df.sort_values('BA')
       
       # Write out the dataframe to a .csv file:
       gv_df.to_csv((os.path.join(data_input_dir, 'Processed_Gridview_2035_Data.csv')), sep=',', index=False)
    
    else:
       # Read in the already processed output file:
       gv_df = pd.read_csv((os.path.join(data_input_dir, 'Processed_Gridview_2035_Data.csv')))       
    
    # Return the output dataframe:
    return gv_df


In [240]:
gv_df = process_gridview_data(data_input_dir = data_output_dir)

gv_df


Unnamed: 0,BA,Total_Load_MWh
0,AVA,17003600.0
1,AZPS,45766230.0
2,BANC,24290270.0
3,BPAT,65718760.0
4,CHPD,2342334.0
37,CISO,334092400.0
5,CISO_CIPB,55833700.0
6,CISO_CIPV,66272990.0
7,CISO_CISC,173577800.0
8,CISO_CISD,37223910.0


## Create a Function to Aggregate the Raw TELL MLP Output into a Single Dataframe:


In [187]:
def aggregate_mlp_output_files(tell_data_input_dir: str, year_to_process: str):
    
    # Create a list of all of the MLP output files in the "mlp_input_dir" and aggregate the files in that list:
    list_of_files = sorted(glob.glob(os.path.join(tell_data_input_dir, year_to_process, '*_mlp_output.csv')))

    # Loop over the list of MLP output files:
    for file in range(len(list_of_files)):

        # Read in the .csv file and replace missing values with nan:
        mlp_data = pd.read_csv(list_of_files[file]).replace(-9999, np.nan)

        # Rename the "Load" variable:
        mlp_data.rename(columns={'Load': 'Hourly_Load_MWh'}, inplace=True)

        # Replacing missing or negative loads with NaN:
        mlp_data.loc[~(mlp_data['Hourly_Load_MWh'] > 0), 'Hourly_Load_MWh'] = np.nan

        # Aggregate the output into a new dataframe:
        if file == 0:
            tell_df = mlp_data
        else:
            tell_df = pd.concat([tell_df, mlp_data])
    
    # Return the output dataframe:
    return tell_df


## Create a Function to Scale the TELL Output Based on the GridView 2035 Values:


In [188]:
def scale_tell_loads(data_input_dir: str, tell_data_input_dir: str, year_to_process: str):
    
    # Aggregate the TELL MLP files:
    tell_df = aggregate_mlp_output_files(tell_data_input_dir = tell_data_input_dir,
                                         year_to_process = year_to_process)
    
    # Read in the processed GridView file and rename a column for consistency:
    gv_df = pd.read_csv((os.path.join(data_input_dir, 'Processed_Gridview_2035_Data.csv')))
    gv_df.rename(columns={'Total_Load_MWh': 'GV_Total_Load_MWh'}, inplace=True) 
    
    # Merge the tell_df and gv_df dataframes based on common BA names:
    merged_df = tell_df.merge(gv_df, on=['BA'])
    
    # Sum the hourly TELL loads by BA into annual total loads:
    merged_df['TELL_Total_Load_MWh'] = merged_df.groupby('BA')['Hourly_Load_MWh'].transform('sum')
    
    # Compute the scaling factors that force the annual total loads to agree:
    merged_df['Scaling_Factor'] = merged_df['GV_Total_Load_MWh'] / merged_df['TELL_Total_Load_MWh']
    
    # Compute the scaled hourly loads:
    merged_df['Hourly_Load_MWh_Scaled'] = merged_df['Hourly_Load_MWh'] * merged_df['Scaling_Factor']
    
    # Compute the hours since the start of the year:
    merged_df['Hour'] = ((pd.to_datetime(merged_df['Time_UTC']) - datetime.datetime(int(year_to_process), 1, 1, 0, 0, 0)) / np.timedelta64(1, 'h') + 1).astype(int)
    
    # Only keep the columns that are needed:
    scaled_tell_df = merged_df[['Hour', 'BA', 'Hourly_Load_MWh_Scaled']].copy()
    
    # Drop the rows with missing values (i.e., there is not a corresponding GridView load):
    scaled_tell_df = scaled_tell_df.dropna(how = 'any')
    
    # Rename the load variable and round it to 5 decimals:
    scaled_tell_df.rename(columns={'Hourly_Load_MWh_Scaled': 'Load_MWh'}, inplace=True)
    scaled_tell_df['Load_MWh'] = scaled_tell_df['Load_MWh'].round(5)
    
    # Return the output dataframe:
    return scaled_tell_df


## Create a Function to Format the Output for Ingest to GridView:


In [322]:
def format_scaled_tell_loads(data_input_dir: str, tell_data_input_dir: str, year_to_process: str):
    
    # Aggregate the TELL MLP files:
    scaled_tell_df = scale_tell_loads(data_input_dir = data_output_dir,
                                      tell_data_input_dir = tell_data_input_dir, 
                                      year_to_process = year_to_process)
    
    # Read in the processed GridView file and rename a column for consistency:
    gv_df = pd.read_csv((os.path.join(data_input_dir, 'Processed_Gridview_2035_Data.csv')))
    
    # Compute the load fractions for the subregions:
    CIPB_LF = (gv_df.loc[(gv_df['BA'] == 'CISO_CIPB')]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'CISO')]['Total_Load_MWh'].values[0])
    CIPV_LF = (gv_df.loc[(gv_df['BA'] == 'CISO_CIPV')]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'CISO')]['Total_Load_MWh'].values[0])
    CISC_LF = (gv_df.loc[(gv_df['BA'] == 'CISO_CISC')]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'CISO')]['Total_Load_MWh'].values[0])
    CISD_LF = (gv_df.loc[(gv_df['BA'] == 'CISO_CISD')]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'CISO')]['Total_Load_MWh'].values[0])
    VEA_LF  = (gv_df.loc[(gv_df['BA'] == 'CISO_VEA' )]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'CISO')]['Total_Load_MWh'].values[0])
    IPFE_LF = (gv_df.loc[(gv_df['BA'] == 'IPCO_IPFE')]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'IPCO')]['Total_Load_MWh'].values[0])
    IPMV_LF = (gv_df.loc[(gv_df['BA'] == 'IPCO_IPMV')]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'IPCO')]['Total_Load_MWh'].values[0])
    IPTV_LF = (gv_df.loc[(gv_df['BA'] == 'IPCO_IPTV')]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'IPCO')]['Total_Load_MWh'].values[0])
    NEVP_LF = (gv_df.loc[(gv_df['BA'] == 'NEVP_NEVP')]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'NEVP')]['Total_Load_MWh'].values[0])
    SPPC_LF = (gv_df.loc[(gv_df['BA'] == 'NEVP_SPPC')]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'NEVP')]['Total_Load_MWh'].values[0])
    PAID_LF = (gv_df.loc[(gv_df['BA'] == 'PACE_PAID')]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'PACE')]['Total_Load_MWh'].values[0])
    PAUT_LF = (gv_df.loc[(gv_df['BA'] == 'PACE_PAUT')]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'PACE')]['Total_Load_MWh'].values[0])
    PAWY_LF = (gv_df.loc[(gv_df['BA'] == 'PACE_PAWY')]['Total_Load_MWh'].values[0]) / (gv_df.loc[(gv_df['BA'] == 'PACE')]['Total_Load_MWh'].values[0])
    
    # Reshape the dataframe and drop the indexes:
    output_df = scaled_tell_df.pivot(index = 'Hour', columns = 'BA', values = 'Load_MWh')
    output_df = output_df.reset_index(drop=False)
    
    # Add back in the text to the column headers:
    output_df = output_df.add_suffix('_2030.dat')
    output_df = output_df.add_prefix('Load_')
    
    # Rename the time variable:
    output_df.rename(columns={'Load_Hour_2030.dat': 'Index'}, inplace=True)
    
    # Compute the loads for the subregions:
    output_df['Load_CIPB_2030_CEC.dat'] = output_df['Load_CISO_2030.dat'] * CIPB_LF
    output_df['Load_CIPV_2030_CEC.dat'] = output_df['Load_CISO_2030.dat'] * CIPV_LF
    output_df['Load_CISC_2030_CEC.dat'] = output_df['Load_CISO_2030.dat'] * CISC_LF
    output_df['Load_CISD_2030_CEC.dat'] = output_df['Load_CISO_2030.dat'] * CISD_LF
    output_df['Load_VEA_2030.dat'] = output_df['Load_CISO_2030.dat'] * VEA_LF
    
    output_df['Load_IPFE_2030.dat'] = output_df['Load_IPCO_2030.dat'] * IPFE_LF
    output_df['Load_IPMV_2030.dat'] = output_df['Load_IPCO_2030.dat'] * IPMV_LF
    output_df['Load_IPTV_2030.dat'] = output_df['Load_IPCO_2030.dat'] * IPTV_LF
    
    output_df['Load_NEVP_Temp_2030.dat'] = output_df['Load_NEVP_2030.dat'] * NEVP_LF
    output_df['Load_SPPC_2030.dat'] = output_df['Load_NEVP_2030.dat'] * SPPC_LF
    
    output_df['Load_PAID_2030.dat'] = output_df['Load_PACE_2030.dat'] * PAID_LF
    output_df['Load_PAUT_2030.dat'] = output_df['Load_PACE_2030.dat'] * PAUT_LF
    output_df['Load_PAWY_2030.dat'] = output_df['Load_PACE_2030.dat'] * PAWY_LF
    
    # Drop the un-needed columns and clean up the NEVP naming:
    del output_df['Load_NEVP_2030.dat'], output_df['Load_CISO_2030.dat'], output_df['Load_IPCO_2030.dat'], output_df['Load_PACE_2030.dat']
    output_df.rename(columns={'Load_NEVP_Temp_2030.dat': 'Load_NEVP_2030.dat'}, inplace=True)
    
    # Sort the data by column name then make the Index column column one:
    output_df.rename(columns={'Index': 'AA'}, inplace=True)
    output_df = output_df.sort_index(axis = 1)
    output_df.rename(columns={'AA': 'Index'}, inplace=True)
    
    # Add in a blank row and fill it with the year placeholder:
    output_df = output_df.shift(periods=1)
    output_df.iloc[0, :] = int(2030)
    output_df.at[0, 'Index'] = 'Year'
    
    # Set the output filenames:
    if year_to_process == '2055':
       output_filename = 'TELL_Loads_2035_Based_on_2015_Weather.csv'
    if year_to_process == '2058':
       output_filename = 'TELL_Loads_2035_Based_on_2018_Weather.csv'
    
    # Write out the dataframe to a .csv file:
    output_df.to_csv((os.path.join(data_input_dir, output_filename)), sep=',', index=False)
    
    # Return the output dataframe:
    return output_df


In [323]:
output_df = format_scaled_tell_loads(data_input_dir = data_output_dir,
                                     tell_data_input_dir = tell_data_input_dir,
                                     year_to_process = '2055')

output_df


BA,Index,Load_AVA_2030.dat,Load_AZPS_2030.dat,Load_BANC_2030.dat,Load_BPAT_2030.dat,Load_CHPD_2030.dat,Load_CIPB_2030_CEC.dat,Load_CIPV_2030_CEC.dat,Load_CISC_2030_CEC.dat,Load_CISD_2030_CEC.dat,...,Load_SCL_2030.dat,Load_SPPC_2030.dat,Load_SRP_2030.dat,Load_TEPC_2030.dat,Load_TIDC_2030.dat,Load_TPWR_2030.dat,Load_VEA_2030.dat,Load_WACM_2030.dat,Load_WALC_2030.dat,Load_WAUW_2030.dat
0,Year,2030.00000,2030.00000,2030.00000,2030.00000,2030.00000,2030.000000,2030.000000,2030.000000,2030.000000,...,2030.00000,2030.000000,2030.00000,2030.00000,2030.00000,2030.00000,2030.000000,2030.00000,2030.00000,2030.00000
1,1.0,2581.04005,4170.07631,2058.86477,9176.21735,444.76021,5006.194224,5942.207823,15563.433905,3337.592142,...,1597.71426,1355.837330,4263.56744,2213.48176,285.39075,811.88677,106.156755,4544.68112,1167.21660,144.13470
2,2.0,2659.71934,4465.28562,2301.49554,9767.48553,456.41707,5435.577831,6451.873751,16898.316867,3623.858972,...,1685.13341,1467.295644,4578.82665,2261.06910,332.34519,869.30922,115.261870,4619.65375,1260.00728,143.51794
3,3.0,2728.42821,4671.65638,2385.47040,10104.38485,449.98194,5749.978764,6825.058570,17875.737624,3833.467716,...,1738.68583,1524.015169,4806.70014,2249.11969,340.96738,890.85552,121.928767,4694.21692,1303.70770,142.37251
4,4.0,2785.34089,4808.69913,2524.49427,10355.79537,440.71099,6091.450700,7230.375887,18937.317672,4061.124495,...,1771.66548,1580.734697,4878.41691,2208.73618,348.34685,902.99751,129.169707,4648.29024,1325.82030,140.93758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,8755.0,2664.35509,3556.27258,2476.31978,10317.62792,432.20353,5006.769766,5942.890975,15565.223169,3337.975851,...,1572.88839,1311.827866,4426.77519,1461.04966,326.84902,855.47193,106.168960,4351.26598,1165.91169,127.02885
8756,8756.0,2679.50327,3360.33963,2455.16742,10059.98548,420.87388,5033.747129,5974.912325,15649.091352,3355.961457,...,1581.46747,1309.836017,4352.73942,1433.19398,321.40410,846.84654,106.741017,4323.31635,1153.61430,126.68900
8757,8757.0,2645.22729,3284.07608,2414.63421,9878.26572,416.98826,5058.422327,6004.201071,15725.802480,3372.412226,...,1588.58836,1306.048545,4257.01782,1425.66151,319.09801,836.33111,107.264256,4294.56099,1160.77714,125.70721
8758,8758.0,2627.06943,3247.99188,2363.97432,9720.07932,414.13881,5065.139260,6012.173877,15746.684319,3376.890355,...,1598.13957,1305.189608,4141.60036,1431.37988,315.65170,828.83974,107.406690,4266.94158,1174.96098,124.20935
