In [1]:
import pandas as pd
import numpy as np
from scipy.stats import linregress as lm

In [2]:
# Define independent variable (energy demand - gload_mwh) and labels (greenhouse gas output)
LABELS = ['so2_kg', 'nox_kg', 'pm25_kg', 'co2_kg',
    'so2_dam_ap2', 'nox_dam_ap2', 'pm25_dam_ap2', 
    'so2_dam_eas', 'nox_dam_eas', 'pm25_dam_eas',
    'co2_dam']
XCOL = 'gload_mwh'

In [14]:
# Label the df with the year, month, hour, and season 
def label_temporal_groups(df):
    df = df.copy()
    df['year'] = df.index.year
    df['month'] = df.index.month
    df['hour'] = df.index.hour

    # Convert the months to season 
    month_to_season = ['winter'] * 3 + ['trans'] + ['summer'] * 5 + ['trans'] + ['winter'] * 2
    df['season'] = df.index.map(lambda x: month_to_season[x.month - 1])
    return df

In [15]:
def calc_mef(data, cols):
    data = data.copy()
    grouped = data.groupby(cols)
    result_dict = {}

    for name, group in grouped: 
        result_dict[name] = group[LABELS].apply(lambda v: lm(group[XCOL], v))  
    
    return result_dict 

In [16]:
def format_dict_results(result_dict, cols):
    results_df = pd.DataFrame.from_dict(result_dict, orient='index')
    results_df.index.names = cols
    
    # Separate slope, stderr, intercept, rvalue into separate dataframes 
    slope_df = results_df.applymap(lambda elem: elem.slope)
    slope_df.rename(columns=lambda x: x + "-est", inplace=True)
    intercept_df = results_df.applymap(lambda elem: elem.intercept)
    intercept_df.rename(columns=lambda x: x + "-int", inplace=True)
    stderr_df = results_df.applymap(lambda elem: elem.stderr)
    stderr_df.rename(columns=lambda x: x + "-se", inplace=True)
    rvalue_df = results_df.applymap(lambda elem: elem.rvalue)
    rvalue_df.rename(columns=lambda x: x + "-r", inplace=True)
    
    # Combine results into one dataframe
    result_df = pd.concat([slope_df, intercept_df, stderr_df, rvalue_df], axis=1)
    col_order = np.array(
            ['{0}-est,{0}-se,{0}-r,{0}-int'.format(x).split(',') for x in LABELS]).flatten()
    result_df.reindex(col_order, axis=1)
    
    return result_df

### Example: Linear Regression on ISO/RTO Data, SeasonalTOD Grouping

In [20]:
# Define column and name of temporal grouping 
name = 'SeasonalTOD'
cols = ['year', 'season', 'hour', 'isorto']

In [18]:
# Load ISO/RTO data 
isorto_data1 = pd.read_csv("../data/formatted_data/cems_diffs_isorto.csv", index_col=0, parse_dates=[0])
isorto_data1.head()

Unnamed: 0_level_0,isorto,gload_mwh,so2_kg,nox_kg,pm25_kg,co2_kg,so2_dam_ap2,nox_dam_ap2,pm25_dam_ap2,so2_dam_eas,nox_dam_eas,pm25_dam_eas,co2_dam
DATE_UTC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2006-01-01 04:00:00,CAISO,-457.19,-0.896751,-43.825605,-8.436821,-189575.356635,-67.47,-599.11,-2143.44,-20.13,-439.39,-704.33,-7583.02
2006-01-01 04:00:00,ERCOT,-445.7,-1853.107025,-138.292036,-20.5931,-341468.969925,-26896.37,-1389.72,-1098.43,-32415.94,-570.31,-1110.5,-13658.76
2006-01-01 04:00:00,ISONE,-142.25,-525.355244,4.911948,-13.607775,-64021.85982,-9197.51,-2.1,-686.6,-19812.15,-119.35,-1604.94,-2560.88
2006-01-01 04:00:00,MISO,-169.0,-211.712705,-208.724441,-30.753571,-161730.220245,-4707.34,-3325.18,-1928.43,1374.61,1266.81,-3062.0,-6469.21
2006-01-01 04:00:00,NYISO,10.0,-180.070127,0.725747,-3.084429,10160.472,1075.29,350.25,-51.44,-5182.87,501.45,-369.88,406.42


In [19]:
# Add columns describing temporal groups 
isorto_data1 = label_temporal_groups(isorto_data1)
isorto_data1.head()

Unnamed: 0_level_0,isorto,gload_mwh,so2_kg,nox_kg,pm25_kg,co2_kg,so2_dam_ap2,nox_dam_ap2,pm25_dam_ap2,so2_dam_eas,nox_dam_eas,pm25_dam_eas,co2_dam,year,month,hour,season
DATE_UTC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2006-01-01 04:00:00,CAISO,-457.19,-0.896751,-43.825605,-8.436821,-189575.356635,-67.47,-599.11,-2143.44,-20.13,-439.39,-704.33,-7583.02,2006,1,4,winter
2006-01-01 04:00:00,ERCOT,-445.7,-1853.107025,-138.292036,-20.5931,-341468.969925,-26896.37,-1389.72,-1098.43,-32415.94,-570.31,-1110.5,-13658.76,2006,1,4,winter
2006-01-01 04:00:00,ISONE,-142.25,-525.355244,4.911948,-13.607775,-64021.85982,-9197.51,-2.1,-686.6,-19812.15,-119.35,-1604.94,-2560.88,2006,1,4,winter
2006-01-01 04:00:00,MISO,-169.0,-211.712705,-208.724441,-30.753571,-161730.220245,-4707.34,-3325.18,-1928.43,1374.61,1266.81,-3062.0,-6469.21,2006,1,4,winter
2006-01-01 04:00:00,NYISO,10.0,-180.070127,0.725747,-3.084429,10160.472,1075.29,350.25,-51.44,-5182.87,501.45,-369.88,406.42,2006,1,4,winter


In [21]:
isorto_data1 = isorto_data1.dropna()
results_dict = calc_mef(isorto_data1, cols)
print(list(results_dict.items())[0])

((2006, 'summer', 0, 'CAISO'), so2_kg          (0.0010551341284987022, -2.94649937556039, 0.0...
nox_kg          (0.06365439255722596, 1.4633463349075129, 0.52...
pm25_kg         (0.03663130146077502, -6.979862951574667, 0.75...
co2_kg          (450.13904618419656, 10183.333764701034, 0.975...
so2_dam_ap2     (0.25756925973330036, 40.63895269430316, 0.673...
nox_dam_ap2     (0.2771087234841306, 53.45939122065488, 0.4564...
pm25_dam_ap2    (5.012130655987647, -790.7193801059439, 0.6387...
so2_dam_eas     (0.03597634947626577, -60.90531032872398, 0.10...
nox_dam_eas     (0.47414516315425514, 242.68656323796097, 0.40...
pm25_dam_eas    (3.1886463624888033, 678.70051532648, 0.727575...
co2_dam         (18.00556130788681, 407.3325306686311, 0.97555...
dtype: object)


In [22]:
format_dict_results(results_dict, cols).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,so2_kg-est,nox_kg-est,pm25_kg-est,co2_kg-est,so2_dam_ap2-est,nox_dam_ap2-est,pm25_dam_ap2-est,so2_dam_eas-est,nox_dam_eas-est,pm25_dam_eas-est,...,nox_kg-r,pm25_kg-r,co2_kg-r,so2_dam_ap2-r,nox_dam_ap2-r,pm25_dam_ap2-r,so2_dam_eas-r,nox_dam_eas-r,pm25_dam_eas-r,co2_dam-r
year,season,hour,isorto,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2006,summer,0,CAISO,0.001055,0.063654,0.036631,450.139046,0.257569,0.277109,5.012131,0.035976,0.474145,3.188646,...,0.523525,0.750763,0.975559,0.673968,0.456417,0.638783,0.105391,0.40329,0.727576,0.975559
2006,summer,0,ERCOT,0.494202,0.405766,0.041762,545.599816,9.119946,3.543111,2.068164,8.345416,1.182833,2.250253,...,0.504996,0.600998,0.861425,0.178578,0.51664,0.419728,0.180878,0.510515,0.620839,0.861425
2006,summer,0,ISONE,0.72449,0.227992,0.055096,491.321493,20.498162,0.115943,5.167214,26.42584,3.6899,7.325081,...,0.369057,0.821572,0.935535,0.443231,0.363031,0.763807,0.458045,0.407554,0.781493,0.935535
2006,summer,0,MISO,1.133587,0.640339,0.083568,591.090359,45.256387,5.740921,6.5,32.837607,7.029787,7.758004,...,0.502024,0.857657,0.828381,0.493671,0.339023,0.804741,0.411473,0.509314,0.850584,0.828381
2006,summer,0,NYISO,0.759523,0.627706,0.078194,487.677989,42.726998,4.096873,36.006505,26.570859,13.079756,14.704445,...,0.769651,0.92701,0.953787,0.639944,0.65862,0.781687,0.510462,0.776394,0.948597,0.953787


### Linear Regression for all temporal groupings

In [63]:
grouping_names = ["SeasonalTOD", "MonthTOD", "TOD", "YearOnly", "Month"]
grouping_cols = [['year', 'season', 'hour'], ['year', 'month', 'hour'], ['year', 'hour'], ['year'], ['year', 'month']]

In [64]:
# Load ISO/RTO data 
isorto_data = pd.read_csv("../data/formatted_data/cems_diffs_isorto.csv", index_col=0, parse_dates=[0])

# Deep copy the grouping cols array
cols_isorto = [row[:] for row in grouping_cols]
for c in cols_isorto:
    c.append('isorto')

In [65]:
# Calculate for ISO/RTO
for grouping_name, grouping in zip(grouping_names, cols_isorto):
    isorto_data = label_temporal_groups(isorto_data)
    isorto_data = isorto_data.dropna()
    results_dict = calc_mef(isorto_data, grouping)
    result_isorto_df = format_dict_results(results_dict, grouping)
    result_isorto_df.to_csv("results/isorto/" + grouping_name + ".csv")

In [66]:
# Load NERC Data 
nerc_data = pd.read_csv("../data/formatted_data/cems_diffs_nerc.csv", index_col=0, parse_dates=[0])

cols_nerc = [row[:] for row in grouping_cols]
for c in cols_nerc:
    c.append('nerc')

In [67]:
# Calculate for NERC
for grouping_name, grouping in zip(grouping_names, cols_nerc):
    nerc_data = label_temporal_groups(nerc_data)
    nerc_data = nerc_data.dropna()
    results_dict = calc_mef(nerc_data, grouping)
    result_nerc_df = format_dict_results(results_dict, grouping)
    result_nerc_df.to_csv("results/nerc/" + grouping_name + ".csv")