# Create files with index and generation data at state level

## Instructions
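Make sure the `file_date` parameter below is set to whatever value you would like appended to file names. Note that the input and output file names used in the cells below are built from `DATA_DATE` (imported from `src.params`), so check that value as well.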

The entire notebook can be run at once using *Run All Cells*

In [48]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
from os.path import join
import glob
import numpy as np
from joblib import Parallel, delayed
import sys
import json
# Directory the kernel is running in, and the legacy 'Data storage' folder
# located one level above it.
cwd = os.getcwd()
data_path = os.path.join(cwd, '..', 'Data storage')

# Shorthand for slicing MultiIndex frames with .loc
idx = pd.IndexSlice

from src.params import (
    DATA_DATE,
    DATA_PATHS,
    STATES,
    STATE_FACILITY_FUELS,
    CUSTOM_FUELS,
)

In [2]:
# Date tag intended for file names.
# NOTE(review): in this notebook the active file paths are built from
# DATA_DATE (src.params); `file_date` appears only in the commented-out
# legacy paths -- confirm whether it is still needed.
file_date = '2019-03-05'

In [3]:
# Load the "autoreload" extension so that edited project modules are
# re-imported automatically instead of requiring a kernel restart
%load_ext autoreload

# Mode 1: always reload modules marked with "%aimport" (and only those)
# before running a cell
%autoreload 1

In [4]:
%aimport src.Analysis.index
from src.Analysis.index import facility_emission_gen, group_facility_data
%aimport src.Analysis.index
from src.Analysis.index import facility_co2, adjust_epa_emissions, group_fuel_cats
%aimport src.Analysis.index
from src.Analysis.index import extra_emissions_gen, add_datetime, add_quarter
%aimport src.util
from src.util import rename_cols, add_facility_location

In [5]:
# Add the project's 'src' directory to the module search path so its modules
# can be imported directly.
src_dir = join(os.getcwd(), os.pardir, 'src')

# Guard the append: without it, every re-run of this cell adds another
# duplicate entry to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [6]:
%aimport Analysis.index
from Analysis.index import facility_emission_gen, group_facility_data
%aimport Analysis.index
from Analysis.index import facility_co2, adjust_epa_emissions, group_fuel_cats
%aimport Analysis.index
from Analysis.index import extra_emissions_gen, add_datetime, add_quarter
%aimport src.util
from src.util import rename_cols, add_facility_location

In [5]:
# Two-letter postal codes for the 50 states.
# NOTE(review): the processing loop further down iterates over `STATES` from
# src.params rather than this list -- confirm whether this cell is still needed.
states = (
    "AL AK AZ AR CA CO CT DE "
    "FL GA HI ID IL IN IA KS "
    "KY LA ME MD MA MI MN MS "
    "MO MT NE NV NH NJ NM NY "
    "NC ND OH OK OR PA RI SC "
    "SD TN TX UT VT VA WA WV WI WY"
).split()

## Load data

Emission factors

In [8]:
# Emission factors table; the first CSV column is used as the index.
# Legacy location: join(data_path, 'Final emission factors.csv')
path = DATA_PATHS['inputs'] / 'Final emission factors.csv'
ef = pd.read_csv(path, index_col=0)

EIA facility data and EPA monthly emissions

In [15]:
# --- EIA facility-level generation and fuel data ---
facility_fn = 'facility_gen_fuel_data_{}.csv'.format(DATA_DATE)
facility_path = DATA_PATHS['eia_compiled'] / facility_fn
facility_df = pd.read_csv(facility_path)
# State code = last two characters of `geography` (e.g. 'USA-GA' -> 'GA')
facility_df['state'] = facility_df['geography'].str[-2:]
# Called for its effect on the frame; the return value is not used
rename_cols(facility_df)

# --- EPA monthly emissions ---
epa_fn = 'epa_emissions_{}.csv'.format(DATA_DATE)
epa_path = DATA_PATHS['epa_emissions'] / epa_fn
epa_df = pd.read_csv(epa_path)
rename_cols(epa_df)

# One row per unique (plant id, state) pair found in the EIA facility data
facility_locations = facility_df[['plant id', 'state']].drop_duplicates()
# Add state labels to the EPA facilities
epa_df = add_facility_location(epa_df, facility_locations, labels=['state'])

In [23]:
facility_df.head()

Unnamed: 0,f,fuel,month,plant id,total fuel (mmbtu),year,generation (mwh),elec fuel (mmbtu),geography,last_updated,lat,lon,prime mover,datetime,quarter,all fuel fossil co2 (kg),elec fuel fossil co2 (kg),all fuel total co2 (kg),elec fuel total co2 (kg),state
0,M,OBL,12,10120,1260.0,2017,0.0,38.0,USA-GA,2019-03-01T00:40:06-05:00,31.164772,-81.478724,ALL,2017-12-01,4,0.0,0.0,105814.8,3191.24,GA
1,M,OBL,11,10120,1167.0,2017,0.0,36.0,USA-GA,2019-03-01T00:40:06-05:00,31.164772,-81.478724,ALL,2017-11-01,4,0.0,0.0,98004.66,3023.28,GA
2,M,OBL,10,10120,1414.0,2017,0.0,43.0,USA-GA,2019-03-01T00:40:06-05:00,31.164772,-81.478724,ALL,2017-10-01,4,0.0,0.0,118747.72,3611.14,GA
3,M,OBL,9,10120,1425.0,2017,0.0,44.0,USA-GA,2019-03-01T00:40:06-05:00,31.164772,-81.478724,ALL,2017-09-01,3,0.0,0.0,119671.5,3695.12,GA
4,M,OBL,8,10120,1770.0,2017,0.0,54.0,USA-GA,2019-03-01T00:40:06-05:00,31.164772,-81.478724,ALL,2017-08-01,3,0.0,0.0,148644.6,4534.92,GA


In [16]:
epa_df.head()

Unnamed: 0,year,month,plant id,gload_mwh,heatinput_mmbtu,co2mass_kg,noxmass_kg,so2mass_kg,state
0,2001,1,3,1167292.0,11130000.0,962541100.0,1871380.0,5047711.0,AL
1,2001,1,5,0.0,0.0,0.0,0.0,0.0,
2,2001,1,7,0.0,719804.7,67000920.0,210480.5,838948.1,AL
3,2001,1,8,746506.0,8089974.0,753020400.0,1842946.0,5151142.0,AL
4,2001,1,10,371304.0,3860362.0,352903400.0,909667.7,3701280.0,AL


In [17]:
facility_locations.head()

Unnamed: 0,plant id,state
0,10120,GA
132,10219,NY
336,10129,PA
552,10123,NJ
948,10124,NY


JSON files with fuel categories

In [10]:
# Fuel-category definitions stored as JSON in the legacy data folder.
# NOTE(review): the processing loop below passes STATE_FACILITY_FUELS and
# CUSTOM_FUELS (from src.params) rather than the dicts loaded here -- confirm
# whether this cell is still required.
fuel_cat_folder = join(data_path, 'Fuel categories')
state_cats_path = join(fuel_cat_folder, 'State_facility.json')
custom_cats_path = join(fuel_cat_folder, 'Custom_results.json')

with open(state_cats_path, 'r') as f:
    state_fuel_cat = json.load(f)

with open(custom_cats_path, 'r') as f:
    custom_fuel_cat = json.load(f)

EIA total monthly gen and fuel consumption

In [42]:
# EIA state-level monthly generation, fuel use and CO2
state_totals_fn = 'state_gen_fuel_data_{}.csv'.format(DATA_DATE)
path = DATA_PATHS['eia_compiled'] / state_totals_fn
eia_totals = pd.read_csv(path, parse_dates=['datetime'])
rename_cols(eia_totals)
# State code = last two characters of `geography` (e.g. 'USA-AK' -> 'AK')
eia_totals['state'] = eia_totals['geography'].str[-2:]

# Remove fuel categories that are duplicated with other categories
duplicated_types = ['SPV', 'AOR', 'TSN']
is_duplicated = eia_totals['type'].isin(duplicated_types)
eia_totals = eia_totals.loc[~is_duplicated]

In [39]:
eia_totals.head()

Unnamed: 0,type,year,month,geography,end,f,last_updated,sector,series_id,start,units,generation (mwh),total fuel (mmbtu),elec fuel (mmbtu),all fuel co2 (kg),elec fuel co2 (kg),datetime,quarter,state
10728,COW,2001,1,USA-AK,201812.0,M,2019-03-01T00:40:06-05:00,99.0,ELEC.GEN.COW-AK-99.M,200101.0,thousand megawatthours,46903.0,1120.0,872.0,106680.0,83058.0,2001-01-01,1,AK
10729,COW,2001,1,USA-AL,201812.0,M,2019-03-01T00:40:06-05:00,99.0,ELEC.GEN.COW-AL-99.M,200101.0,thousand megawatthours,6557913.0,67999.0,66582.0,6476904.75,6341935.5,2001-01-01,1,AL
10730,COW,2001,1,USA-AR,201812.0,M,2019-03-01T00:40:06-05:00,99.0,ELEC.GEN.COW-AR-99.M,200101.0,thousand megawatthours,2149808.0,23099.0,22700.0,2200179.75,2162175.0,2001-01-01,1,AR
10731,COW,2001,1,USA-AZ,201812.0,M,2019-03-01T00:40:06-05:00,99.0,ELEC.GEN.COW-AZ-99.M,200101.0,thousand megawatthours,3418454.0,35873.0,35483.0,3416903.25,3379755.75,2001-01-01,1,AZ
10732,COW,2001,1,USA-CA,201812.0,M,2019-03-01T00:40:06-05:00,99.0,ELEC.GEN.COW-CA-99.M,200101.0,thousand megawatthours,199857.0,3652.0,2008.0,347853.0,191262.0,2001-01-01,1,CA


## Calculate state-level monthly CO₂ intensity and generation by fuel category

In [43]:
# For every state in STATES build two monthly tables:
#   * state_index  -- total CO2, total generation and CO2 intensity
#   * gen_category -- generation / fuel / CO2 by custom fuel category
# The per-state frames are collected in lists and concatenated after the loop.
index_list = []
gen_list = []

# Columns kept in the generation-by-fuel-category output
keep_cols = ['fuel category', 'generation (mwh)', 'total fuel (mmbtu)',
             'elec fuel (mmbtu)', 'all fuel co2 (kg)',
             'elec fuel co2 (kg)', 'year', 'month']

for state in STATES:
    # Work on copies of the per-state slices so the helpers below never
    # operate on views of the full frames.
    eia_fac_state = facility_df.loc[facility_df.state == state].copy()
    eia_totals_state = eia_totals.loc[eia_totals.state == state].copy()
    epa_state = epa_df.loc[epa_df.state == state].copy()

    co2, gen_fuels_state = facility_emission_gen(eia_facility=eia_fac_state,
                                                 epa=epa_state,
                                                 state_fuel_cat=STATE_FACILITY_FUELS,
                                                 custom_fuel_cat=CUSTOM_FUELS,
                                                 export_state_cats=True,
                                                 print_status=False)

    # extra_gen is returned by the helper but not used in this notebook
    extra_co2, extra_gen = extra_emissions_gen(gen_fuels_state,
                                               eia_totals_state, ef)

    # Combine facility and extra co2, name the series.
    # Only 'final co2 (kg)' is needed, so select it before aggregating rather
    # than summing every column of `co2`.
    co2_monthly = co2.groupby(['year', 'month'])['final co2 (kg)'].sum()
    extra_co2_monthly = (extra_co2.loc[:, 'elec fuel co2 (kg)']
                                  .groupby(['year', 'month']).sum())
    # The addition aligns on (year, month); a month present in only one of
    # the two series yields NaN.
    total_co2 = co2_monthly + extra_co2_monthly
    total_co2.name = 'final co2 (kg)'

    # Total gen, and the co2 intensity
    total_gen = (eia_totals_state
                 .groupby(['year', 'month'])['generation (mwh)'].sum())

    state_index = pd.concat([total_co2, total_gen], axis=1)
    # kg per MWh is numerically equal to g per kWh
    state_index['index (g/kwh)'] = (state_index['final co2 (kg)']
                                    / state_index['generation (mwh)'])
    state_index['state'] = state
    state_index = state_index.set_index('state', append=True)

    # Generation by fuel category.
    # .assign() returns a new frame, which avoids the chained-assignment
    # (SettingWithCopyWarning) pattern of adding a column to a column subset.
    gen_category = (
        group_fuel_cats(eia_totals_state, CUSTOM_FUELS,
                        fuel_col='type', new_col='fuel category')
        .loc[:, keep_cols]
        .assign(state=state)
        .set_index(['year', 'month', 'state'])
    )

    # Add each df to the list
    index_list.append(state_index)
    gen_list.append(gen_category)


# Combine lists of dataframes.
# add_quarter is called for its effect on the frame (return value unused).
state_index_all = pd.concat(index_list)
add_quarter(state_index_all)

gen_category_all = pd.concat(gen_list)
add_quarter(gen_category_all)

In [45]:
state_index.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,final co2 (kg),generation (mwh),index (g/kwh)
year,month,state,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014,12,WY,3997085000.0,4355714.14,917.664711
2014,3,WY,3778705000.0,4296310.3,879.523224
2012,2,WY,4098590000.0,4302824.2,952.534892
2015,1,WY,4034519000.0,4408320.44,915.205413
2010,7,WY,4132130000.0,4321521.01,956.174939


In [44]:
state_index_all.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,final co2 (kg),generation (mwh),index (g/kwh),datetime,quarter
year,month,state,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014,11,NJ,1350770000.0,5449136.01,247.887069,2014-11-01,4
2008,1,PA,10745930000.0,19963764.12,538.271533,2008-01-01,1
2005,9,NJ,1756584000.0,5295248.83,331.728279,2005-09-01,3
2004,2,MN,3265441000.0,4326062.0,754.829974,2004-02-01,1
2018,9,HI,632262500.0,970797.8,651.281367,2018-09-01,3


In [46]:
gen_category_all.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fuel category,generation (mwh),total fuel (mmbtu),elec fuel (mmbtu),all fuel co2 (kg),elec fuel co2 (kg),datetime,quarter
year,month,state,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014,6,MO,Hydro,80191.09,0.0,0.0,0.0,0.0,2014-06-01,2
2005,2,GA,Natural Gas,163463.14,1813320.0,1203920.0,96232890.0,63892030.0,2005-02-01,1
2001,2,AZ,Solar,0.0,0.0,0.0,0.0,0.0,2001-02-01,1
2007,5,IL,Coal,6480593.22,73150930.0,68833940.0,6967626000.0,6556433000.0,2007-05-01,2
2014,8,ID,Natural Gas,389868.22,2940250.0,2830100.0,156039100.0,150193400.0,2014-08-01,3


In [50]:
# Write both state-level result tables to the results folder as CSV files,
# with DATA_DATE embedded in each file name.
index_fn = 'state_index_{}.csv'.format(DATA_DATE)
gen_fn = 'state_generation_{}.csv'.format(DATA_DATE)

results_dir = DATA_PATHS['results']
state_index_all.to_csv(results_dir / index_fn)
gen_category_all.to_csv(results_dir / gen_fn)