## Aggregate CEMS data at the utility-plant level for RMI

In [3]:
# Standard libraries
import logging
import os
import pathlib
import sys

# 3rd party libraries
import geopandas as gpd
import geoplot as gplt
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa
import re


# Local libraries
import pudl

In [4]:
from pudl.workspace.setup import PudlPaths

# Resolve the PUDL workspace paths once and reuse them for both databases.
pudl_paths = PudlPaths()

# SQLAlchemy engine for the FERC Form 1 SQLite database.
ferc1_engine = sa.create_engine(pudl_paths.sqlite_db_uri("ferc1"))

# SQLAlchemy engine for the PUDL SQLite database, built with the same URI
# helper as the FERC1 engine above.
# NOTE(review): the previous `PudlPaths().pudl_db()` call fails if `pudl_db` is
# exposed as a property rather than a method -- confirm against the installed
# PUDL version.
pudl_engine = sa.create_engine(pudl_paths.sqlite_db_uri("pudl"))

# Output-table helper bound to the PUDL database.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine)

##### Identify the utilities you'd like to get information on

In [39]:
# Utility IDs for the utilities of interest; `utility_id_eia` is compared
# against the selected one further down.
duke_id = 5416
fpl_id = 6452

# The utility whose plants are processed in the rest of the notebook.
util_id = duke_id

##### Read the Master Unit List (MUL) for utility fraction owned data

In [5]:
# Load the Master Unit List (MUL) from its gzip-compressed pickle.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
mul_path = '/Users/aesharpe/Desktop/Work/Catalyst_Coop/master_unit_list.pkl.gz'
mul = pd.read_pickle(mul_path)

In [6]:
# Generator-level ownership records from the MUL: keep the rows whose
# plant_part is 'plant_gen' and whose ownership is 'owned', then reduce them to
# the identifying, ownership, generation, capacity and fuel columns.
is_owned_generator = mul['plant_part'].eq('plant_gen') & mul['ownership'].eq('owned')
gen_cols = [
    'plant_id_eia', 'generator_id', 'plant_name_eia',
    'year', 'fraction_owned', 'utility_id_eia', 'net_generation_mwh',
    'capacity_mw', 'fuel_type_code_pudl',
]
gen_mul = (
    mul.loc[is_owned_generator]
    .rename(columns={'report_year': 'year'})
    .reset_index()
    .loc[:, gen_cols]
    .drop_duplicates()
)

In [7]:
# Load the EPA-EIA unit crosswalk and keep only the ID and fuel columns needed
# to link EIA generators to EPA units.
crosswalk_cols = ['plant_id_epa', 'plant_id_eia', 'unitid', 'generator_id', 'fuel_type_primary']
eia_epa_map = pd.read_csv('/Users/aesharpe/Desktop/Work/Catalyst_Coop/EPA-EIA-Unit-Crosswalk/eia_epa_id_crosswalk.csv')
eia_epa = eia_epa_map.loc[:, crosswalk_cols].copy()

# Outer merge so generators without a crosswalk entry (and crosswalk entries
# without a MUL record) are both retained.
gen_mul_map = gen_mul.merge(eia_epa, how='outer', on=['plant_id_eia', 'generator_id'])

In [8]:
# Grouping keys for per-utility totals at the plant and at the EPA-unit level.
plant_keys = ['utility_id_eia', 'plant_id_eia', 'year']
unit_keys = ['utility_id_eia', 'unitid', 'year']


def _group_total(df, keys, col):
    """Broadcast the group-wise sum of `col` back onto every row of `df`.

    `min_count=1` leaves the total as NaN when a group has no non-null values.
    """
    return df.groupby(keys)[col].transform('sum', min_count=1)


# Add plant- and unit-level totals plus capacity-weighted ownership fractions.
# Keyword order matters: `fraction_owned` is filled before it is used below.
gen_mul_fracs = gen_mul_map.assign(
    net_gen_plant_sum=lambda df: _group_total(df, plant_keys, 'net_generation_mwh'),
    net_gen_unit_sum=lambda df: _group_total(df, unit_keys, 'net_generation_mwh'),
    cap_plant_sum=lambda df: _group_total(df, plant_keys, 'capacity_mw'),
    cap_unit_sum=lambda df: _group_total(df, unit_keys, 'capacity_mw'),
    # Missing ownership fractions are treated as fully owned.
    fraction_owned=lambda df: df['fraction_owned'].fillna(1),
    fraction_owned_cap_plant=lambda df: df['fraction_owned'] * df['capacity_mw'] / df['cap_plant_sum'],
    fraction_owned_cap_unit=lambda df: df['fraction_owned'] * df['capacity_mw'] / df['cap_unit_sum'],
    # Nullable integers keep the ID/year columns integral despite missing values.
    plant_id_eia=lambda df: df['plant_id_eia'].astype('Int64'),
    year=lambda df: df['year'].astype('Int64'),
)

In [40]:
# Restrict the ownership table to the selected utility and collect the distinct
# plant IDs it has records for.
is_selected_utility = gen_mul_fracs['utility_id_eia'].eq(util_id)
util_gen_mul_fracs = gen_mul_fracs.loc[is_selected_utility]
util_plants = list(set(util_gen_mul_fracs['plant_id_eia']))

##### Prep CEMS for utility integration (only run this to make new pickle files!)

In [18]:
# # CEMS
# years = range(2009, 2020)
# cems_df = pd.DataFrame()
# client = Client()

# my_cols = [
#     'state',
#     'plant_id_eia', 
#     'unitid',
#     'gross_load_mw',
#     'operating_datetime_utc',
# ]

# for yr in years:
#     print(f'starting calculation for {yr}')
#     epacems_path = (PudlPaths().output_dir + f'/epacems/year={yr}')
#     cems_dd = (
#         dd.read_parquet(epacems_path, columns=my_cols)
#         .assign(state=lambda x: x['state'].astype('string'))
#     )
#     cems_dd_util = cems_dd[cems_dd['plant_id_eia'].isin(util_plants)]
    
#     cems_df_util = (
#         client.compute(cems_dd_util)
#         .result()
#         .assign(year=yr))
#     cems_df = (
#         pd.concat([cems_df, cems_df_util])
#         #.rename(columns={'unitid': 'generator_id'})
#     )

Perhaps you already have a cluster running?
Hosting the HTTP server on port 54231 instead


starting calculation for 2009
starting calculation for 2010
starting calculation for 2011
starting calculation for 2012
starting calculation for 2013
starting calculation for 2014
starting calculation for 2015
starting calculation for 2016
starting calculation for 2017
starting calculation for 2018
starting calculation for 2019


In [19]:
#cems_df.to_pickle('/Users/aesharpe/Desktop/duke_cems.pkl')
#cems_df.to_pickle('/Users/aesharpe/Desktop/fpl_cems.pkl')

In [5]:
# Load the pre-built per-utility CEMS extracts (Duke first, then FPL).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
cems_duke, cems_fpl = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '/Users/aesharpe/Desktop/duke_cems.pkl',
        '/Users/aesharpe/Desktop/fpl_cems.pkl',
    )
)

In [6]:
# Pick the CEMS extract that belongs to the utility selected via `util_id`, so
# the CEMS data and the ownership fractions can never refer to different
# utilities. A KeyError here means `util_id` has no CEMS extract loaded.
cems_by_utility = {duke_id: cems_duke, fpl_id: cems_fpl}
cems_df = cems_by_utility[util_id]

In [43]:
# Columns kept from the EPA AMPD annual emissions export.
tech_cols = ['facility_name', 'facility_id_orispl', 'unit_id', 'year', 'fuel_type_primary', 'unit_type']

tech_df = pd.read_csv('/Users/aesharpe/Desktop/epa_ampd_annual_emissions_data.csv')

# Normalise the headers: spaces -> underscores, lower-case, parentheses removed,
# then strip the leading underscore left behind by a leading space.
col_list = [
    col.replace(' ', '_').lower().replace('(', '').replace(')', '')
    for col in tech_df.columns
]
tech_df.columns = [re.sub(r'^_', '', col) for col in col_list]

# Select the columns and fill missing unit types in a single chained step.
# `.assign` returns a new frame, which avoids assigning into a slice of the
# original frame (SettingWithCopyWarning / silently lost writes).
tech_df = (
    tech_df[tech_cols]
    .assign(unit_type=lambda df: df['unit_type'].fillna('UNK'))
)

In [44]:
# Matches a trailing parenthetical qualifier (preceded by a space) in unit types.
unit_type_qualifier = re.compile(r' \([A-Za-z , \d]*\)')

# Attach the per-unit, per-year technology attributes to every CEMS record.
cems_with_tech = cems_df.merge(
    tech_df,
    how='left',
    left_on=['plant_id_eia', 'unitid', 'year'],
    right_on=['facility_id_orispl', 'unit_id', 'year'],
)

# Unmatched records get 'UNK'; parenthetical qualifiers are removed.
cems_tech = cems_with_tech.assign(
    unit_type=[
        unit_type_qualifier.sub('', label)
        for label in cems_with_tech['unit_type'].fillna('UNK')
    ]
)

##### Combine CEMS with MUL fraction owned data

In [45]:
# Plants that appear in CEMS but not in the EPA-EIA crosswalk cannot be matched
# at the unit level and must be aggregated at the plant level instead.
cems_plant_list = cems_df['plant_id_eia'].unique()
map_plant_list = list(eia_epa.plant_id_eia.unique())
# Set membership keeps this linear instead of scanning the list once per plant.
mapped_plants = set(map_plant_list)
missing_from_map = [plant for plant in cems_plant_list if plant not in mapped_plants]

# Split CEMS into records that can be matched by unit and records that can only
# be matched by plant. The mask is computed once and reused for both halves.
is_unmapped_plant = cems_tech['plant_id_eia'].isin(missing_from_map)
cems_unit = cems_tech[~is_unmapped_plant].copy()
cems_unit_missing = cems_tech[is_unmapped_plant].copy()

# Unit-level merge: ownership fractions are matched on plant, unit and year.
cems_mul_unit = pd.merge(cems_unit, util_gen_mul_fracs, on=['plant_id_eia', 'unitid', 'year'], how='left')

# Plant-level merge: matched on plant and year only, so `unitid` exists on both
# sides; keep the CEMS one (`unitid_x`) and drop the ownership-table one.
# NOTE(review): this merge repeats each CEMS row once per matching ownership
# record for the plant/year, and the later de-duplication keeps only the first
# repeat -- confirm the plant-level share should not be summed across those
# records instead.
cems_mul_missing = (
    pd.merge(cems_unit_missing, util_gen_mul_fracs, on=['plant_id_eia', 'year'], how='left')
    .drop('unitid_y', axis=1)
    .rename(columns={'unitid_x': 'unitid'})
)

##### Calculate the fraction of gross load owned by the utility (either at the unit or plant level)

In [46]:
def _owned_gross_load(cems_mul, fraction_col, agg_level):
    """Scale gross load by an ownership fraction and tag the aggregation level.

    Args:
        cems_mul (pd.DataFrame): CEMS records merged with ownership fractions.
            Must contain `gross_load_mw`, `plant_id_eia`, `unitid`,
            `operating_datetime_utc` and the column named by `fraction_col`.
        fraction_col (str): Name of the ownership-fraction column to apply.
        agg_level (str): Label stored in `fraction_owned_agg_level`.

    Returns:
        pd.DataFrame: A new frame with `gross_load_mw_fraction_owned_cap` and
        `fraction_owned_agg_level` added, `plant_id_eia` cast to nullable
        `Int64`, and at most one row per plant / unit / timestamp (the first
        occurrence is kept).
    """
    return (
        cems_mul.assign(
            gross_load_mw_fraction_owned_cap=lambda x: x[fraction_col] * x.gross_load_mw,
            plant_id_eia=lambda x: x.plant_id_eia.astype('Int64'),
            fraction_owned_agg_level=agg_level,
        ).drop_duplicates(subset=['plant_id_eia', 'unitid', 'operating_datetime_utc'])
    )


# For plants where unit-level information is available in the EPA-EIA crosswalk
cems_mul_gl_fraction_unit = _owned_gross_load(cems_mul_unit, 'fraction_owned_cap_unit', 'unit')

# For plants not available in the EPA-EIA crosswalk
cems_mul_gl_fraction_plant = _owned_gross_load(cems_mul_missing, 'fraction_owned_cap_plant', 'plant')

In [47]:
def backfill_tech_description(ser):
    """Fill missing values when every known value in the series is identical.

    If the series contains missing entries (None / NaN / NA) and exactly one
    distinct non-missing value, the missing entries are replaced by that value.
    Otherwise nothing is filled and missing entries are normalised to NaN.

    Missing values are detected with `Series.isna`, so the check does not
    depend on the identity of the NaN object and works for float and
    nullable dtypes as well as object columns.

    Args:
        ser (pd.Series): Values to backfill. It is never modified in place.

    Returns:
        pd.Series: A new series with the same index as `ser`.
    """
    is_missing = ser.isna()
    known_values = ser[~is_missing].unique()
    # Backfill only when there is a gap to fill and no ambiguity about the value.
    if is_missing.any() and len(known_values) == 1:
        fill_value = known_values[0]
    else:
        fill_value = np.nan
    # `where` builds a new series rather than writing through `ser.values`,
    # which may be read-only and would otherwise mutate shared data.
    return ser.where(~is_missing, fill_value)

In [48]:
# Descriptive columns that get backfilled, and the columns kept for the export.
backfill_cols = ['fuel_type_code_pudl', 'fuel_type_primary_x', 'unit_type']
output_cols = [
    'plant_id_eia', 'unitid', 'gross_load_mw_fraction_owned_cap',
    'facility_name', 'fuel_type_primary', 'unit_type', 'operating_datetime_utc',
    'fraction_owned_agg_level',
]

# Stack the unit-level and plant-level results and drop rows without a timestamp.
cems_mul_combined = (
    pd.concat([cems_mul_gl_fraction_unit, cems_mul_gl_fraction_plant])
    .dropna(subset=['operating_datetime_utc'])
)

# Backfill the descriptive columns, restore the un-suffixed fuel column name
# (`_x` came from the merge with the ownership table), and keep the export columns.
cems_mul_final = (
    cems_mul_combined
    .assign(**{
        col: backfill_tech_description(cems_mul_combined[col])
        for col in backfill_cols
    })
    .rename(columns={'fuel_type_primary_x': 'fuel_type_primary'})
    .loc[:, output_cols]
    .copy()
)

In [49]:
# Reshape to wide format: one row per timestamp and one column per
# facility / plant / aggregation level / unit / fuel / unit type combination,
# which greatly reduces the number of rows.
pivot_column_keys = [
    'facility_name', 'plant_id_eia', 'fraction_owned_agg_level',
    'unitid', 'fuel_type_primary', 'unit_type',
]
cems_mul_wide = cems_mul_final.pivot(
    index=['operating_datetime_utc'],
    columns=pivot_column_keys,
)
# Order the columns by their outermost level.
cems_mul_piv = cems_mul_wide.sort_index(axis=1, level=[0])

In [50]:
# Inspect the distinct values of the pivoted columns' level at position 3
# (`fraction_owned_agg_level`) to see which aggregation levels are present.
cems_mul_piv.columns.levels[3]

Index(['plant', 'unit'], dtype='object', name='fraction_owned_agg_level')

In [52]:
# Name the export after the utility selected via `util_id` so a run for one
# utility cannot overwrite (or be mislabelled as) another utility's file.
utility_labels = {duke_id: 'Duke', fpl_id: 'FPL'}
cems_mul_piv.to_csv(f'CEMS_{utility_labels[util_id]}_gross_load.csv')