## Aggregate CEMS data at the utility-plant level for RMI

In [1]:
# Standard libraries
import logging
import os
import pathlib
import sys

# 3rd party libraries
import geopandas as gpd
import geoplot as gplt
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa


# Local libraries
import pudl

In [2]:
# Look up the default PUDL workspace settings (a mapping of paths / DB URLs).
pudl_settings = pudl.workspace.setup.get_defaults()

# One SQLAlchemy engine per database named in the settings, created in the
# same order as before: FERC Form 1 first, then PUDL.
ferc1_engine, pudl_engine = (
    sa.create_engine(pudl_settings[db_key]) for db_key in ('ferc1_db', 'pudl_db')
)

# PUDL output-table helper bound to the PUDL database engine.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine)

In [32]:
# Utility IDs this notebook can be run for. The selected one is compared
# against the MUL's `utility_id_eia` column when filtering plants below.
# NOTE(review): presumably fpl = Florida Power & Light and duke = Duke Energy
# — confirm against the EIA utility table.
fpl_id = 6452
duke_id = 5416

# The utility whose plants are aggregated in the rest of the notebook.
util_id = duke_id

##### Prep Utility Information (Read from MUL)

In [15]:
# Read the master unit list (MUL).
# The location can be overridden with the MUL_PATH environment variable so the
# notebook runs on machines other than the original author's; when the variable
# is unset, the original hard-coded path is used unchanged.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# only point this at a MUL pickle from a trusted source.
mul_path = pathlib.Path(
    os.environ.get(
        'MUL_PATH',
        '/Users/aesharpe/Desktop/Work/Catalyst_Coop/master_unit_list.pkl.gz',
    )
)
mul = pd.read_pickle(mul_path)

In [26]:
# Keep only the generator-level ('plant_gen') records of the MUL that are
# flagged as 'owned'.
# NOTE(review): the following cell rebuilds `plant_mul` from `mul` using this
# same filter, so this cell looks redundant — confirm before removing it.
plant_mul = mul.loc[
    mul['plant_part'].eq('plant_gen') & mul['ownership'].eq('owned')
].copy()

In [27]:
# Build the generator-level ownership table: owned 'plant_gen' records only,
# `report_year` renamed to `year`, trimmed to the columns used downstream and
# de-duplicated.
plant_mul_cols = [
    'plant_id_eia',
    'generator_id',
    'plant_name_eia',
    'year',
    'fraction_owned',
    'utility_id_eia',
    'net_generation_mwh',
    'capacity_mw',
]
plant_mul = (
    mul.loc[(mul['plant_part'] == 'plant_gen') & (mul['ownership'] == 'owned')]
    .rename(columns={'report_year': 'year'})
    # Move any index levels into columns so they can be selected by name.
    .reset_index()
    .loc[:, plant_mul_cols]
    .drop_duplicates()
)

In [28]:
# Plant-level totals: for every (utility, plant, year) group, sum the
# generator-level net generation and capacity and broadcast the group total
# back onto each generator row.
plant_year_totals = (
    plant_mul
    .groupby(['utility_id_eia', 'plant_id_eia', 'year'])
    [['net_generation_mwh', 'capacity_mw']]
    .transform('sum')
)

plant_mul['net_gen_plant_sum'] = plant_year_totals['net_generation_mwh']
plant_mul['cap_plant_sum'] = plant_year_totals['capacity_mw']

In [29]:
# Calculate each generator's contribution to the utility's owned share of its
# plant, weighted once by net generation and once by capacity:
#   fraction_owned_gen = net_generation_mwh * fraction_owned / net_gen_plant_sum
#   fraction_owned_cap = capacity_mw        * fraction_owned / cap_plant_sum
# Summing these over a plant's generators gives the plant-level owned fraction.

# Rows without a reported ownership fraction are set aside so that the
# fillna(0) below only touches rows where a fraction was actually reported.
plant_mul_FO_NA = plant_mul[plant_mul['fraction_owned'].isna()].copy()
plant_mul_FO = plant_mul[plant_mul['fraction_owned'].notna()].copy()

plant_mul_FO['fraction_owned_gen'] = (
    plant_mul_FO.net_generation_mwh * plant_mul_FO.fraction_owned / plant_mul_FO.net_gen_plant_sum
)
# Missing net generation (or a 0/0 plant total) yields NaN; treat it as a zero share.
# NOTE(review): a non-zero numerator over a zero plant total yields +/-inf,
# which is not replaced here — confirm whether that can occur.
plant_mul_FO['fraction_owned_gen'] = plant_mul_FO['fraction_owned_gen'].fillna(0)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the frames in the same order, keeps the original index, and leaves
# `fraction_owned_gen` as NaN for the rows that had no ownership fraction.
plant_mul = pd.concat([plant_mul_FO, plant_mul_FO_NA])

plant_mul['fraction_owned_cap'] = (
    plant_mul.capacity_mw * plant_mul.fraction_owned / plant_mul.cap_plant_sum
)

In [33]:
# Restrict the generator table to the selected utility and collect the
# distinct plant IDs it appears at (used to filter CEMS below).
util_gen_df = plant_mul.loc[plant_mul['utility_id_eia'].eq(util_id)]
util_plants = list(set(util_gen_df['plant_id_eia']))

In [34]:
# Roll the utility's generator rows up to one row per plant and year:
# identifiers take the first value in the group, while the ownership shares
# and net generation are summed across generators.
util_plant_df = (
    util_gen_df
    .groupby(['plant_id_eia', 'year'])
    .agg(
        utility_id_eia=('utility_id_eia', 'first'),
        plant_name_eia=('plant_name_eia', 'first'),
        fraction_owned_gen=('fraction_owned_gen', 'sum'),
        fraction_owned_cap=('fraction_owned_cap', 'sum'),
        net_generation_mwh=('net_generation_mwh', 'sum'),
    )
    .reset_index()
)

##### Prep CEMS for Utility integration

In [36]:
# Load EPA CEMS hourly data for the selected utility's plants, one year at a
# time, from the year-partitioned parquet dataset under
# pudl_settings['parquet_dir'].
years = range(2009, 2020)
# Local dask cluster used to evaluate each year's filtered frame.
# NOTE(review): the client is left running after this cell, as in the
# original; call client.close() once it is no longer needed.
client = Client()

# Only the columns needed for the gross-load aggregation are read.
my_cols = [
    'state',
    'plant_id_eia',
    'unitid',
    'gross_load_mw',
    'operating_datetime_utc',
]

# Collect the per-year frames and concatenate once after the loop. Growing a
# DataFrame with pd.concat inside the loop re-copies all previously loaded
# rows on every iteration and starts from an empty frame.
yearly_frames = []
for yr in years:
    print(f'starting calculation for {yr}')
    epacems_path = (pudl_settings['parquet_dir'] + f'/epacems/year={yr}')
    cems_dd = (
        dd.read_parquet(epacems_path, columns=my_cols)
        # Cast `state` to the pandas string dtype before materializing.
        .assign(state=lambda x: x['state'].astype('string'))
    )
    # Keep only rows for plants the selected utility appears at.
    cems_dd_util = cems_dd[cems_dd['plant_id_eia'].isin(util_plants)]

    # Materialize this year's rows as a pandas DataFrame and tag them with the
    # partition year so they can be joined to the annual ownership table.
    cems_df_util = (
        client.compute(cems_dd_util)
        .result()
        .assign(year=yr))
    yearly_frames.append(cems_df_util)

# pd.concat raises on an empty list, so fall back to an empty frame (what the
# original produced) if `years` is empty.
cems_df = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

starting calculation for 2009
starting calculation for 2010
starting calculation for 2011
starting calculation for 2012
starting calculation for 2013
starting calculation for 2014
starting calculation for 2015
starting calculation for 2016
starting calculation for 2017
starting calculation for 2018
starting calculation for 2019


In [37]:
# Collapse unit-level CEMS rows to one row per plant and hour: gross load is
# summed across the plant's units; state and year take the group's first value.
cems_plant_df = (
    cems_df
    .groupby(['plant_id_eia', 'operating_datetime_utc'])
    .agg(
        state=('state', 'first'),
        year=('year', 'first'),
        gross_load_mw=('gross_load_mw', 'sum'),
    )
    .reset_index()
)

##### Combine CEMS and MUL on Utility data

In [40]:
# Attach the utility's annual plant-level ownership data to every hourly CEMS
# row. A left join keeps CEMS hours even when no ownership row exists for that
# plant and year.
cems_util = cems_plant_df.merge(
    util_plant_df,
    how='left',
    on=['plant_id_eia', 'year'],
)

In [41]:
# Final table: label the MUL net generation as an annual figure and add the
# hourly gross load scaled by the generation-weighted ownership fraction.
cems_util_final = (
    cems_util
    .rename(columns={'net_generation_mwh': 'net_gen_annual_mwh'})
    .assign(
        gross_load_mw_fraction_owned_gen=lambda df: (
            df['gross_load_mw'] * df['fraction_owned_gen']
        )
    )
)

In [46]:
# Write the result to a CSV named after the utility selected via `util_id`.
# The name used to be hard-coded to "Duke", so running the notebook for
# another utility would overwrite (and mislabel) the Duke output. For
# util_id == duke_id the file name is unchanged; unknown IDs fall back to the
# numeric ID.
util_labels = {duke_id: 'Duke', fpl_id: 'FPL'}
cems_util_final.to_csv(f'CEMS_{util_labels.get(util_id, util_id)}_gross_load.csv')