## Aggregate CEMS data at the utility-plant level for RMI

In [7]:
# Standard libraries
import logging
import os
import pathlib
import sys

# 3rd party libraries
import geopandas as gpd
import geoplot as gplt
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa


# Local libraries
import pudl

In [8]:
# Load the default PUDL workspace settings (used below as a dict of paths/URLs).
pudl_settings = pudl.workspace.setup.get_defaults()

# One SQLAlchemy engine per database URL, created in the order
# 'ferc1_db' then 'pudl_db'.
ferc1_engine, pudl_engine = (
    sa.create_engine(pudl_settings[db_key]) for db_key in ('ferc1_db', 'pudl_db')
)

# PudlTabl output object built on top of the PUDL engine.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine)

##### Prep FPL information (read from the master unit list, MUL)

In [9]:
# Read master unit list (MUL).
# The file location can be overridden with the PUDL_MUL_PATH environment variable
# so the notebook is not tied to a single machine; when the variable is unset the
# original local path is used.
# NOTE: unpickling can execute arbitrary code -- only point this at a trusted file.
mul_path = pathlib.Path(
    os.environ.get(
        'PUDL_MUL_PATH',
        '/Users/aesharpe/Desktop/Work/Catalyst_Coop/master_unit_list.pkl.gz',
    )
)
mul = pd.read_pickle(mul_path)

In [157]:
# Keep the MUL rows whose plant_part is 'plant_gen' and whose ownership is 'owned'.
owned_gen_rows = (mul['plant_part'] == 'plant_gen') & (mul['ownership'] == 'owned')
plant_mul = mul.loc[owned_gen_rows].copy()

# Spot-check the records of a single plant (plant_id_eia 207), showing only the
# columns relevant to ownership, generation and capacity.
preview_columns = [
    'report_date',
    'plant_id_eia',
    'generator_id',
    'capacity_mw',
    'net_generation_mwh',
    'fraction_owned',
    'utility_id_eia',
]
plant_mul.loc[plant_mul['plant_id_eia'] == 207, preview_columns]

Unnamed: 0_level_0,report_date,plant_id_eia,generator_id,capacity_mw,net_generation_mwh,fraction_owned,utility_id_eia
record_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
207_1_2009_plant_gen_owned_6452,2009-01-01,207,1,339.5,1914696.0,0.5,6452
207_1_2009_plant_gen_owned_9617,2009-01-01,207,1,339.5,1914696.0,0.5,9617
207_2_2009_plant_gen_owned_6452,2009-01-01,207,2,339.5,2339668.0,0.5,6452
207_2_2009_plant_gen_owned_9617,2009-01-01,207,2,339.5,2339668.0,0.5,9617
207_1_2010_plant_gen_owned_6452,2010-01-01,207,1,,0.0,,6452
207_1_2010_plant_gen_owned_9617,2010-01-01,207,1,,0.0,,9617
207_2_2010_plant_gen_owned_6452,2010-01-01,207,2,,0.0,,6452
207_2_2010_plant_gen_owned_9617,2010-01-01,207,2,,0.0,,9617
207_1_2011_plant_gen_owned_6452,2011-01-01,207,1,339.5,1699839.5,0.5,6452
207_1_2011_plant_gen_owned_9617,2011-01-01,207,1,339.5,1699839.5,0.5,9617


In [138]:
# Build the ownership table: one de-duplicated row per selected column combination,
# restricted to MUL rows with plant_part 'plant_gen' and ownership 'owned'.
owned_gen_mask = (mul['plant_part'] == 'plant_gen') & (mul['ownership'] == 'owned')
ownership_columns = [
    'plant_id_eia',
    'generator_id',
    'plant_name_eia',
    'year',
    'fraction_owned',
    'utility_id_eia',
    'net_generation_mwh',
    'capacity_mw',
]
plant_mul = (
    mul.loc[owned_gen_mask]
    .rename(columns={'report_year': 'year'})  # returns a new frame, `mul` is untouched
    .reset_index()
    .loc[:, ownership_columns]
    .drop_duplicates()
)

In [139]:
# For every (utility, plant, year) group, broadcast the group total of net
# generation and of capacity back onto each row of that group.
plant_group_keys = ['utility_id_eia', 'plant_id_eia', 'year']
plant_total_sources = {
    'net_gen_plant_sum': 'net_generation_mwh',
    'cap_plant_sum': 'capacity_mw',
}
for total_col, source_col in plant_total_sources.items():
    plant_mul[total_col] = (
        plant_mul.groupby(plant_group_keys)[source_col].transform('sum')
    )

In [140]:
# Split the table on whether an ownership fraction is present, so that the
# generation-weighted fraction is only computed (and NaN-filled) where it is defined.
plant_mul_FO_NA = plant_mul[plant_mul['fraction_owned'].isna()].copy()
plant_mul_FO = plant_mul[plant_mul['fraction_owned'].notna()].copy()

# Generation-weighted ownership: each row's net generation times its ownership
# fraction, divided by the (utility, plant, year) net-generation total.
plant_mul_FO['fraction_owned_gen'] = (
    plant_mul_FO.net_generation_mwh * plant_mul_FO.fraction_owned / plant_mul_FO.net_gen_plant_sum
)
# NaN results (e.g. 0 / 0 when the group total is zero) are treated as zero.
plant_mul_FO['fraction_owned_gen'] = plant_mul_FO['fraction_owned_gen'].fillna(0)

# Recombine both halves. DataFrame.append was removed in pandas 2.0; pd.concat
# produces the same stacked result (rows without a fraction keep NaN in
# 'fraction_owned_gen').
plant_mul = pd.concat([plant_mul_FO, plant_mul_FO_NA])

# Capacity-weighted ownership, computed for every row (NaN where the ownership
# fraction or capacity is missing).
plant_mul['fraction_owned_cap'] = (
    plant_mul.capacity_mw * plant_mul.fraction_owned / plant_mul.cap_plant_sum
)

In [141]:
# Keep only the ownership records that belong to the FPL utility (utility_id_eia 6452).
fpl_eia_id = 6452
fpl_gen_df = plant_mul[plant_mul['utility_id_eia'] == fpl_eia_id]

# Distinct plant ids for that utility; used later to filter the CEMS data.
# (Previously referenced the undefined name `fpl_df`, which raises NameError on a
# fresh kernel.)
fpl_plants = list(set(fpl_gen_df.plant_id_eia))

In [143]:
# Collapse the FPL records to one row per (plant, year): keep the first utility id
# and plant name seen, and sum the ownership fractions and net generation.
fpl_plant_df = (
    fpl_gen_df
    .groupby(['plant_id_eia', 'year'])
    .agg(
        utility_id_eia=('utility_id_eia', 'first'),
        plant_name_eia=('plant_name_eia', 'first'),
        fraction_owned_gen=('fraction_owned_gen', 'sum'),
        fraction_owned_cap=('fraction_owned_cap', 'sum'),
        net_generation_mwh=('net_generation_mwh', 'sum'),
    )
    .reset_index()
)

##### Prep CEMS for FPL integration

In [144]:
# CEMS
# Load the CEMS parquet partitions one year at a time, keep only the rows for
# the plants in `fpl_plants`, and stack the yearly results into `cems_df`.
years = range(2009, 2020)
client = Client()

# Only these columns are read from the parquet files.
my_cols = [
    'state',
    'plant_id_eia',
    'unitid',
    'gross_load_mw',
    'operating_datetime_utc',
]

# Collect the per-year frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each pass.
yearly_frames = []
for yr in years:
    print(f'starting calculation for {yr}')
    epacems_path = (pudl_settings['parquet_dir'] + f'/epacems/year={yr}')
    cems_dd = (
        dd.read_parquet(epacems_path, columns=my_cols)
        .assign(state=lambda x: x['state'].astype('string'))
    )
    cems_dd_fpl = cems_dd[cems_dd['plant_id_eia'].isin(fpl_plants)]

    # Materialise the filtered dask frame as pandas and tag it with its year.
    cems_df_fpl = (
        client.compute(cems_dd_fpl)
        .result()
        .assign(year=yr))
    yearly_frames.append(cems_df_fpl)

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
cems_df = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 60102 instead


starting calculation for 2009
starting calculation for 2010
starting calculation for 2011
starting calculation for 2012
starting calculation for 2013
starting calculation for 2014
starting calculation for 2015
starting calculation for 2016
starting calculation for 2017
starting calculation for 2018
starting calculation for 2019


In [145]:
# One row per (plant, timestamp): gross load is summed over all rows sharing that
# plant and timestamp, while state and year take the first value in the group.
cems_plant_df = (
    cems_df
    .groupby(['plant_id_eia', 'operating_datetime_utc'])
    .agg(
        state=('state', 'first'),
        year=('year', 'first'),
        gross_load_mw=('gross_load_mw', 'sum'),
    )
    .reset_index()
)

##### Combine CEMS and MUL on FPL data

In [146]:
# Attach the per-(plant, year) FPL ownership data to every CEMS row; the left join
# keeps CEMS rows even when no matching ownership record exists.
cems_fpl = cems_plant_df.merge(
    fpl_plant_df,
    how='left',
    on=['plant_id_eia', 'year'],
)

In [147]:
# Rename the net-generation column (rename returns a new frame, so `cems_fpl`
# itself is left unchanged) ...
cems_fpl_utility = cems_fpl.rename(columns={'net_generation_mwh': 'net_gen_annual_mwh'})
# ... then append the gross load scaled by the generation-weighted ownership fraction.
cems_fpl_utility['gross_load_mw_fraction_owned_gen'] = (
    cems_fpl_utility['gross_load_mw'] * cems_fpl_utility['fraction_owned_gen']
)

In [158]:
# Write the combined table to a CSV file in the current working directory.
cems_fpl_utility.to_csv(path_or_buf='CEMS_FPL_gross_load.csv')