In [1]:
# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

In [39]:
# import the necessary packages
%reload_ext autoreload
%autoreload 2

# import packages
import os
import requests
import tarfile
import sqlalchemy as sa
from pathlib import Path
import pandas as pd
import plotly.express as px
import numpy as np

import src.data_cleaning as data_cleaning
import src.gross_to_net_generation as gross_to_net_generation
import src.load_data as load_data
import src.distribute_eia923 as distribute_eia923

# Specify the year for analysis

In [3]:
year = 2020

# 1. Download data

 - Downloads the pre-cleaned PUDL versions of EIA-923, EIA-860, and EPA CEMS data  
 - Downloads EPA eGRID data  
 - Downloads EIA-930 data  
 - Downloads the EPA Power Sector Data Crosswalk

TODO
- [x] The code for downloading the files could probably be made into functions
- [ ] Investigate other packages besides `requests` that would download these files faster

In [4]:
############### PUDL Database ######################

load_data.download_pudl_data(zenodo_url = 'https://zenodo.org/record/5701406/files/pudl-v0.5.0-2021-11-14.tgz')

################# eGRID data #########################

# the 2019 and 2020 data appear to be hosted on different urls
egrid_files_to_download = ['https://www.epa.gov/sites/default/files/2021-02/egrid2019_data.xlsx', 
                           'https://www.epa.gov/system/files/documents/2022-01/egrid2020_data.xlsx']

load_data.download_egrid_files(egrid_files_to_download)

############# EIA-930 data #####################

load_data.download_eia930_data(years_to_download=[year])

########## Power Sector Data Crosswalk #############
# NOTE: Check for new releases at https://github.com/USEPA/camd-eia-crosswalk

load_data.download_epa_psdc(psdc_url='https://github.com/USEPA/camd-eia-crosswalk/releases/download/v0.2.1/epa_eia_crosswalk.csv')


PUDL data already downloaded
egrid2019_data.xlsx already downloaded
egrid2020_data.xlsx already downloaded
2020_Jan_Jun data already downloaded
2020_Jul_Dec data already downloaded
epa_eia_crosswalk.csv already downloaded


# Load emissions data reported to CEMS
There are three broad categories of plants based on their CAMD reporting status:
1. Units that report to CAMD year-round (for these plants, emissions data is used directly from CEMS)
2. Units that only report to CAMD during the ozone season (May-Sept) (for these units, non-ozone season data is taken from EIA 923)
3. Units that do not report to CAMD (generally fossil units < 25MW and non-fossil generators)

There are also certain plants that report to CAMD but do not procuce electricty for the grid, and need to be removed from the CEMS data:
- Non grid connected plants
- Steam-only plants


In [None]:
# NOTE: all of the functions in this section could be run by calling clean_cems()
#cems = data_cleaning.clean_cems(year)

In [5]:
# load the CEMS data
cems = load_data.load_cems_data(year)

# remove non-grid connected plants
cems = data_cleaning.remove_non_grid_connected_plants(cems)

# remove plants that only report steam generation and no electrical generation
cems = data_cleaning.remove_heating_only_plants(cems)

# add a report date
cems = data_cleaning.add_report_date(cems)

# identify cems reporting status
cems = data_cleaning.determine_cems_reporting_status(cems)

# TODO: identify and remove any hourly values that appear to be outliers


# fill in missing hourly emissions data using the fuel type and heat input
cems = data_cleaning.fill_cems_missing_co2(cems, year)

In [6]:
# identify any remaining missing values
# TODO: Try to identify fuel types
# NOTE: plant 880109 appears to be a paper mill in Ohio, so should maybe be added to non-grid connected

units_with_no_fuel_type = list(cems[cems['co2_mass_tons'].isnull()]['cems_id'].unique())
print(f"Unable to find fuel types for the following plants_units: {units_with_no_fuel_type}")

cems[cems['co2_mass_tons'].isnull()]

Unable to find fuel types for the following plants_units: ['1004_CTG1', '880109_B001']


Unnamed: 0,plant_id_eia,unitid,cems_id,operating_datetime_utc,operating_time_hours,gross_load_mw,gross_generation_mwh,steam_load_1000_lbs,heat_content_mmbtu,co2_mass_tons,co2_mass_measurement_code,plant_id_epa,unit_id_epa,report_date,cems_reporting_category,energy_source_code
10905359,1004,CTG1,1004_CTG1,2020-04-10 04:00:00+00:00,0.03,0.0,0.0,,0.000900,,Measured,1004,90673,2020-04-01,full_year,SGC
11497174,1004,CTG1,1004_CTG1,2020-11-02 03:00:00+00:00,0.03,0.0,0.0,,0.000900,,Measured,1004,90673,2020-11-01,full_year,SGC
23945496,880109,B001,880109_B001,2020-05-01 05:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-05-01,partial_year,
23945497,880109,B001,880109_B001,2020-05-01 06:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-05-01,partial_year,
23945498,880109,B001,880109_B001,2020-05-01 07:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-05-01,partial_year,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24290251,880109,B001,880109_B001,2020-10-01 00:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-09-01,partial_year,
24290252,880109,B001,880109_B001,2020-10-01 01:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-09-01,partial_year,
24290253,880109,B001,880109_B001,2020-10-01 02:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-09-01,partial_year,
24290254,880109,B001,880109_B001,2020-10-01 03:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-09-01,partial_year,


In [7]:
# For now, lets drop these from the data
cems = cems[~cems['cems_id'].isin(units_with_no_fuel_type)]

In [8]:
# remove any observations from cems where zero operation is reported for an entire month
# although this data could be considered to be accurately reported, let's remove it so that we can double check against the eia data
# TODO: check if any of these observations are from geothermal generators
cems = data_cleaning.remove_cems_with_zero_monthly_emissions(cems)

removing 7002048 observations from cems


In [9]:
# add information about the balancing authority 
#cems = cems.drop(columns=['ba_code','state'])
cems = data_cleaning.assign_ba_code_to_plant(cems, year)

## Explore outlier detection
We need to come up with a method that filters out observations that are significantly higher than normal operation. The challenge is that some plants only operate a handful of hours each year, so their operation looks spikey, which would be identified as an outlier using typical detection methods. 

In [15]:
max = cems.replace(0, np.NaN).groupby(['cems_id'])['heat_content_mmbtu'].max()
mean = cems.replace(0, np.NaN).groupby(['cems_id'])['heat_content_mmbtu'].mean()
stdev = cems.replace(0, np.NaN).groupby(['cems_id'])['heat_content_mmbtu'].std()

max[max > mean + (3 * stdev)]

In [None]:
px.line(cems[cems['cems_id'] == '10298_CG803'], x='operating_datetime_utc', y='heat_content_mmbtu')

In [None]:
px.line(cems[cems['cems_id'] == '1012_2'], x='operating_datetime_utc', y='heat_content_mmbtu')

# 2. Get monthly data for all plants/units where data is missing from CEMS
We have now identified all plants that report the full year to CEMS, and all plants that report a partial year. We will now use the EIA-923 data to fill in the missing pieces.

1. Load EIA-923 data, and standardize heat input and generation data across the tables
2. Identify all plants/months for which we do not have CEMS data

We need to be able to match the EIA data to the CEMS data based on units so we know which data will be used to fill the missing data


In [40]:
# Distribute net generation and heat input data reported by the three different EIA-923 tables
# NOTE: this code was copied and modified from `pudl.analysis.allocate_net_gen`
# NOTE: this code allocates net generation based on the proportion of net generation reported, rather than by nameplate capacity (which eGRID does)
# NOTE: the code was modified to perform the allocation on a monthly basis, rather than an annual basis

# HIGH PRIORITIES
# DONE: remove non-grid connected plants from this dataframe
# DONE: Denormalize data by balancing authority/state. BA assignment from EIA-860
# TODO: look into whether net_gen from generation table should be preserved if available
# TODO: figure out what happens when each table has different values for net gen (allocate difference, or take precedence)
# DONE: add nuclear generators to this list

# FUEL ASSIGNMENT
# DONE: calculate total emissions from gf based on fuel and heat input and distribute in addition to net generation and fuel consumed
# TODO: when aggregating back to generator records, keep the fuel type that accounted for most heat input
# TODO: denormalize data by fuel type both primary fuel type by generator, and primary fuel by plant (assuming that's how reported to ISOs)
# primary fuel type is currently assigned based on the annual primary fuel type. This should be changed to assign base on monthly fuel type

# NEXT PRIORITIES
# TODO: allocate heat input data from boiler_fuel_eia923() See: https://github.com/catalyst-cooperative/pudl/pull/1096
# TODO: Also distribute heat input for electricity consumption

# LOWER PRIORITIES
# TODO: fix allocation of net generation when reported net generation is negative?
# TODO: investigate generators for which frac column is not adding to 1.0

gen_fuel_allocated = distribute_eia923.allocate_gen_fuel_by_gen(year=year)

# flag any generator-months for which we already have cems data
gen_fuel_allocated = data_cleaning.identify_emissions_data_source(cems, gen_fuel_allocated)

# create a separate dataframe containing only the generators for which we do not have CEMS data
monthly_eia_data_to_distribute = gen_fuel_allocated[(gen_fuel_allocated['data_source'] == 'eia_only') & ~(gen_fuel_allocated['fuel_consumed_mmbtu'].isna())]

gen_fuel_allocated.sample(10)

        plant_id_eia prime_mover_code energy_source_code report_date  frac  \
3413             377               CA                 NG  2020-04-01   2.0   
3414             377               CT                 NG  2020-04-01   2.0   
3441             377               CA                 NG  2020-11-01   2.0   
3442             377               CT                 NG  2020-11-01   2.0   
3445             377               CA                 NG  2020-12-01   2.0   
...              ...              ...                ...         ...   ...   
104483         58207               CA                 NG  2020-03-01   4.0   
104487         58207               CA                 NG  2020-04-01   4.0   
104495         58207               CA                 NG  2020-06-01   4.0   
104515         58207               CA                 NG  2020-11-01   4.0   
104519         58207               CA                 NG  2020-12-01   4.0   

        net_generation_mwh_g_tbl  frac_fuel  net_generation_mwh

Unnamed: 0,plant_id_eia,generator_id,report_date,net_generation_mwh,fuel_consumed_mmbtu,co2_mass_tons,ba_code,state,energy_source_code_1,data_source
144060,55053,655,2020-07-01,387.725462,3950.923077,230.891945,ERCO,TX,NG,eia_only
11122,539,2,2020-09-01,26.532258,0.0,0.0,ISNE,CT,WAT,eia_only
72954,4080,3,2020-06-01,2668.15775,23407.75,0.0,MISO,WI,WAT,eia_only
63515,3456,5CT1,2020-11-01,41018.0,518951.556645,30327.52897,EPE,TX,NG,cems
50197,2607,1,2020-05-01,696.639,6112.0,0.0,NYIS,NY,WAT,eia_only
273881,63580,DD300,2020-09-01,,,,NYIS,NY,DFO,eia_only
111263,10694,8281,2020-02-01,232.363636,2038.545455,0.0,ISNE,MA,WAT,eia_only
181631,56948,4,2020-07-01,4597.916667,46067.916667,2692.20905,SRP,AZ,NG,cems
180214,56880,CS12,2020-06-01,240.5,2110.0,0.0,SWPP,MT,WH,eia_only
54277,2914,3,2020-01-01,0.0,0.0,0.0,PJM,OH,BIT,eia_only


In [41]:
# for which plants are we still missing co2 data?
gen_fuel_allocated[(gen_fuel_allocated['co2_mass_tons'].isna()) & (gen_fuel_allocated['fuel_consumed_mmbtu'] > 0)]

Unnamed: 0,plant_id_eia,generator_id,report_date,net_generation_mwh,fuel_consumed_mmbtu,co2_mass_tons,ba_code,state,energy_source_code_1,data_source
122458,50626,GEN1,2020-01-01,742.746,381046.0,,MISO,LA,OTH,eia_only
122459,50626,GEN1,2020-02-01,700.137,359168.0,,MISO,LA,OTH,eia_only
122460,50626,GEN1,2020-03-01,654.929,335974.0,,MISO,LA,OTH,eia_only
122461,50626,GEN1,2020-04-01,628.143,322234.0,,MISO,LA,OTH,eia_only
122462,50626,GEN1,2020-05-01,562.301,288461.0,,MISO,LA,OTH,eia_only
122463,50626,GEN1,2020-06-01,626.327,321326.0,,MISO,LA,OTH,eia_only
122464,50626,GEN1,2020-07-01,702.505,360388.0,,MISO,LA,OTH,eia_only
122465,50626,GEN1,2020-08-01,836.592,429186.0,,MISO,LA,OTH,eia_only
122466,50626,GEN1,2020-09-01,730.017,374505.0,,MISO,LA,OTH,eia_only
122467,50626,GEN1,2020-10-01,829.739,425649.0,,MISO,LA,OTH,eia_only


In [52]:
# investigate plants that don't have fuel codes
plants = load_data.load_pudl_table("plants_entity_eia")
gf = load_data.load_pudl_table("generation_fuel_eia923", year=year)
#plants[plants['plant_id_eia'] == 50626]
#gf[gf['plant_id_eia'] == 50626
# look at plants that are refineries
#gf[gf['plant_id_eia'].isin(list(plants.fillna('').loc[plants.fillna('')['plant_name_eia'].str.contains('refin', case=False)]['plant_id_eia']))]

In [42]:
# what percent of generators are in CEMS vs not
gen_fuel_allocated.groupby('data_source').sum() / gen_fuel_allocated.groupby('data_source').sum().sum(axis=0)

Unnamed: 0_level_0,plant_id_eia,net_generation_mwh,fuel_consumed_mmbtu,co2_mass_tons
data_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cems,0.108901,0.546827,0.501677,0.79872
eia_only,0.891099,0.453173,0.498323,0.20128


## Calculate CEMS net generation
Now that we have accurate net generation data from EIA, we can use this to calculate a net generation ratio to convert the CEMS gross generation to hourly net generation

For now, we will calculate the allocation at the plant level. However, in the future, we may want to calculate for each EPA unit, which will require developing a scheme for allocating each EPA unit to EIA generator

In [66]:
cems = data_cleaning.convert_gross_to_net_generation(cems, gen_fuel_allocated)

  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  return self.resid / sigma / np.sqrt(1 - hii)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return self.resid / sigma / np.sqrt(1 - hii)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - self.ssr/self.centered_tss
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature

# Adjust emissions
We next need to make certain adjustments to the data:
 - [ ] Calculate emissions for Geothermal plants
 - [ ] Adjust heat input/emissions from CHP plants by proportion used for electric generation
 - EPA adjusts biomass emissions, but not sure if we want to do that. Need to look into it more

# Output CEMS data

In [None]:
# output the cems data to csv for others to use (update the date)
cems_for_export = cems.loc[cems['co2_mass_tons'] > 0, ['plant_id_eia', 'unitid', 'operating_datetime_utc',
       'gross_generation_mwh','net_generation_mwh','steam_load_1000_lbs', 'heat_content_mmbtu', 'co2_mass_tons',
       'report_date', 'cems_reporting_category', 'energy_source_code',
       'ba_code', 'state']]

date = '20220415'
cems_for_export.to_csv(f'../data/output/cems_{year}_cleaned_{date}.csv')

# Compare results to eGRID totals

Before we allocate the data to the hourly level, we should double check that the total annual emissions / generation values match the "official" data published in eGRID at the annual level.

In [68]:
# Aggregate total calculated values
###################################

# combine cems and eia data
cems_plant_annual = cems.groupby(['ba_code','state','plant_id_eia'], dropna=False).sum()[['net_generation_mwh','heat_content_mmbtu','co2_mass_tons']].rename(columns={'heat_content_mmbtu':'heat_input_mmbtu'}).reset_index()
eia_plant_annual = monthly_eia_data_to_distribute.groupby(['ba_code','state','plant_id_eia'], dropna=False).sum()[['net_generation_mwh','fuel_consumed_mmbtu','co2_mass_tons']].rename(columns={'fuel_consumed_mmbtu':'heat_input_mmbtu'}).reset_index()

plant_annual_total = pd.concat([cems_plant_annual,eia_plant_annual], axis=0)

# group any plants that have records from both datasets
plant_annual_total = plant_annual_total.groupby(['ba_code','state','plant_id_eia'], dropna=False).sum().reset_index()

# Load the eGRID plant table
############################

# load plant level data from egrid
egrid_plant = pd.read_excel(f'../data/egrid/egrid{year}_data.xlsx', 
                            sheet_name=f'PLNT{str(year)[-2:]}', 
                            header=1, 
                            usecols=['BACODE','PSTATABB', 'ORISPL', 'PNAME','PLGENATN', 'PLGENATR', 'PLHTIANT','UNCO2','PLCO2AN'])
# calculate total net generation
egrid_plant['net_generation_mwh'] = egrid_plant['PLGENATN'] + egrid_plant['PLGENATR']
egrid_plant = egrid_plant.drop(columns=['PLGENATN', 'PLGENATR'])
# rename the columns
egrid_plant = egrid_plant.rename(columns={'BACODE':'ba_code',
                                          'PSTATABB':'state',
                                          'ORISPL':'plant_id_eia',
                                          'PNAME':'plant_name',
                                          'PLHTIANT':'heat_input_mmbtu',
                                          'UNCO2':'co2_mass_tons_unadjusted',
                                          'PLCO2AN':'co2_mass_tons'})
# reorder the columns                                         
egrid_plant = egrid_plant[['ba_code', 'state', 'plant_id_eia', 'plant_name', 'net_generation_mwh', 'heat_input_mmbtu', 'co2_mass_tons', 'co2_mass_tons_unadjusted']]

## Identify plants that are missing from each dataset

In [69]:
# identify any plants that are in egrid but not our totals, and any plants that are in our totals, but not egrid
plants_not_in_total = list(set(egrid_plant['plant_id_eia'].unique()) - set(plant_annual_total['plant_id_eia'].unique()))
plants_not_in_egrid = list(set(plant_annual_total['plant_id_eia'].unique()) - set(egrid_plant['plant_id_eia'].unique()))

# Which plants are included in eGRID but are missing from our calculations?
missing_from_calc = egrid_plant[egrid_plant['plant_id_eia'].isin(plants_not_in_total)]
missing_from_calc

Unnamed: 0,ba_code,state,plant_id_eia,plant_name,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,co2_mass_tons_unadjusted
0,,AK,60814,7-Mile Ridge Wind Project,,,,
1,,AK,54452,Agrium Kenai Nitrogen Operations,,,,
18,,AK,93,Blue Lake Hydro,,,,
49,,AK,313,Green Lake,,,,
53,,AK,59037,Hiilangaay Hydro,,,,
...,...,...,...,...,...,...,...,...
12642,WAUW,WY,674,Pilot Butte,,,,
12644,WACM,WY,64847,Rail Tie Wind,,,,
12659,PACE,WY,62516,TB Flats,,,,
12662,PACE,WY,63972,Two Rivers Wind Facility,,,,


In [75]:
# of these plants that are missing from our data, how many of them have non-zero data in eGRID?
missing_from_calc[missing_from_calc[['net_generation_mwh','heat_input_mmbtu','co2_mass_tons','co2_mass_tons_unadjusted']].sum(axis=1) > 0].to_csv('../data/temp/plants_missing_from_calcs.csv')
missing_from_calc[missing_from_calc[['net_generation_mwh','heat_input_mmbtu','co2_mass_tons','co2_mass_tons_unadjusted']].sum(axis=1) > 0]

Unnamed: 0,ba_code,state,plant_id_eia,plant_name,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,co2_mass_tons_unadjusted
224,SOCO,AL,50359,Sloss Industries Corp,0.0,0.000,0.000,96648.513
710,CISO,CA,302,Cabrillo Power I Encina Power Station,388042.0,4474181.000,265896.801,265896.801
820,CISO,CA,57807,Coca Cola American Canyon,483.0,4187.000,,
976,CISO,CA,57301,Dutch Wind Energy,217.0,1904.000,,
1002,CISO,CA,330,El Segundo,395756.0,3540209.750,210389.805,210389.805
...,...,...,...,...,...,...,...,...
12263,BPAT,WA,50231,SDS Lumber Gorge Energy Division,0.0,0.000,0.000,32925.781
12346,MISO,WI,59559,Clean Fuel Dane Community Digester,3892.0,21531.287,0.000,2384.798
12536,MISO,WI,56266,WPPI Hartford DG,3.0,30.000,2.450,2.450
12568,PJM,WV,10743,Morgantown Energy Facility,0.0,0.000,0.000,43514.354


In [76]:
# Which plants are in our calculations, but are missing from eGRID?
plant_names = load_data.load_pudl_table('plants_entity_eia')[['plant_id_eia','plant_name_eia','sector_name_eia']]
missing_from_egrid = plant_annual_total[plant_annual_total['plant_id_eia'].isin(plants_not_in_egrid)].merge(plant_names, how='left', on='plant_id_eia')
missing_from_egrid

Unnamed: 0,ba_code,state,plant_id_eia,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,plant_name_eia,sector_name_eia
0,CISO,CA,55874,3826.0,58814.0,3437.09,Panoche Peaker,IPP Non-CHP
1,CISO,CA,57901,395756.0,6857176.0,407512.7,El Segundo Energy Center LLC,IPP Non-CHP
2,CISO,CA,59002,388042.0,4274786.0,254047.0,Carlsbad Energy Center,IPP Non-CHP
3,CPLE,NC,7538,175398.0,2448530.0,146769.8,Wayne County,Electric Utility
4,CPLE,NC,58697,4078353.0,57400300.0,3411234.0,L V Sutton Combined Cycle,Electric Utility
5,DUK,NC,58215,5654012.0,81298870.0,4831487.0,Lee Combined Cycle Plant,Electric Utility
6,ERCO,TX,7512,3187965.0,44872420.0,2666693.0,A Von Rosenberg,Electric Utility
7,ERCO,TX,55545,2529351.0,35715020.0,2122485.0,Hidalgo Energy Center,IPP Non-CHP
8,FPC,FL,7294,0.0,5400.395,321.1518,Central Energy Plant,Electric Utility
9,ISNE,CT,57068,693.0,55754.36,4412.329,GenConn Middletown LLC,IPP Non-CHP


In [77]:
# for some reason, egrid removes certain plants that are found in the ORIS crosswalk, and then never adds them back in. 
# It seems that these plants should be kept, and this might be a mistake in eGRID
plants_removed_from_egrid = pd.read_csv('../data/egrid/egrid_static_tables/table_C5_crosswalk_of_EIA_ID_to_EPA_ID.csv')
plants_removed_from_egrid = plants_removed_from_egrid.rename(columns={'EIA ORISPL ID':'plant_id_eia','EIA Plant Name':'plant_name_eia','EPA/CAMD ORISPL ID':'plant_id_epa','EPA/CAMD Plant Name':'plant_name_epa'})

# let's re-examine this list to see which plants are missing that are not in this list
missing_from_egrid[~missing_from_egrid['plant_id_eia'].isin(list(plants_removed_from_egrid['plant_id_eia'].unique()))]

Unnamed: 0,ba_code,state,plant_id_eia,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,plant_name_eia,sector_name_eia
2,CISO,CA,59002,388042.0,4274786.0,254047.048135,Carlsbad Energy Center,IPP Non-CHP
3,CPLE,NC,7538,175398.000462,2448530.0,146769.806827,Wayne County,Electric Utility
25,,,55248,700265.0625,8503199.0,501661.66297,Tait,IPP Non-CHP
26,,,880075,0.0,1026400.0,59982.816,BP Amoco Chemical Company,
27,,,880079,329069.71875,4952602.0,289430.053835,Tate & Lyle-Loudon,


In [96]:
# what are the plants that egrid removed?
missing_from_egrid[missing_from_egrid['plant_id_eia'].isin(list(plants_removed_from_egrid['plant_id_eia'].unique()))]

Unnamed: 0,ba_code,state,plant_id_eia,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,plant_name_eia,sector_name_eia
0,CISO,CA,55874,3826.0,58814.0,3437.09,Panoche Peaker,IPP Non-CHP
1,CISO,CA,57901,395756.0,6857176.0,407512.7,El Segundo Energy Center LLC,IPP Non-CHP
4,CPLE,NC,58697,4078353.0,57400300.0,3411234.0,L V Sutton Combined Cycle,Electric Utility
5,DUK,NC,58215,5654012.0,81298870.0,4831487.0,Lee Combined Cycle Plant,Electric Utility
6,ERCO,TX,7512,3187965.0,44872420.0,2666693.0,A Von Rosenberg,Electric Utility
7,ERCO,TX,55545,2529351.0,35715020.0,2122485.0,Hidalgo Energy Center,IPP Non-CHP
8,FPC,FL,7294,0.0,5400.395,321.1518,Central Energy Plant,Electric Utility
9,ISNE,CT,57068,693.0,55754.36,4412.329,GenConn Middletown LLC,IPP Non-CHP
10,MISO,IN,10397,194744.0,8026617.0,1524149.0,Indiana Harbor,Industrial CHP
11,MISO,IN,54995,475667.0,11096500.0,3074503.0,5 AC Station,Industrial CHP


## Identify plants for which we are missing a BA assignment
(of the plants not already missing from our calculated totals)

In [78]:
ba_code_match = egrid_plant.set_index('plant_id_eia')[['plant_name','ba_code']].merge(plant_annual_total.set_index('plant_id_eia')[['ba_code']], how='inner', left_index=True, right_index=True, suffixes=("_egrid",'_calc'))

# plants with missing ba code
ba_code_match[(ba_code_match['ba_code_calc'].isna()) & ~(ba_code_match['ba_code_egrid'].isna())]

Unnamed: 0_level_0,plant_name,ba_code_egrid,ba_code_calc
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
57698,Aerojet II,BANC,
55864,Kaheka Hydro,HECO,
58277,Paia Hydroelectric Plant,HECO,
7966,Iowa Distributed Wind Generation Project,SWPP,


## Identify plants for which we have incorrectly assigned the BA code

In [79]:
# plants with incorrect ba code
ba_code_match[(ba_code_match['ba_code_calc'] != ba_code_match['ba_code_egrid']) & ~(ba_code_match['ba_code_calc'].isna())]

Unnamed: 0_level_0,plant_name,ba_code_egrid,ba_code_calc
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
55306,Gila River Power Station,TEPC,SRP
10093,Tesoro Hawaii,,HECO


## Identify where our calculated totals do not match eGRID's totals

In [99]:
# standardize column names and index so that the two dfs can be divided
calculated_to_compare = plant_annual_total.set_index('plant_id_eia').drop(columns=['ba_code','state'])
calculated_to_compare['co2_mass_tons_unadjusted'] = calculated_to_compare['co2_mass_tons']
egrid_to_compare = egrid_plant.set_index(['plant_id_eia']).drop(columns=['ba_code','state','plant_name'])

# divide calculated value by egrid value
compared = calculated_to_compare.div(egrid_to_compare).merge(egrid_plant[['plant_id_eia','plant_name','ba_code', 'state']], how='left', left_index=True, right_on='plant_id_eia').set_index('plant_id_eia')

# for each column, categorize the data based on how far it is off from egrid
for col in ['net_generation_mwh','heat_input_mmbtu','co2_mass_tons','co2_mass_tons_unadjusted']:
    # add a new column
    compared[f'{col}_status'] = pd.cut(x=compared[col], 
                                       bins=[-999999,0,0.5,0.9,0.99,0.9999,1,1.0001,1.01,1.1,1.5,999999], 
                                       labels=['negative','<50%','+/-50%','+/-10%','+/-1%','!exact','!exact','+/-1%','+/-10%','+/-50%','>50%'], 
                                       ordered=False)
    compared[f'{col}_status'] = compared[f'{col}_status'].astype(str)  

# create a dataframe that counts how many plants are in each category
comparison_count = []
for col in ['net_generation_mwh','heat_input_mmbtu','co2_mass_tons','co2_mass_tons_unadjusted']:
    count = compared.groupby(f'{col}_status', dropna=False).count()[col]
    count.index = count.index.rename('status')
    comparison_count.append(count)

comparison_count = pd.concat(comparison_count, axis=1)
comparison_count = pd.concat([comparison_count, pd.DataFrame(comparison_count.sum().rename('Total')).T], axis=0)   
comparison_count

Unnamed: 0,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,co2_mass_tons_unadjusted
!exact,9770,7936,812,1631
+/-1%,28,544,595,729
+/-10%,62,318,399,474
+/-50%,41,278,323,304
<50%,18,14,11,19
>50%,50,699,767,134
,0,52,387,3
negative,8,1,4,5
Total,9977,9842,3298,3299


In [100]:
comparison_count.to_markdown(index=False)

'|   net_generation_mwh |   heat_input_mmbtu |   co2_mass_tons |   co2_mass_tons_unadjusted |\n|---------------------:|-------------------:|----------------:|---------------------------:|\n|                 9770 |               7936 |             812 |                       1631 |\n|                   28 |                544 |             595 |                        729 |\n|                   62 |                318 |             399 |                        474 |\n|                   41 |                278 |             323 |                        304 |\n|                   18 |                 14 |              11 |                         19 |\n|                   50 |                699 |             767 |                        134 |\n|                    0 |                 52 |             387 |                          3 |\n|                    8 |                  1 |               4 |                          5 |\n|                 9977 |               9842 |            32

In [106]:
# examine specific plants in a category
value = 'co2_mass_tons_unadjusted'
status = '>50%'

compared[compared[f'{value}_status'] == status].sort_values(by=value)

Unnamed: 0_level_0,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,co2_mass_tons_unadjusted,plant_name,ba_code,state,net_generation_mwh_status,heat_input_mmbtu_status,co2_mass_tons_status,co2_mass_tons_unadjusted_status
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3116,1.000000,1.502514,1.502461,1.502461,Tolna,PJM,PA,!exact,>50%,>50%,>50%
117,1.000000,1.514877,1.506294,1.506294,APS West Phoenix Power Plant,AZPS,AZ,!exact,>50%,>50%,>50%
126,1.000000,1.574499,1.565482,1.516103,Irvington Generating Station,TEPC,AZ,!exact,>50%,>50%,>50%
55222,1.000000,1.526956,1.516182,1.516182,Lincoln Generating Facility,PJM,IL,!exact,>50%,>50%,>50%
55202,0.966059,1.528019,1.518872,1.518872,Pinckneyville Power Plant,MISO,IL,+/-10%,>50%,>50%,>50%
...,...,...,...,...,...,...,...,...,...,...,...
50949,1.000000,48.709135,47.917695,47.917695,Hardee Power Station,TEC,FL,!exact,>50%,>50%,>50%
50410,1.000000,217.435088,217.418090,89.450033,Kimberly-Clark Tissue Company,PJM,PA,!exact,>50%,>50%,>50%
673,1.000000,108.616745,108.041791,108.041791,Tom G Smith,FMPP,FL,!exact,>50%,>50%,>50%
2504,1.000000,242.231355,174.814742,174.814742,74th Street,NYIS,NY,!exact,>50%,>50%,>50%


In [None]:
# TODO: Compare the fuel input from CEMS to the input from EIA to see if they are close
# in general, we will trust the CEMS data over the EIA data unless there are significant differences
# may need to aggregate to plant level since there is not a 1:1 match between units and generators

# for plants where there is data reported in cems, see how off it is from data reported in eia
cems_plant_monthly = cems.groupby(['plant_id_eia','report_date']).sum()[['heat_content_mmbtu']].reset_index()
gf_plant_monthly = gen_fuel_allocated.groupby(['plant_id_eia','report_date']).sum().reset_index()
gf_plant_monthly = gf_plant_monthly.merge(cems_plant_monthly, how='left', on=['plant_id_eia','report_date'])
gf_plant_monthly = gf_plant_monthly[gf_plant_monthly['heat_content_mmbtu'].notnull()]
gf_plant_monthly['pct_diff'] = (gf_plant_monthly['heat_content_mmbtu'] - gf_plant_monthly['fuel_consumed_mmbtu']) / gf_plant_monthly['fuel_consumed_mmbtu']

# identify where there are plants that report 0 heat input to cems but have data in eia_923
gf_plant_monthly[(gf_plant_monthly['heat_content_mmbtu'] == 0) & (gf_plant_monthly['fuel_consumed_mmbtu'] > 0)]

## Compare data at BA level

In [101]:
# load egrid BA totals
egrid_ba = pd.read_excel(f'../data/egrid/egrid{year}_data.xlsx', sheet_name=f'BA{str(year)[-2:]}', header=1, usecols=['BANAME','BACODE','BAHTIANT','BANGENAN','BACO2AN'])
# rename the columns
egrid_ba = egrid_ba.rename(columns={'BANAME':'ba_name',
                                    'BACODE':'ba_code',
                                    'BAHTIANT':'heat_input_mmbtu',
                                    'BANGENAN':'net_generation_mwh',
                                    'BACO2AN':'co2_mass_tons'})

ba_metric = plant_annual_total.groupby('ba_code', dropna=False).sum().drop(columns='plant_id_eia').div(egrid_ba.set_index('ba_code').drop(columns='ba_name')).sort_values(by='co2_mass_tons').round(3)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ba_metric)

Unnamed: 0_level_0,co2_mass_tons,heat_input_mmbtu,net_generation_mwh
ba_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,0.302,0.356,0.316
TEPC,0.859,0.818,0.697
DEAA,0.997,0.997,1.0
GRIF,0.999,0.999,1.0
OVEC,1.0,1.0,1.0
CSTO,1.0,1.0,1.0
NSB,1.0,1.0,1.0
SPA,1.0,1.0,1.0
CEA,1.0,1.0,1.0
LGEE,1.001,1.001,1.0


In [102]:
ba_metric.to_markdown(index=False)

'|   co2_mass_tons |   heat_input_mmbtu |   net_generation_mwh |\n|----------------:|-------------------:|---------------------:|\n|           0.302 |              0.356 |                0.316 |\n|           0.859 |              0.818 |                0.697 |\n|           0.997 |              0.997 |                1     |\n|           0.999 |              0.999 |                1     |\n|           1     |              1     |                1     |\n|           1     |              1     |                1     |\n|           1     |              1     |                1     |\n|           1     |              1     |                1     |\n|           1     |              1     |                1     |\n|           1.001 |              1.001 |                1     |\n|           1.001 |              0.999 |                1     |\n|           1.001 |              1.001 |                1     |\n|           1.002 |              1.004 |                1     |\n|           1.003 |     

# Assign monthly data to hourly profile
We now, in theory, have complete data on national-level heat input, net generation, and emissions, from a combination of two sources:
    1. hourly data from CEMS
    2. momthly data for generators that don't report to CEMS

For the second category of monthly data, we need to figure out how to allocate the monthly level data to each hour. 

In [None]:
# categorize generators by broad fuel categories (clean, geothermal, biofuel, fossil)
clean_fuels = ['SUN','MWH','WND', 'WAT','WH','PUR','NUC']
fossil_fuels = ['NG', 'DFO','OG','WDS','BIT','OTH','PC','SUB', 'LIG','KER', 'RC', 'WO','RFO', 'WC', 'SGC', 'SGP', 'PG', 'JF','BFG']
bio_fuels = ['AB','BG','BLQ','DG','LFG','MSB','MSW','OBG','OBL','OBS','SLW','WDL','WDS']
geo_fuel = ['GEO']
# TODO: Figure out what to do with MSW

# Output data 

Save data to a CSV so we can separate generation of and analysis of hourly e-grid numbers