In [1]:
# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

# import the necessary packages
%reload_ext autoreload
%autoreload 2

# import packages
import os
import requests
import tarfile
import sqlalchemy as sa
from pathlib import Path
import pandas as pd
import plotly.express as px
import numpy as np
import pudl.analysis.allocate_net_gen as allocate_gen_fuel

import src.data_cleaning as data_cleaning
import src.gross_to_net_generation as gross_to_net_generation
import src.load_data as load_data
import src.eia930 as eia930
#import src.distribute_eia923 as distribute_eia923

# Specify the year for analysis

In [2]:
# analysis year; later cells pass this to the download, cleaning and eGRID-loading steps
year = 2020

# 1. Download data

 - Downloads the pre-cleaned PUDL versions of EIA-923, EIA-860, and EPA CEMS data  
 - Downloads EPA eGRID data  
 - Downloads EIA-930 data  
 - Downloads the EPA Power Sector Data Crosswalk

TODO
- [x] The code for downloading the files could probably be made into functions
- [ ] Investigate other packages besides `requests` that would download these files faster

In [3]:
############### PUDL Database ######################

pudl_zenodo_url = 'https://zenodo.org/record/6349861/files/pudl-v0.6.0-2022-03-12.tgz'
load_data.download_pudl_data(zenodo_url=pudl_zenodo_url)

################# eGRID data #########################

# the 2019 and 2020 workbooks are hosted under different url patterns
egrid_files_to_download = [
    'https://www.epa.gov/sites/default/files/2021-02/egrid2019_data.xlsx',
    'https://www.epa.gov/system/files/documents/2022-01/egrid2020_data.xlsx',
]
load_data.download_egrid_files(egrid_files_to_download)

############# EIA-930 data #####################

load_data.download_eia930_data(years_to_download=[year])
load_data.download_chalendar_files()

########## Power Sector Data Crosswalk #############
# NOTE: Check for new releases at https://github.com/USEPA/camd-eia-crosswalk

psdc_url = 'https://github.com/USEPA/camd-eia-crosswalk/releases/download/v0.2.1/epa_eia_crosswalk.csv'
load_data.download_epa_psdc(psdc_url=psdc_url)


PUDL data already downloaded
egrid2019_data.xlsx already downloaded
egrid2020_data.xlsx already downloaded
2020_Jan_Jun data already downloaded
2020_Jul_Dec data already downloaded
EBA_elec.csv already downloaded
EBA_raw.csv already downloaded
epa_eia_crosswalk.csv already downloaded


# 1. Clean EIA-923 Generation and Fuel Data at the Monthly Level

In [4]:
# Distribute net generation and heat input data reported by the three different EIA-923 tables
# NOTE: this code allocates net generation based on the proportion of net generation reported, rather than by nameplate capacity (which eGRID does)

# TODO: fix allocation of net generation when reported net generation is negative?

# use the notebook-level `year` (rather than a hard-coded 2020) so that this step
# always uses the same year as the download, CEMS and eGRID steps
pudl_out = load_data.initialize_pudl_out(year=year)

# allocate net generation and heat input to each generator-fuel grouping
gen_fuel_allocated = allocate_gen_fuel.allocate_gen_fuel_by_generator_energy_source(pudl_out, drop_interim_cols=True)

# create a table that identifies the primary fuel of each generator and plant
# (built before the aggregation below, while the data is still at the generator-fuel level)
primary_fuel_table = data_cleaning.create_primary_fuel_table(gen_fuel_allocated)

# calculate co2 emissions for each generator-fuel based on allocated fuel consumption
gen_fuel_allocated = data_cleaning.calculate_co2_from_fuel_consumption(gen_fuel_allocated, year)

# aggregate the allocated data to the generator level
gen_fuel_allocated = allocate_gen_fuel.agg_by_generator(gen_fuel_allocated, 
                                                        sum_cols=["net_generation_mwh", 
                                                                  "fuel_consumed_mmbtu",
                                                                  "fuel_consumed_for_electricity_mmbtu",
                                                                  "co2_mass_tons",
                                                                  "co2_mass_tons_adjusted"])

# merge the primary fuel information into the allocated data
gen_fuel_allocated = gen_fuel_allocated.merge(primary_fuel_table, how='left', on=['plant_id_eia','generator_id'])

# remove any plants that we don't want in the data
gen_fuel_allocated = data_cleaning.remove_plants(gen_fuel_allocated, 
                                                 non_grid_connected=True, 
                                                 remove_states=['PR'],
                                                 steam_only_plants=False,
                                                 distribution_connected_plants=False)

# add the balancing authority columns used for aggregation later in the notebook
gen_fuel_allocated = data_cleaning.assign_ba_code_to_plant(gen_fuel_allocated, year)

gen_fuel_allocated

Removing 0 plants that are not grid-connected
Removing 0 plants located in the following states: ['PR']


Unnamed: 0,report_date,plant_id_eia,generator_id,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_tons,co2_mass_tons_adjusted,energy_source_code,plant_primary_fuel,ba_code,ba_code_physical,state
0,2020-01-01,1,1,,,,,,DFO,WND,,,AK
1,2020-01-01,1,2,,,,,,DFO,WND,,,AK
2,2020-01-01,1,3,,,,,,DFO,WND,,,AK
3,2020-01-01,1,5,,,,,,DFO,WND,,,AK
4,2020-01-01,1,WT1,15.1585,133.0,133.0,0.0,0.0,WND,WND,,,AK
...,...,...,...,...,...,...,...,...,...,...,...,...,...
286693,2020-12-01,64816,GEN1,,,,,,NG,NG,ERCO,ERCO,TX
286694,2020-12-01,64816,GEN2,,,,,,NG,NG,ERCO,ERCO,TX
286695,2020-12-01,64816,GEN3,,,,,,NG,NG,ERCO,ERCO,TX
286696,2020-12-01,64836,CATAL,,,,,,SUN,SUN,CISO,LDWP,CA


# 2. Clean Hourly Data from CEMS
There are three broad categories of plants based on their CAMD reporting status:
1. Units that report to CAMD year-round (for these plants, emissions data is used directly from CEMS)
2. Units that only report to CAMD during the ozone season (May-Sept) (for these units, non-ozone season data is taken from EIA 923)
3. Units that do not report to CAMD (generally fossil units < 25MW and non-fossil generators)

There are also certain plants that report to CAMD but do not produce electricity for the grid, and need to be removed from the CEMS data:
- Non grid connected plants
- Steam-only plants


In [None]:
# NOTE: all of the functions in this section could be run by calling clean_cems()
# `cems` is the hourly CEMS table that the remaining cells in this section read and update
cems = data_cleaning.clean_cems(year)

In [None]:
## DATA CLEANING ##
# list the cems ids of the units that still have no energy_source_code assigned
# NOTE: we will need to fix this before matching to EIA-930 data
# however, we should create a plant_primary_fuel column, which might be different than the unit-specific primary fuel
missing_fuel_type = cems['energy_source_code'].isna()
cems.loc[missing_fuel_type, 'cems_id'].unique()

In [None]:
# flag any generator-months for which we already have cems data
# NOTE: there is still an issue identifying for which generators we have cems data because of incomplete mapping
#gen_fuel_allocated = gen_fuel_allocated.drop(columns=['data_source'])
gen_fuel_allocated = data_cleaning.identify_emissions_data_source(cems, gen_fuel_allocated, year)

# keep only the generator-months that are flagged as eia-only and that report fuel consumption
is_eia_only = gen_fuel_allocated['data_source'] == 'eia_only'
has_fuel_consumption = gen_fuel_allocated['fuel_consumed_mmbtu'].notna()
monthly_eia_data_to_distribute = gen_fuel_allocated[is_eia_only & has_fuel_consumption]

# share of each summed column that falls under each data source
totals_by_source = gen_fuel_allocated.groupby('data_source').sum()
(totals_by_source / totals_by_source.sum(axis=0)).round(3)

In [None]:
# convert hourly gross generation to net generation
#cems = cems.drop(columns=['net_generation_mwh','gross_to_net_ratio','net_gen_method'])
cems = data_cleaning.convert_gross_to_net_generation(cems, gen_fuel_allocated)

# for generators where there is heat input but no gross generation reported, impute hourly net generation based on reported EIA values
# TODO: Maybe remove this 
cems = data_cleaning.impute_missing_hourly_net_generation(cems, gen_fuel_allocated)

# share of total net generation attributed to each net_gen_method (missing methods kept as their own group)
net_gen_by_method = cems.groupby('net_gen_method', dropna=False)['net_generation_mwh'].sum()
net_gen_by_method / cems['net_generation_mwh'].sum()

In [None]:
# add information that we need to aggregate the data and match to eia930
#cems = cems.drop(columns=['ba_code','ba_code_physical','state'])
#cems = cems.drop(columns=['distribution_flag'])
#cems = cems.drop(columns=['plant_primary_fuel'])

cems = data_cleaning.assign_ba_code_to_plant(cems, year)

# add a flag about whether the plant is distribution connected
cems = data_cleaning.identify_distribution_connected_plants(cems, year, voltage_threshold_kv=60)

# reduce the primary fuel table to a single row per plant before merging it into cems
plant_primary_fuels = primary_fuel_table.drop_duplicates(subset='plant_id_eia')
plant_primary_fuels = plant_primary_fuels[['plant_id_eia','plant_primary_fuel']]
cems = cems.merge(plant_primary_fuels, how='left', on='plant_id_eia')

# add a fuel category for eia930, based on the plant primary fuel
cems = data_cleaning.assign_fuel_category_to_ESC(cems, fuel_category_name='fuel_group_eia930', esc_column='plant_primary_fuel')
cems = cems.rename(columns={'fuel_category':'fuel_category_eia930'})

In [None]:
# preview the first rows of the cems data after the columns above were added
cems.head(5)

# Assign monthly data to hourly profile
We now, in theory, have complete data on national-level heat input, net generation, and emissions, from a combination of two sources:
    1. hourly data from CEMS
    2. monthly data for generators that don't report to CEMS

For the second category of monthly data, we need to figure out how to allocate the monthly level data to each hour. 

In [None]:
fuel_group_name = 'fuel_group_custom'

# build a two-column lookup from energy source code to the chosen fuel grouping
energy_source_groups = pd.read_csv('../data/manual/energy_source_groups.csv')
energy_source_groups = energy_source_groups[['energy_source_code',fuel_group_name]]
energy_source_groups = energy_source_groups.rename(columns={fuel_group_name:'fuel_category'})

# assign a fuel category to the monthly eia data
# (the lookup only holds the energy_source_code and fuel_category columns)
monthly_eia_data_to_distribute = monthly_eia_data_to_distribute.merge(energy_source_groups, how='left', on='energy_source_code')

In [None]:
# load the hourly profiles for the analysis year; the cells below filter them by the ba_code and fuel_category columns
hourly_profiles = eia930.load_chalendar_for_pipeline(year)

In [None]:
# display the loaded hourly profiles
hourly_profiles

In [None]:
# for fuel categories that exist in the EIA data but not in EIA-930, create flat profiles to add to the hourly profiles from 930
ba_list = list(monthly_eia_data_to_distribute['ba_code'].dropna().unique())

# hourly UTC timestamps running from Dec 31 of the previous year through Jan 1 of the next year
# (presumably padded so that every local-time hour of the year is covered after timezone conversion)
# this does not depend on the BA or fuel, so it is built once rather than inside the loops
flat_profile_hours = pd.date_range(start=f'{year-1}-12-31 00:00:00', end=f'{year+1}-01-01 23:00:00', freq='H', tz='UTC', name='datetime_utc')

# columns of each flat profile, in the order they are created below
flat_profile_columns = ['datetime_utc','ba_code','fuel_category','net_generation_mwh_930','datetime_local','report_date']

hourly_profiles_to_add = []

# for each ba
for ba in ba_list:
    # get a list of fuels categories that exist in that BA
    ba_fuel_list = list(monthly_eia_data_to_distribute.loc[monthly_eia_data_to_distribute['ba_code'] == ba,'fuel_category'].unique())
    for fuel in ba_fuel_list:
        # if there is no data for that fuel type in the eia930 data, create a flat profile
        has_930_profile = ((hourly_profiles['ba_code'] == ba) & (hourly_profiles['fuel_category'] == fuel)).any()
        if not has_930_profile:
            df_temp = pd.DataFrame({'datetime_utc': flat_profile_hours})
            df_temp['ba_code'] = ba
            df_temp['fuel_category'] = fuel
            # a constant value in every hour gives a flat profile
            df_temp['net_generation_mwh_930'] = 1.0
            # convert to the BA's local time
            # NOTE(review): the second argument of ba_timezone is passed as 'GMT' - confirm its meaning in data_cleaning
            df_temp['datetime_local'] = df_temp['datetime_utc'].dt.tz_convert(data_cleaning.ba_timezone(ba, 'GMT'))
            # create a report date column holding the first day of the local-time month
            df_temp['report_date'] = df_temp['datetime_local'].astype(str).str[:7]
            df_temp['report_date'] = pd.to_datetime(df_temp['report_date'])
            hourly_profiles_to_add.append(df_temp)

# pd.concat raises a ValueError on an empty list, so fall back to an empty frame
# with the same columns when every BA/fuel combination already has a 930 profile
if hourly_profiles_to_add:
    hourly_profiles_to_add = pd.concat(hourly_profiles_to_add, axis=0, ignore_index=True)
else:
    hourly_profiles_to_add = pd.DataFrame(columns=flat_profile_columns)

In [None]:
# concat the flat profiles to the hourly profiles
# NOTE: hourly_profiles is overwritten with the result, so re-running this cell appends the flat profiles again
hourly_profiles = pd.concat([hourly_profiles,hourly_profiles_to_add], axis=0)

In [None]:
# distribute the monthly eia-only data to hourly values using the hourly profiles (930 profiles plus the flat profiles added above)
hourly_eia_data = data_cleaning.distribute_monthly_eia_data_to_hourly(monthly_eia_data_to_distribute, hourly_profiles)

# Output data 

Save the data to a CSV so that generating the hourly eGRID numbers can be separated from analyzing them

In [None]:
# columns to total for each BA, fuel category and hour
cems_value_columns = ['gross_generation_mwh','net_generation_mwh','fuel_consumed_mmbtu','co2_mass_tons','co2_mass_tons_adjusted']

cems_ba_fuel = (
    cems
    # assign a fuel group to each observation in CEMS
    .merge(energy_source_groups[['energy_source_code','fuel_category']], how='left', on='energy_source_code')
    # rename the datetime_utc column
    .rename(columns={'operating_datetime_utc':'datetime_utc'})
    # aggregate cems data by BA and fuel type
    .groupby(['ba_code','fuel_category','datetime_utc'])[cems_value_columns]
    .sum()
    .reset_index()
    # label where these rows came from
    .assign(data_source='CEMS')
)

In [None]:
# stack the hourly CEMS totals and the hourly-distributed EIA data, dropping the EIA-only helper columns
hourly_eia_to_combine = hourly_eia_data.drop(columns=['datetime_local','net_generation_mwh_930','report_date'])
combined_data = pd.concat([cems_ba_fuel, hourly_eia_to_combine], axis=0)

# calculate a produced emission rate (tons are converted to lb by multiplying by 2000)
#  - rates that are missing (e.g. 0 / 0, or missing inputs) are set to 0
#  - rates that are infinite (non-zero emissions divided by zero net generation) are set to missing
# NOTE: np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0
combined_data['co2_rate_lb_per_mwh_generated'] = (combined_data['co2_mass_tons'] * 2000 / combined_data['net_generation_mwh']).fillna(0).replace([np.inf, -np.inf], np.nan)
combined_data

In [None]:
# balancing authority and column to plot
ba = 'CISO'
data = 'net_generation_mwh'

# color used for each fuel category
fuel_color = dict(natural_gas='sienna',
                  coal='black',
                  nuclear='red',
                  biomass='green',
                  geothermal='orange',
                  wind='blue',
                  solar='gold',
                  petroleum='purple',
                  hydro='skyblue',
                  other='lightgrey',
                  waste='pink')

# order in which the fuel categories are listed / stacked
fuel_order = ['nuclear','geothermal','hydro','other','coal','biomass','petroleum','waste','solar','wind','natural_gas']

data_to_graph = combined_data[combined_data['ba_code'] == ba]
#data_to_graph = data_to_graph.groupby(['datetime_utc','fuel_category','data_source']).sum().reset_index()

# area chart with one facet per data source
fig = px.area(data_to_graph,
              x='datetime_utc',
              y=data,
              color='fuel_category',
              color_discrete_map=fuel_color,
              facet_col='data_source',
              template='plotly_white',
              title=f'Hourly generation data for {ba} by fuel type',
              category_orders={'fuel_category':fuel_order})
# hide the outline of each area
fig.update_traces(line={'width':0})

In [None]:
# balancing authority and column to plot
ba = 'CISO'
data = 'co2_mass_tons'

# color used for each fuel category
fuel_color = dict(natural_gas='sienna',
                  coal='black',
                  nuclear='red',
                  biomass='green',
                  geothermal='orange',
                  wind='blue',
                  solar='gold',
                  petroleum='purple',
                  hydro='skyblue',
                  other='lightgrey',
                  waste='pink')

# order in which the fuel categories are listed / stacked
fuel_order = ['nuclear','geothermal','hydro','other','coal','biomass','natural_gas','petroleum','waste','solar','wind']

data_to_graph = combined_data[combined_data['ba_code'] == ba]
#data_to_graph = data_to_graph.groupby(['datetime_utc','fuel_category','data_source']).sum().reset_index()

# area chart with one facet per data source
fig = px.area(data_to_graph,
              x='datetime_utc',
              y=data,
              color='fuel_category',
              color_discrete_map=fuel_color,
              facet_col='data_source',
              template='plotly_white',
              title=f'Hourly data for {ba} by fuel type',
              category_orders={'fuel_category':fuel_order})
# hide the outline of each area
fig.update_traces(line={'width':0})

In [None]:
# balancing authority and column to plot
ba = 'MISO'
data = 'co2_rate_lb_per_mwh_generated'

# color used for each fuel category
fuel_color = dict(natural_gas='sienna',
                  coal='black',
                  nuclear='red',
                  biomass='green',
                  geothermal='orange',
                  wind='blue',
                  solar='gold',
                  petroleum='purple',
                  hydro='skyblue',
                  other='lightgrey',
                  waste='pink')

# order in which the fuel categories are listed
fuel_order = ['nuclear','coal','natural_gas','petroleum','other','waste','biomass','geothermal','hydro','solar','wind']

data_to_graph = combined_data[combined_data['ba_code'] == ba]
#data_to_graph = data_to_graph.groupby(['datetime_utc','fuel_category','data_source']).sum().reset_index()

# line chart with one facet per data source
fig = px.line(data_to_graph,
              x='datetime_utc',
              y=data,
              color='fuel_category',
              color_discrete_map=fuel_color,
              facet_col='data_source',
              template='plotly_white',
              title=f'Hourly data for {ba} by fuel type',
              category_orders={'fuel_category':fuel_order})
fig

# Data Quality Metrics

### Compare monthly totals for each plant from each data source

In [None]:
# TODO: Compare the data reported by both sources
# for plants where there is data reported in cems, see how off it is from data reported in eia
plant_month_keys = ['plant_id_eia','report_date']

cems_plant_monthly = cems.groupby(plant_month_keys)[['gross_generation_mwh','net_generation_mwh','fuel_consumed_mmbtu','co2_mass_tons']].sum().reset_index()
gf_plant_monthly = gen_fuel_allocated.groupby(plant_month_keys).sum().reset_index()
# keep only the plant-months that appear in both sources
gf_plant_monthly = gf_plant_monthly.merge(cems_plant_monthly, how='inner', on=plant_month_keys, suffixes=("_eia",'_cems'))

# fractional difference of the cems value relative to the eia value
# zeros are replaced with 0.1 on both sides before the values are compared
pctdiff_columns = {'pctdiff_gen':'net_generation_mwh',
                   'pctdiff_fuel':'fuel_consumed_mmbtu',
                   'pctdiff_co2':'co2_mass_tons'}
for pctdiff_name, value_name in pctdiff_columns.items():
    cems_value = gf_plant_monthly[f'{value_name}_cems'].replace(0,0.1)
    eia_value = gf_plant_monthly[f'{value_name}_eia'].replace(0,0.1)
    gf_plant_monthly[pctdiff_name] = ((cems_value - eia_value) / eia_value).round(3)

gf_plant_monthly.sort_values(by='pctdiff_gen')

In [None]:
# investigate single plants
# show the monthly comparison rows for one plant (55641 is an example plant_id_eia; change it as needed)
gf_plant_monthly[gf_plant_monthly['plant_id_eia'] == 55641]

## Compare Our Results to eGRID

In [None]:
# Aggregate total calculated values
###################################

# combine cems and eia data
# (the value columns are selected before summing so that only those columns are aggregated)
plant_keys = ['ba_code','state','plant_id_eia']
cems_plant_annual = cems.groupby(plant_keys, dropna=False)[['net_generation_mwh','fuel_consumed_mmbtu','co2_mass_tons','co2_mass_tons_adjusted']].sum().reset_index()
eia_plant_annual = monthly_eia_data_to_distribute.groupby(plant_keys, dropna=False)[['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_tons','co2_mass_tons_adjusted']].sum().reset_index()

plant_annual_total = pd.concat([cems_plant_annual,eia_plant_annual], axis=0)

# group any plants that have records from both datasets
plant_annual_total = plant_annual_total.groupby(plant_keys, dropna=False).sum().reset_index()

# For plants that have different EPA and EIA plant IDs, the plant ID in eGRID is usually the EPA ID, but sometimes the EIA ID
# however, there are sometimes 2 EIA IDs for a single eGRID ID, so we need to group the data in the EIA table by the egrid id
# We need to update all of the egrid plant IDs to the EIA plant IDs
# NOTE: the crosswalk path is hard-coded to the 2020 static tables, independent of `year`
egrid_crosswalk = pd.read_csv('../data/egrid/egrid_static_tables/2020/table_C5_crosswalk_of_EIA_ID_to_EPA_ID.csv')
eia_to_egrid_id = dict(zip(list(egrid_crosswalk['plant_id_eia']), list(egrid_crosswalk['plant_id_egrid'])))
egrid_to_eia_id = dict(zip(list(egrid_crosswalk['plant_id_egrid']), list(egrid_crosswalk['plant_id_eia'])))

# map each EIA id to its eGRID id, keeping the EIA id where the crosswalk has no (non-missing) entry
# the result is assigned to the column directly: calling .update() on a selected column is
# chained assignment, which does not write back to the dataframe under pandas Copy-on-Write
calc_eia_ids = plant_annual_total['plant_id_eia']
plant_annual_total['plant_id_egrid'] = calc_eia_ids.map(eia_to_egrid_id).fillna(calc_eia_ids).astype(calc_eia_ids.dtype)

# Load the eGRID plant table
############################

# load plant level data from egrid
egrid_plant = pd.read_excel(f'../data/egrid/egrid{year}_data.xlsx', 
                            sheet_name=f'PLNT{str(year)[-2:]}', 
                            header=1, 
                            usecols=['BACODE','PSTATABB', 'PLPRMFL','ORISPL', 'PNAME','PLGENATN', 'PLGENATR', 'PLHTIANT','UNCO2','UNHTIT','PLCO2AN'])
# calculate total net generation from reported renewable and nonrenewable generation
egrid_plant['net_generation_mwh'] = egrid_plant['PLGENATN'] + egrid_plant['PLGENATR']
egrid_plant = egrid_plant.drop(columns=['PLGENATN', 'PLGENATR'])
# rename the columns
egrid_plant = egrid_plant.rename(columns={'BACODE':'ba_code',
                                          'PSTATABB':'state',
                                          'PLPRMFL':'energy_source_code',
                                          'ORISPL':'plant_id_egrid',
                                          'PNAME':'plant_name',
                                          'UNHTIT':'fuel_consumed_mmbtu',
                                          'PLHTIANT':'fuel_consumed_for_electricity_mmbtu',
                                          'UNCO2':'co2_mass_tons',
                                          'PLCO2AN':'co2_mass_tons_adjusted'})

# if egrid has a missing value for co2 for a clean plant, replace with zero
clean_fuels = ['SUN','MWH','WND', 'WAT','WH','PUR','NUC']
is_clean_plant = egrid_plant['energy_source_code'].isin(clean_fuels)
for co2_col in ['co2_mass_tons_adjusted','co2_mass_tons']:
    egrid_plant.loc[is_clean_plant, co2_col] = egrid_plant.loc[is_clean_plant, co2_col].fillna(0)

# reorder the columns                                         
egrid_plant = egrid_plant[['ba_code', 'state', 'plant_id_egrid', 'plant_name','energy_source_code', 'net_generation_mwh', 'fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu', 'co2_mass_tons', 'co2_mass_tons_adjusted']]

# remove any plants that have no reported data (all data columns sum to zero; missing values count as zero)
# NOTE: it seems that egrid includes a lot of proposed projects that are not yet operating, but just has missing data for them
plants_with_no_data_in_egrid = list(egrid_plant[egrid_plant[['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_tons','co2_mass_tons_adjusted']].sum(axis=1) == 0]['plant_id_egrid'])
egrid_plant = egrid_plant[~egrid_plant['plant_id_egrid'].isin(plants_with_no_data_in_egrid)]

# We also want to remove any plants that are located in Puerto Rico
# take an explicit copy so that the new column below is added to an independent dataframe rather than to a slice
egrid_plant = egrid_plant[(egrid_plant['state'] != 'PR')].copy()

# create a column for eia id: map each eGRID id to its EIA id, keeping the eGRID id where the crosswalk has no entry
egrid_ids = egrid_plant['plant_id_egrid']
egrid_plant['plant_id_eia'] = egrid_ids.map(egrid_to_eia_id).fillna(egrid_ids).astype(egrid_ids.dtype)

### Identify plants in eGRID missing from our results

Note: none of these plants have any co2 emissions, so we will not focus on fixing this for now

In [None]:
# identify any plants that are in egrid but not our totals, and any plants that are in our totals, but not egrid
# NOTE: plant_not_in_calc holds EIA plant ids, while plants_not_in_egrid holds eGRID plant ids
plant_not_in_calc = list(set(egrid_plant['plant_id_eia'].unique()) - set(plant_annual_total['plant_id_eia'].unique()))
plants_not_in_egrid = list(set(plant_annual_total['plant_id_egrid'].unique()) - set(egrid_plant['plant_id_egrid'].unique()))

# Which plants are included in eGRID but are missing from our calculations?
# plant_not_in_calc was built from EIA ids, so it must be matched against the plant_id_eia column
# (matching against plant_id_egrid gives the wrong rows wherever the two ids differ)
missing_from_calc = egrid_plant[egrid_plant['plant_id_eia'].isin(plant_not_in_calc)]

#missing_from_calc.to_csv('../data/temp/plants_missing_from_calcs.csv', index=False)

# see if any of these plants are retired
generators_eia860 = load_data.load_pudl_table('generators_eia860', year=year)
# one row per plant, holding the unique retirement dates of its generators
retirement_dates = generators_eia860.groupby('plant_id_eia')['retirement_date'].unique().reset_index()
missing_from_calc.merge(retirement_dates, how='left', on='plant_id_eia')

### Identify plants in our calculations that are missing from eGRID

In [None]:
# Which plants are in our calculations, but are missing from eGRID?
plant_names = load_data.load_pudl_table('plants_entity_eia')[['plant_id_eia','plant_name_eia','sector_name_eia']]

# plants_not_in_egrid was built from eGRID plant ids, so it must be matched against the plant_id_egrid column
# (matching against plant_id_eia gives the wrong rows wherever the two ids differ)
in_calc_only = plant_annual_total['plant_id_egrid'].isin(plants_not_in_egrid)
# plant names are keyed by EIA id, so the merge still uses plant_id_eia
missing_from_egrid = plant_annual_total[in_calc_only].merge(plant_names, how='left', on='plant_id_eia')

missing_from_egrid

In [None]:
# how many of the plants missing from egrid have non-zero data
# (in practice: more than 1 mmbtu of fuel consumed)
has_fuel_consumption = missing_from_egrid['fuel_consumed_mmbtu'] > 1
missing_from_egrid.loc[has_fuel_consumption]

### Compare whether totals for plants with EPA IDs that differ from EIA IDs match

In [None]:
# identify where there is a single egrid plant id for multiple eia plant ids
shares_egrid_id = plant_annual_total['plant_id_egrid'].duplicated(keep=False)
# total the net generation across the eia plants behind each shared egrid id (focus on net generation for now)
double_ids = plant_annual_total[shares_egrid_id].groupby('plant_id_egrid')['net_generation_mwh'].sum().reset_index()
# merge the egrid data
egrid_net_generation = egrid_plant[['plant_id_egrid','net_generation_mwh']]
double_ids = double_ids.merge(egrid_net_generation, how='left', on='plant_id_egrid', suffixes=('_calc','_egrid'))
# fractional difference of our total relative to the egrid value
net_gen_difference = double_ids['net_generation_mwh_calc'] - double_ids['net_generation_mwh_egrid']
double_ids['percent_diff'] = (net_gen_difference / double_ids['net_generation_mwh_egrid']).round(3)
double_ids

### Identify plants where our BA assignment does not match eGRID

In [None]:
# line up the egrid BA code and our BA code for every plant present in both tables
egrid_ba_by_plant = egrid_plant.set_index('plant_id_eia')[['plant_name','ba_code']]
calc_ba_by_plant = plant_annual_total.set_index('plant_id_eia')[['ba_code']]
ba_code_match = egrid_ba_by_plant.merge(calc_ba_by_plant, how='inner', left_index=True, right_index=True, suffixes=("_egrid",'_calc'))

# plants with missing ba code
calc_ba_missing = ba_code_match['ba_code_calc'].isna()
egrid_ba_present = ba_code_match['ba_code_egrid'].notna()
ba_code_match[calc_ba_missing & egrid_ba_present]

In [None]:
# plants with incorrect ba code
# (we have a BA code for the plant, but it differs from the egrid one)
ba_codes_differ = ba_code_match['ba_code_calc'] != ba_code_match['ba_code_egrid']
calc_ba_present = ba_code_match['ba_code_calc'].notna()
ba_code_match[ba_codes_differ & calc_ba_present]

## Identify where eGRID might be missing data
It seems that there are quite a few generators where fuel consumption data is missing

In [None]:
# load the EIA generator fuel data
IDX_PM_ESC = ["report_date", "plant_id_eia", "energy_source_code", "prime_mover_code"]
# take an explicit copy so that the new column below is added to an independent dataframe rather than to a slice
gf = pudl_out.gf_eia923().loc[:,IDX_PM_ESC + ["net_generation_mwh","fuel_consumed_mmbtu","fuel_consumed_for_electricity_mmbtu",],].copy()

# add egrid plant ids
# NOTE: the crosswalk path is hard-coded to the 2020 static tables, independent of `year`
egrid_crosswalk = pd.read_csv('../data/egrid/egrid_static_tables/2020/table_C5_crosswalk_of_EIA_ID_to_EPA_ID.csv')
eia_to_egrid_id = dict(zip(list(egrid_crosswalk['plant_id_eia']), list(egrid_crosswalk['plant_id_egrid'])))
# map each EIA id to its eGRID id, keeping the EIA id where the crosswalk has no (non-missing) entry
# the result is assigned to the column directly: calling .update() on a selected column is
# chained assignment, which does not write back to the dataframe under pandas Copy-on-Write
gf_eia_ids = gf['plant_id_eia']
gf['plant_id_egrid'] = gf_eia_ids.map(eia_to_egrid_id).fillna(gf_eia_ids).astype(gf_eia_ids.dtype)

# calculate an annual total for each plant
# numeric_only=True keeps the date and code columns out of the sum on every pandas version
gf_total = gf.groupby(['plant_id_egrid']).sum(numeric_only=True).reset_index()

# choose a metric to compare
metric = 'fuel_consumed_mmbtu'

# put the egrid and EIA-923 values of the metric side by side for each egrid plant, rounded to whole units
egrid_eia_comparison = egrid_plant[['plant_id_egrid','plant_name','ba_code','energy_source_code',metric]].merge(gf_total[['plant_id_egrid',metric]], how='left', on='plant_id_egrid', suffixes=('_egrid','_eia923')).round(0)

# negative values mean that egrid reports less than EIA-923
egrid_eia_comparison['difference'] = egrid_eia_comparison[f'{metric}_egrid'] - egrid_eia_comparison[f'{metric}_eia923']
egrid_eia_comparison['percent_difference'] = egrid_eia_comparison['difference'] / egrid_eia_comparison[f'{metric}_eia923']

In [None]:
# egrid seems to be missing fuel consumption data for most nuclear power plants
# select the plants whose egrid primary fuel is nuclear and total their columns
missing_nuclear = egrid_eia_comparison[egrid_eia_comparison['energy_source_code'] == 'NUC']
missing_nuclear.sum()

In [None]:
# where is egrid missing data?
# non-nuclear plants where the egrid value is more than 1% below the EIA-923 value
egrid_below_eia = egrid_eia_comparison['percent_difference'] < -0.01
not_nuclear = egrid_eia_comparison['energy_source_code'] != 'NUC'
egrid_eia_comparison[egrid_below_eia & not_nuclear]

In [None]:
# how much emissions does this account for?
# start from the non-nuclear plants where the egrid value is more than 1% below the EIA-923 value
egrid_below_eia = egrid_eia_comparison['percent_difference'] < -0.01
not_nuclear = egrid_eia_comparison['energy_source_code'] != 'NUC'
# group by fuel code
missing_emissions = egrid_eia_comparison[egrid_below_eia & not_nuclear].groupby('energy_source_code').sum().reset_index()

# get emission factors
emission_factors = load_data.load_emission_factors()[['energy_source_code', 'co2_tons_per_mmbtu']]
missing_emissions = missing_emissions.merge(emission_factors, how='left', on='energy_source_code')
# convert the summed difference into co2 using each fuel's emission factor
missing_emissions['co2_mass_tons'] = missing_emissions['difference'] * missing_emissions['co2_tons_per_mmbtu']
missing_emissions.sum()

In [None]:
# column totals of the egrid plant table, for reference against the totals above
egrid_plant.sum()

## Plant Metric

In [None]:
# columns that are compared between our calculations and eGRID
compare_cols = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_tons_adjusted','co2_mass_tons']

# standardize column names and index so that the two dfs can be divided
# numeric_only=True keeps the ba_code / state text columns out of the sum on every pandas version
calculated_to_compare = plant_annual_total.groupby('plant_id_egrid').sum(numeric_only=True).drop(columns=['plant_id_eia'])

# create an adjusted co2 column
# TODO: remove this once we calculate adjusted emissions
#calculated_to_compare['co2_mass_tons_adjusted'] = calculated_to_compare['co2_mass_tons']

# drop the plants that have no data in eGRID
# plants_with_no_data_in_egrid holds eGRID plant ids, so it must be matched against plant_id_egrid
# (matching against plant_id_eia could drop an unrelated plant whose EIA id equals one of those eGRID ids)
egrid_plant = egrid_plant[~egrid_plant['plant_id_egrid'].isin(plants_with_no_data_in_egrid)]

egrid_to_compare = egrid_plant.set_index(['plant_id_egrid']).drop(columns=['ba_code','state','plant_name','plant_id_eia'])

# divide calculated value by egrid value, then attach the plant attributes from egrid
compared = calculated_to_compare.div(egrid_to_compare).merge(egrid_plant[['plant_id_egrid','plant_name','ba_code', 'state']], how='left', left_index=True, right_on='plant_id_egrid').set_index('plant_id_egrid')
compared['plant_name'] = compared['plant_name'].fillna('unknown')

# create a dataframe that merges the two sources of data together
compared_merged = calculated_to_compare.merge(egrid_to_compare, how='outer', on='plant_id_egrid', suffixes=('_calc','_egrid'))

# for each column, set the ratio to 1 if both values are zero (the ratio is only nan b/c divide by zero)
for col in compare_cols:
    # identify plants with zero values for both
    plant_ids = list(compared_merged[(compared_merged[f'{col}_calc'] == 0) & (compared_merged[f'{col}_egrid'] == 0)].index)
    compared.loc[compared.index.isin(plant_ids), col] = 1

# for each column, categorize the data based on how far it is off from egrid
for col in compare_cols:
    # add a new column; the bins are symmetric around a ratio of 1, so labels repeat on either side
    compared[f'{col}_status'] = pd.cut(x=compared[col], 
                                       bins=[-999999999,0,0.5,0.9,0.99,0.9999,1,1.0001,1.01,1.1,1.5,999999999], 
                                       labels=['negative','<50%','+/-50%','+/-10%','+/-1%','!exact','!exact','+/-1%','+/-10%','+/-50%','>50%'], 
                                       ordered=False)
    # ratios that fall in no bin (missing values) become the string 'nan' when cast to str; label them 'missing'
    compared[f'{col}_status'] = compared[f'{col}_status'].astype(str).replace('nan','missing')
    compared.loc[(compared.index.isin(plants_not_in_egrid)),f'{col}_status'] = 'not_in_egrid'

# identify which plants are missing from egrid vs calculated values
for col in compare_cols:
    calc_value = compared_merged[f'{col}_calc']
    egrid_value = compared_merged[f'{col}_egrid']
    # identify plants that are missing in egrid
    plants_missing_egrid = list(compared_merged[(calc_value > 0) & (egrid_value.isna())].index)
    compared.loc[compared.index.isin(plants_missing_egrid), f'{col}_status'] = 'missing_in_egrid'
    # identify plants that are missing from our calculations
    plants_missing_calc = list(compared_merged[(calc_value.isna()) & (egrid_value > 0)].index)
    compared.loc[compared.index.isin(plants_missing_calc), f'{col}_status'] = 'missing_in_calc'
    # identify where our calculations are missing a zero value
    plants_missing_zero_calc = list(compared_merged[(calc_value.isna()) & (egrid_value == 0)].index)
    compared.loc[compared.index.isin(plants_missing_zero_calc), f'{col}_status'] = 'missing_zero_in_calc'
    # identify where egrid has a missing value instead of a zero
    plants_missing_zero_egrid = list(compared_merged[(calc_value == 0) & (egrid_value.isna())].index)
    compared.loc[compared.index.isin(plants_missing_zero_egrid), f'{col}_status'] = 'missing_zero_in_egrid'
    # identify where egrid has a zero value where we have a positive value
    plants_incorrect_zero_egrid = list(compared_merged[(calc_value > 0) & (egrid_value == 0)].index)
    compared.loc[compared.index.isin(plants_incorrect_zero_egrid), f'{col}_status'] = '>50%'

# create a dataframe that counts how many plants are in each category
comparison_count = []
for col in compare_cols:
    # plant_name has no missing values (filled with 'unknown' above), so this counts every plant
    count = compared.groupby(f'{col}_status', dropna=False).count()['plant_name'].rename(col)
    count.index = count.index.rename('status')
    comparison_count.append(count)

comparison_count = pd.concat(comparison_count, axis=1).fillna(0).astype(int)
# append a row with the column totals
comparison_count = pd.concat([comparison_count, pd.DataFrame(comparison_count.sum().rename('Total')).T], axis=0)   
comparison_count

In [None]:
# print the plant comparison counts as a markdown table (e.g. for pasting into documentation)
print(comparison_count.to_markdown())

## BA Metric

In [None]:
# load egrid BA totals
# (disabled: alternative that reads the BA totals directly from the eGRID workbook)
# egrid_ba = pd.read_excel(f'../data/egrid/egrid{year}_data.xlsx', sheet_name=f'BA{str(year)[-2:]}', header=1, usecols=['BANAME','BACODE','BAHTIANT','BANGENAN','BACO2AN'])
# # rename the columns
# egrid_ba = egrid_ba.rename(columns={'BANAME':'ba_name',
#                                     'BACODE':'ba_code',
#                                     'BAHTIANT':'fuel_consumed_mmbtu',
#                                     'BANGENAN':'net_generation_mwh',
#                                     'BACO2AN':'co2_mass_tons'})

data_columns = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_tons','co2_mass_tons_adjusted']

#aggregate the plant data up to the BA level
# NOTE(review): this groupby drops egrid plants with a missing ba_code, while the calculated totals below keep them (dropna=False)
egrid_ba = egrid_plant.groupby(['ba_code'])[data_columns].sum().reset_index()

# divide our calculation by the BA totals from eGRID
# if there are 0 values, replace with 0.1, so that div by zero doesn't return missing value
calc_ba_totals = plant_annual_total.groupby('ba_code', dropna=False).sum().drop(columns='plant_id_eia')
egrid_ba_totals = egrid_ba.set_index('ba_code')
ba_metric = calc_ba_totals.replace(0,0.1).div(egrid_ba_totals.replace(0,0.1))
ba_metric = ba_metric.sort_values(by='co2_mass_tons').round(3)

# overall ratio across all BAs, as a single row labelled 'Total'
total_ratio = plant_annual_total[data_columns].sum().div(egrid_ba[data_columns].sum())
total = total_ratio.rename('Total').to_frame().T

# calculate the difference in the number of plants in each region
calc_plant_count = plant_annual_total.groupby('ba_code', dropna=False).count()['plant_id_egrid']
egrid_plant_count = egrid_plant.groupby('ba_code', dropna=False).count()['plant_id_egrid']
plant_count = (calc_plant_count - egrid_plant_count).rename('num_plants')
ba_metric = ba_metric.merge(plant_count, how='left', left_index=True, right_index=True)
ba_metric = ba_metric.drop(columns=['plant_id_egrid']).sort_index()

# append the total row and put the columns in a fixed order
ba_metric = pd.concat([ba_metric, total], axis=0)
ba_metric = ba_metric[data_columns + ['num_plants']]

# show every row and column rather than the truncated default view
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ba_metric)

In [None]:
# what fraction of each BA's co2 comes from each data source (e.g. CEMS vs EIA)
co2_by_source = pd.pivot_table(
    gen_fuel_allocated,
    index='ba_code',
    columns='data_source',
    values='co2_mass_tons',
    aggfunc=np.sum,
    dropna=False,
)
# exact zeros are swapped for a small positive number before normalizing each row
data_source_by_ba = co2_by_source.replace(0, 0.001)
ba_co2_total = data_source_by_ba.sum(axis=1)
data_source_by_ba = data_source_by_ba.div(ba_co2_total, axis=0)

# show the shares rounded, with missing sources as 0, ordered by the CEMS share
source_share_table = data_source_by_ba.round(3).fillna(0).sort_values(by='cems')
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(source_share_table)

In [None]:
# print the BA-level comparison table as markdown-formatted text
print(ba_metric.to_markdown())

## Explore specific plants

### Notes

Net generation < 50%:
 - 2617 has negative net generation
 - 10444, 59395: some plants are hybrid fossil / solar plants, but the primary energy source code is getting listed as PV for certain generators, which is causing an issue

Net generation > 50%:
 - 335 has two extra units (CT1 and CT2) that report to CEMS but not EIA. Likely correct
 - 7288 reports DFO to CEMS, but NG to EIA - there's probably some generators not reporting
 - 60698 double counting generation from CEMS because of 90% heat threshold in a month - might be fixed once we tackle heat input
 - 58256 is a solar/hybrid and in generators_entity_eia the battery portion is associated with a separate generator id (should only be 1)


Other:
 - plant 3754 has heat input in cems and eia that don't match
 - 2401 has generation in both cems and eia
 - 50933 might have allocation issue (doesn't appear in CEMS)

To check
 - 1404 reports generation to CEMS in December, but not to EIA. This is likely correct
 - plant 2504 has three units (120, 121, 122) that don't appear in EIA, and in CEMS only report steam. 

Fuel > 50%
 - Plant 3116 reports much more heat input to CEMS during ozone months than to EIA

BA Totals
 - TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project as belonging to TEPC


In [None]:
# pick the metric and the status bucket to inspect
value = 'fuel_consumed_mmbtu'
status = '>50%'

# list every plant whose status for that metric matches, ordered by the metric itself
in_selected_status = compared[f'{value}_status'] == status
compared[in_selected_status].sort_values(by=value)

In [None]:
# Spot-check a random handful (at most 10) of the plants in the selected status bucket.
# DataFrame.sample raises a ValueError when asked for more rows than are available, so the
# sample size is capped at the number of matching plants. Sorting happens after sampling,
# because sampling shuffles the rows and would discard an earlier sort.
plants_in_status = compared[compared[f'{value}_status'] == status]
plants_in_status.sample(min(10, len(plants_in_status))).sort_values(by=value)

In [None]:
# plant_id_eia value that the following cells use to filter each table
plant_to_explore = 58380

In [None]:
# eGRID record(s) for the plant under investigation
egrid_plant.loc[egrid_plant['plant_id_eia'].eq(plant_to_explore)]

In [None]:
# our calculated annual totals for the plant under investigation
is_selected_plant = plant_annual_total['plant_id_eia'] == plant_to_explore
plant_annual_total[is_selected_plant]

In [None]:
# Column totals of the allocated generation/fuel records for the plant under investigation.
# numeric_only=True restricts the totals to numeric columns: summing text columns would
# concatenate strings, and newer pandas versions raise on columns that cannot be summed.
gen_fuel_allocated[gen_fuel_allocated['plant_id_eia'] == plant_to_explore].sum(numeric_only=True)

In [None]:
# Total the CEMS records for each plant / unit / report_date combination.
# numeric_only=True makes explicit that only numeric columns are summed, instead of relying on
# pandas silently dropping non-numeric columns (behavior that newer pandas versions no longer provide).
cems_unit_monthly = cems.groupby(['plant_id_eia','unitid','report_date']).sum(numeric_only=True).reset_index()


In [None]:
# Column totals of the unit-level CEMS data for the plant under investigation.
# numeric_only=True skips the unit id and date columns, which have no meaningful sum
# (string ids would be concatenated and date columns cannot be summed at all).
cems_unit_monthly[cems_unit_monthly['plant_id_eia'] == plant_to_explore].sum(numeric_only=True)

In [None]:
# CEMS rows for the plant under investigation, limited to rows with a positive operating time
is_selected_plant = cems['plant_id_eia'] == plant_to_explore
was_operating = cems['operating_time_hours'] > 0
cems[is_selected_plant & was_operating]

In [None]:
# annual plant-level CEMS record(s) for the plant under investigation
cems_plant_annual.loc[cems_plant_annual['plant_id_eia'].eq(plant_to_explore)]

In [None]:
# Some plants report heat input and co2 to CEMS but have no net generation data;
# they show up here as rows whose annual net generation equals zero.
# TODO: we should maybe try and fill net generation data using EIA-923?
has_zero_net_generation = cems_plant_annual['net_generation_mwh'].eq(0)
cems_plant_annual[has_zero_net_generation]

## Explore BA Matching
### Notes
TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project as belonging to TEPC

In [None]:
# pair the plants that have no BA code in our data with the plants that have no BA code in eGRID
calc_without_ba = plant_annual_total[plant_annual_total['ba_code'].isna()]
egrid_without_ba = egrid_plant[egrid_plant['ba_code'].isna()]
ba_plant_match = calc_without_ba.merge(egrid_without_ba, how='left', on='plant_id_egrid', suffixes=("_calc",'_egrid'))

# keep the rows whose net generation differs once both values are rounded to whole numbers
net_gen_differs = ba_plant_match['net_generation_mwh_calc'].round(0).ne(ba_plant_match['net_generation_mwh_egrid'].round(0))
ba_plant_match[net_gen_differs]

In [None]:
# eGRID plants assigned to the CPLE balancing authority
egrid_plant.loc[egrid_plant['ba_code'].eq('CPLE')]

In [None]:
# line up the eGRID plant table and our calculated totals side by side, keeping plants that
# appear in only one of the two (outer join); shared column names get _egrid / _calc suffixes
compare_plants_in_ba = pd.merge(
    egrid_plant,
    plant_annual_total,
    how='outer',
    on='plant_id_egrid',
    suffixes=('_egrid','_calc'),
)

In [None]:
# balancing authority and metric to inspect
ba = 'CPLE'
metric = 'fuel_consumed_mmbtu'

# plants assigned to this BA by either source ...
in_selected_ba = (compare_plants_in_ba['ba_code_egrid'] == ba) | (compare_plants_in_ba['ba_code_calc'] == ba)
# ... whose rounded metric value differs between eGRID and our calculation
metric_differs = compare_plants_in_ba[f'{metric}_egrid'].round(0) != compare_plants_in_ba[f'{metric}_calc'].round(0)

compare_plants_in_ba[in_selected_ba & metric_differs]

In [None]:
# our calculated plant totals for plants assigned to the AMPL balancing authority
plant_annual_total.loc[plant_annual_total['ba_code'].eq('AMPL')]