In [17]:
# import packages
import numpy as np
import pandas as pd
import plotly.express as px
from IPython.display import display

%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
import sys
sys.path.append('../../hourly-egrid/')

# import local modules
import src.validation as validation
import src.data_cleaning as data_cleaning

from src.column_checks import get_dtypes, apply_dtypes

# Data year whose pipeline outputs are validated in this notebook
year = 2020
# Optional prefix for the outputs sub-folder; the year is appended to it, so the
# files are read from ../data/outputs/<path_prefix>/ (an empty prefix gives "2020")
path_prefix = ''
path_prefix = path_prefix + str(year)

# Validate Cleaned EIA-923 Data
Notes:
- When net generation is positive but no fuel consumption is reported, this could be due to several reasons:
    - The generator uses a clean fuel, like WAT, and reports 0 mmbtu per physical unit fuel consumed
    - The generator is part of a combined cycle unit and all of the fuel is reported on the other cycle
- Records where all data is zero could just mean that the generator didn't operate in that month

In [3]:
# Load the allocated EIA-923 output for the configured year, using the project's
# shared column dtypes and parsing report_date as a datetime
eia923_allocated = pd.read_csv(
    f'../data/outputs/{path_prefix}/eia923_allocated_{year}.csv',
    dtype=get_dtypes(),
    parse_dates=['report_date'],
)

In [5]:
# Run the validation checks on the allocated EIA-923 data.
# The result of every check is kept in its own variable so it can be inspected later.

# fuel consumption and co2 columns (these values should not be negative)
eia_fuel_and_co2_columns = (
    'fuel_consumed_mmbtu',
    'fuel_consumed_for_electricity_mmbtu',
    'co2_mass_lb',
    'co2_mass_lb_for_electricity',
    'co2_mass_lb_adjusted',
)
# the same columns preceded by net generation, used by the missing-data and all-zero checks
eia_data_columns = ('net_generation_mwh',) + eia_fuel_and_co2_columns

# a fresh list is handed to each check so that no check can affect another one's input
negative_test = validation.test_for_negative_values(
    eia923_allocated, list(eia_fuel_and_co2_columns)
)

# positive net generation should come with non-zero fuel consumption
missing_fuel_test = validation.test_for_missing_fuel(eia923_allocated, 'net_generation_mwh')

# fuel consumed for electricity should be less than total fuel consumed
chp_allocation_test = validation.test_chp_allocation(eia923_allocated)

# missing co2 data
missing_co2_test = validation.test_for_missing_co2(eia923_allocated)

# generators with no data in any of the data columns
missing_data_test = validation.test_for_missing_data(eia923_allocated, list(eia_data_columns))

# generators where every data column equals 0
zero_data_test = validation.test_for_zero_data(eia923_allocated, list(eia_data_columns))

# missing energy source code
missing_esc_test = validation.test_for_missing_energy_source_code(eia923_allocated)

# missing and incorrect prime movers (the check returns two results)
incorrect_pm_test, missing_pm_test = validation.test_for_missing_incorrect_prime_movers(
    eia923_allocated, year
)

# missing subplant ids
eia_missing_subplant_test = validation.test_for_missing_subplant_id(eia923_allocated)



In [6]:
# check heat rates
# The call appears to print a "Heat Rate Test" summary of median / max / min values
# (see this cell's output); its return value is kept in heat_rate_test for inspection.
heat_rate_test = validation.test_for_outlier_heat_rates(eia923_allocated)

Heat Rate Test
             median = 13.61, max = 1533.0, min = 3.05
             median = 0.0, max = 0.0, min = 0.0
             median = 4.21, max = 302.34, min = 0.61
             median = 9.85, max = 14939.09, min = -0.0
             median = 5.26, max = 1230.74, min = 0.0
             median = 0.0, max = 10.26, min = 0.0
             median = 11.79, max = 4304.27, min = 5.35
             median = 17.7, max = 219.0, min = -0.0
             median = 10.43, max = 1472.0, min = -0.0
             median = 8.77, max = 8.78, min = 8.77
             median = 8.77, max = 8.77, min = 8.77
             median = 16.27, max = 30.17, min = 13.16
             median = 15.0, max = 191.0, min = -0.0
             median = 0.0, max = 15.02, min = 0.0
             median = 13.78, max = 118.82, min = 3.31
             median = 11.89, max = 75.21, min = 4.11
             median = 14.12, max = 108.12, min = 6.67
             median = 11.8, max = 106.39, min = 5.01
             median = 19.37, max = 726.

In [11]:
# What share of generation, fuel and emissions is attributed to each hourly data source
# (CEMS vs EIA)? Values are fractions of each column's total (0-1), rounded to 3 decimals.
# NOTE: This does not include emissions only reported by CEMS, so the CEMS share may be higher
share_columns = ["net_generation_mwh", "fuel_consumed_mmbtu", 'co2_mass_lb', "co2_mass_lb_for_electricity"]

# aggregate once and reuse the result for both the numerator and the column totals
totals_by_source = eia923_allocated.groupby('hourly_data_source')[share_columns].sum()

# divide each source's total by the column total across all sources
(totals_by_source / totals_by_source.sum(axis=0)).round(3)

Unnamed: 0_level_0,net_generation_mwh,fuel_consumed_mmbtu,co2_mass_lb,co2_mass_lb_for_electricity
hourly_data_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cems,0.569,0.512,0.811,0.893
eia,0.43,0.486,0.185,0.105
partial_cems,0.002,0.002,0.003,0.003


# Validate Cleaned CEMS Data

In [18]:
# Load the cleaned CEMS output for the configured year, using the project's
# shared column dtypes and parsing report_date as a datetime
cems = pd.read_csv(
    f'../data/outputs/{path_prefix}/cems_{year}.csv',
    dtype=get_dtypes(),
    parse_dates=['report_date'],
)

In [14]:
# Run the validation checks on the cleaned CEMS data.

# fuel consumption, co2 emissions and gross generation should not be negative
cems_non_negative_columns = [
    'fuel_consumed_mmbtu',
    'fuel_consumed_for_electricity_mmbtu',
    'co2_mass_lb',
    'co2_mass_lb_adjusted',
    'gross_generation_mwh',
]
cems_negative_test = validation.test_for_negative_values(cems, cems_non_negative_columns)

# positive gross generation should come with non-zero fuel consumption
cems_missing_fuel_test = validation.test_for_missing_fuel(cems, 'gross_generation_mwh')

# fuel consumed for electricity should be less than total fuel consumed
cems_chp_allocation_test = validation.test_chp_allocation(cems)

# missing co2 data
cems_missing_co2_test = validation.test_for_missing_co2(cems)

# missing energy source code
cems_missing_esc_test = validation.test_for_missing_energy_source_code(cems)

# the subplant mapping should be complete (no missing subplant ids)
cems_missing_subplant_test = validation.test_for_missing_subplant_id(cems)

# look for net generation values greater than gross generation
gtn_test = validation.test_gtn_results(cems)




In [15]:
# Show each distinct plant / unit pair in the result of the missing-subplant-id check
missing_subplant_unit_columns = ['plant_id_eia', 'unitid']
cems_missing_subplant_test.loc[:, missing_subplant_unit_columns].drop_duplicates()

Unnamed: 0,plant_id_eia,unitid
614358,202,1
616517,203,1
1245077,59338,1CTGA
1262645,59338,1CTGB
1280213,60768,2CTGA
...,...,...
28061681,7504,1
28122448,55479,1
28131232,56319,1
28140016,56596,1


In [16]:
# Show each distinct plant / unit pair in the result of the missing-energy-source-code check
missing_esc_unit_columns = ['plant_id_eia', 'unitid']
cems_missing_esc_test.loc[:, missing_esc_unit_columns].drop_duplicates()

Unnamed: 0,plant_id_eia,unitid
3586599,60698,D1
3591736,60698,D7
12564109,55088,GT2100
12572893,55088,GT3100
19767709,10244,B002
19771382,10244,B003
20899238,50397,36
21626322,50397,39
22767879,50481,253-26
27014394,3982,1


# Make sure that each subplant has been identified from a unique source

In [19]:
partial_cems_scaled = pd.read_csv(f'../data/outputs/{path_prefix}/partial_cems_scaled_{year}.csv', dtype=get_dtypes(), parse_dates=["report_date"])

# aggregate cems data to subplant level
cems_filtered = data_cleaning.aggregate_cems_to_subplant(cems)

# drop data from cems that is now in partial_cems
cems_filtered = data_cleaning.filter_unique_cems_data(cems_filtered, partial_cems_scaled)

# merge the plant attributes into each dataframe
# NOTE(review): unlike the other reads in this notebook, this one does not pass
# dtype=get_dtypes() - confirm that the plant_id_eia dtype matches the merge partners.
plant_attributes = pd.read_csv(f"../data/outputs/{path_prefix}/plant_static_attributes_{year}.csv")


def merge_plant_attributes(df, attributes):
    """Left-merge the plant attribute columns onto `df` by plant_id_eia.

    Any attribute column already present in `df` is dropped before merging, so
    the merge never creates `_x` / `_y` duplicate columns. This makes the cell
    safe to re-run even though `eia923_allocated` is defined in an earlier cell
    and reassigned here. When `df` shares no columns with `attributes` other
    than plant_id_eia, the result is the same as a plain left merge.

    Args:
        df: dataframe with a plant_id_eia column.
        attributes: plant attribute dataframe with a plant_id_eia column.

    Returns:
        A new dataframe; `df` itself is not modified.
    """
    attribute_columns = [col for col in attributes.columns if col != "plant_id_eia"]
    return df.drop(columns=attribute_columns, errors="ignore").merge(
        attributes, how="left", on="plant_id_eia"
    )


eia923_allocated = merge_plant_attributes(eia923_allocated, plant_attributes)
cems_filtered = merge_plant_attributes(cems_filtered, plant_attributes)
partial_cems_scaled = merge_plant_attributes(partial_cems_scaled, plant_attributes)

In [20]:
# Compare the subplants labelled as CEMS in the EIA-923 data with the subplants
# that are actually present in the filtered CEMS data.
cems_merge_keys = ["plant_id_eia", "subplant_id"]

# subplants whose hourly_data_source in the allocated EIA-923 data is 'cems'
labelled_as_cems = eia923_allocated["hourly_data_source"] == 'cems'
subplants_ided_as_cems = eia923_allocated.loc[labelled_as_cems, cems_merge_keys].drop_duplicates()

# subplants that exist in the cems data
subplants_in_cems = cems_filtered.loc[:, cems_merge_keys].drop_duplicates()

# Outer-merge the two lists and record where each subplant was found.
# Ideally "source" is "both" for every subplant:
#   left_only  - a subplant was labelled as cems but is not in the cems data (incorrect)
#   right_only - a subplant is in the cems data but not labelled as cems in EIA; this is
#                okay because it means that there is no overlap
cems_overlap = subplants_ided_as_cems.merge(
    subplants_in_cems,
    how="outer",
    on=cems_merge_keys,
    indicator="source",
)

# show only the subplants that were not found in both lists
cems_overlap.loc[cems_overlap["source"] != "both"]

Unnamed: 0,plant_id_eia,subplant_id,source
3100,315,,right_only
3101,335,,right_only
3102,1378,,right_only
3103,1702,,right_only
3104,2503,,right_only
3105,2828,,right_only
3106,2953,2.0,right_only
3107,2953,3.0,right_only
3108,3399,,right_only
3109,3406,,right_only


In [22]:
# Compare the subplants labelled as partial CEMS in the EIA-923 data with the
# subplants that are actually present in the scaled partial CEMS data.
pc_merge_keys = ["plant_id_eia", "subplant_id"]

# subplants whose hourly_data_source in the allocated EIA-923 data is 'partial_cems'
labelled_as_pc = eia923_allocated["hourly_data_source"] == 'partial_cems'
subplants_ided_as_pc = eia923_allocated.loc[labelled_as_pc, pc_merge_keys].drop_duplicates()

# subplants that exist in the partial cems data
subplants_in_pc = partial_cems_scaled.loc[:, pc_merge_keys].drop_duplicates()

# Outer-merge the two lists and record where each subplant was found.
# Ideally "source" is "both" for every subplant:
#   left_only  - a subplant was labelled as partial cems but is not in the partial cems
#                data (incorrect)
#   right_only - a subplant is in the partial cems data but not labelled as partial cems
#                in EIA; this is okay because it means that there is no overlap
pc_overlap = subplants_ided_as_pc.merge(
    subplants_in_pc,
    how="outer",
    on=pc_merge_keys,
    indicator="source",
)

# show only the subplants that were not found in both lists
pc_overlap.loc[pc_overlap["source"] != "both"]

Unnamed: 0,plant_id_eia,subplant_id,source
