In [1]:
# import packages
import pandas as pd

%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
import sys
sys.path.append('../../hourly-egrid/')

# import local modules
import src.validation as validation

from src.column_checks import get_dtypes

# Data year to validate; it is interpolated into the output file paths read by the loading cells.
year = 2020
# Optional prefix placed in front of the year (empty here). The combined value is the
# name of the subfolder under ../data/outputs/ that the CSVs are read from.
path_prefix = ''
path_prefix = f"{path_prefix}{year}"

# Validate Cleaned EIA-923 Data
Notes:
- When net generation is positive but no fuel consumption is reported, this could be due to several reasons:
    - The generator uses a clean fuel, like water (WAT), and reports a heat content of 0 mmbtu per physical unit of fuel consumed
    - The generator is part of a combined cycle unit and all of the fuel is reported on the other cycle
- Records where all data is zero could just mean that the generator didn't operate in that month

In [2]:
# Load the allocated EIA-923 output for the configured year, applying the project's
# column dtypes and parsing report_date as a datetime column.
eia923_allocated = pd.read_csv(
    f'../data/outputs/{path_prefix}/eia923_allocated_{year}.csv',
    dtype=get_dtypes(),
    parse_dates=['report_date'],
)

In [14]:
# Run the validation checks on the allocated EIA-923 data.

# Fuel and CO2 columns shared by several of the checks below.
fuel_and_co2_columns = [
    'fuel_consumed_mmbtu',
    'fuel_consumed_for_electricity_mmbtu',
    'co2_mass_lb',
    'co2_mass_lb_for_electricity',
    'co2_mass_lb_adjusted',
]
# The missing-data and zero-data checks additionally look at net generation.
generation_fuel_and_co2_columns = ['net_generation_mwh', *fuel_and_co2_columns]

# fuel consumption and co2 emissions should not be negative
# (each call receives its own copy of the column list)
negative_test = validation.test_for_negative_values(
    eia923_allocated, list(fuel_and_co2_columns)
)

# if net generation is positive, fuel consumption should be non zero
missing_fuel_test = validation.test_for_missing_fuel(
    eia923_allocated, 'net_generation_mwh'
)

# fuel consumed for electricity should be less than fuel consumed
chp_allocation_test = validation.test_chp_allocation(eia923_allocated)

# check for missing co2 data
missing_co2_test = validation.test_for_missing_co2(eia923_allocated)

# check for generators with no data
missing_data_test = validation.test_for_missing_data(
    eia923_allocated, list(generation_fuel_and_co2_columns)
)

# check for generators with all data = 0
zero_data_test = validation.test_for_zero_data(
    eia923_allocated, list(generation_fuel_and_co2_columns)
)

# check for missing energy source code
missing_esc_test = validation.test_for_missing_energy_source_code(eia923_allocated)

# check for missing and incorrect prime movers (two results are returned)
incorrect_pm_test, missing_pm_test = validation.test_for_missing_incorrect_prime_movers(
    eia923_allocated, year
)

# check for missing subplant ids
eia_missing_subplant_test = validation.test_for_missing_subplant_id(eia923_allocated)



In [17]:
# Display the records returned by the missing-data check. In the saved output these
# are rows whose generation, fuel and emissions columns are all empty.
missing_data_test

Unnamed: 0,report_date,plant_id_eia,generator_id,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb,ch4_mass_lb,n2o_mass_lb,co2e_mass_lb,nox_mass_lb,so2_mass_lb,co2_mass_lb_for_electricity,ch4_mass_lb_for_electricity,n2o_mass_lb_for_electricity,co2e_mass_lb_for_electricity,nox_mass_lb_for_electricity,so2_mass_lb_for_electricity,co2_mass_lb_adjusted,ch4_mass_lb_adjusted,n2o_mass_lb_adjusted,co2e_mass_lb_adjusted,nox_mass_lb_adjusted,so2_mass_lb_adjusted,co2_mass_lb_for_electricity_adjusted,ch4_mass_lb_for_electricity_adjusted,n2o_mass_lb_for_electricity_adjusted,co2e_mass_lb_for_electricity_adjusted,nox_mass_lb_for_electricity_adjusted,so2_mass_lb_for_electricity_adjusted,subplant_id,prime_mover_code,energy_source_code,hourly_data_source
3,2020-01-01,3,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,ST,NG,cems
4,2020-01-01,3,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,ST,NG,cems
5,2020-01-01,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,ST,BIT,cems
6,2020-01-01,3,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,ST,BIT,cems
7,2020-01-01,3,A1CT,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,CT,NG,cems
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253701,2020-12-01,2070,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,GT,NG,cems
253702,2020-12-01,2070,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,GT,NG,cems
253703,2020-12-01,2070,GTG1,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,CT,NG,cems
253704,2020-12-01,2070,GTG2,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,CT,NG,cems


In [None]:
# check heat rates
# The result is stored in heat_rate_test but not displayed by this cell.
# NOTE(review): this cell has no execution count in the saved notebook, so it may not
# have been run in the saved session — confirm it runs cleanly.
heat_rate_test = validation.test_for_outlier_heat_rates(eia923_allocated)

In [6]:
# Fraction (0-1, rounded to 3 decimals) of total net generation, fuel consumption and
# CO2 emissions that falls under each hourly_data_source value.
# NOTE: This does not include emissions only reported by CEMS, so the % may be higher
share_columns = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "co2_mass_lb",
    "co2_mass_lb_for_electricity",
]
# Per-source totals, computed once and reused as numerator and (summed) denominator.
totals_by_source = eia923_allocated.groupby('hourly_data_source')[share_columns].sum()
(totals_by_source / totals_by_source.sum(axis=0)).round(3)

Unnamed: 0_level_0,net_generation_mwh,fuel_consumed_mmbtu,co2_mass_lb,co2_mass_lb_for_electricity
hourly_data_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cems,0.567,0.524,0.833,0.919
eia,0.431,0.474,0.163,0.079
partial_cems,0.001,0.002,0.004,0.002


### Run Validation tests on cleaned CEMS data

In [7]:
# Load the cleaned CEMS output for the configured year, applying the project's
# column dtypes and parsing report_date as a datetime column.
cems = pd.read_csv(
    f'../data/outputs/{path_prefix}/cems_{year}.csv',
    dtype=get_dtypes(),
    parse_dates=['report_date'],
)

In [23]:
# fuel consumption and co2 emissions should not be negative
# NOTE(review): no column list is passed here, unlike the EIA-923 call, which passes one
# explicitly — presumably the function's default columns are checked; confirm.
cems_negative_test = validation.test_for_negative_values(cems)

# if gross generation is positive, fuel consumption should be non zero
# (CEMS is checked against gross_generation_mwh rather than net_generation_mwh)
cems_missing_fuel_test = validation.test_for_missing_fuel(cems,'gross_generation_mwh')

# fuel consumed for electricity should be less than fuel consumed
cems_chp_allocation_test = validation.test_chp_allocation(cems)

# check for missing co2 data
cems_missing_co2_test = validation.test_for_missing_co2(cems)

# check for missing energy source code
# NOTE(review): this check is disabled for CEMS and the reason is not recorded —
# confirm whether it should be re-enabled or removed.
#cems_missing_esc_test = validation.test_for_missing_energy_source_code(cems)

# test to make sure that there is a complete subplant mapping
cems_missing_subplant_test = validation.test_for_missing_subplant_id(cems)

# test to see if there are any net generation values greater than gross generation
gtn_test = validation.test_gtn_results(cems)




# Make sure that each subplant has been identified from a unique source