In [3]:
# import packages
import numpy as np
import pandas as pd
import plotly.express as px
from IPython.display import display

In [4]:
# Reload imported modules automatically before each cell runs, so edits to the
# local modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Add a directory to the module search path so that the local `src` package imported
# below can be found.
# NOTE(review): presumably '../../hourly-egrid/' is the repository root — confirm.
import sys
sys.path.append('../../hourly-egrid/')

# import local modules
import src.load_data as load_data
from src.data_cleaning import assign_ba_code_to_plant
import src.validation as validation

from src.column_checks import get_dtypes, apply_dtypes

# Specify the year for analysis and load the data
This validation can only be run for a single year at a time

In [None]:
# analysis year (the validation below is run for one year at a time)
year = 2020

# locations of the cleaned CEMS data and the allocated EIA-923 data
# NOTE(review): the "Data Quality Metrics" section of this notebook reads the same tables
# from ../data/outputs/{year}/ instead — confirm which location is current.
cems_path = f'../data/outputs/cems_{year}.csv'
eia923_allocated_path = f'../data/outputs/eia923_allocated_{year}.csv'

# parse the date columns while reading
cems = pd.read_csv(cems_path, parse_dates=['datetime_utc', 'report_date'])
eia923_allocated = pd.read_csv(eia923_allocated_path, parse_dates=['report_date'])

In [None]:
# what percent of emissions is reported in CEMS vs EIA
# NOTE: This does not include emissions only reported by CEMS, so the % may be higher
co2_by_source = eia923_allocated.groupby('hourly_data_source')['co2_mass_lb_adjusted'].sum()

# share of the total adjusted CO2 mass attributed to each hourly data source
(co2_by_source / co2_by_source.sum(axis=0)).round(3)

### Run Validation Tests on Cleaned EIA-923 data
Notes:
- When net generation is positive but no fuel consumption is reported, this could be due to several reasons:
    - The generator uses a clean fuel, like WAT, and reports 0 mmbtu per physical unit fuel consumed
    - The generator is part of a combined cycle unit and all of the fuel is reported on the other cycle
- Records where all data is zero could just mean that the generator didn't operate in that month

In [None]:
# perform checks on allocated data
# columns holding fuel consumption and co2 emissions
fuel_and_co2_columns = [
    'fuel_consumed_mmbtu',
    'fuel_consumed_for_electricity_mmbtu',
    'co2_mass_lb',
    'co2_mass_lb_for_electricity',
    'co2_mass_lb_adjusted',
]
# all data columns: net generation plus the fuel and co2 columns
all_data_columns = ['net_generation_mwh'] + fuel_and_co2_columns

# fuel consumption and co2 emissions should not be negative
negative_test = validation.test_for_negative_values(eia923_allocated, list(fuel_and_co2_columns))

# if net generation is positive, fuel consumption should be non zero
missing_fuel_test = validation.test_for_missing_fuel(eia923_allocated, 'net_generation_mwh')

# fuel consumed for electricity should not exceed total fuel consumed
chp_allocation_test = validation.test_chp_allocation(eia923_allocated)

# check for missing co2 data
missing_co2_test = validation.test_for_missing_co2(eia923_allocated)

# check for generators with no data
missing_data_test = validation.test_for_missing_data(eia923_allocated, list(all_data_columns))

# check for generators with all data = 0
zero_data_test = validation.test_for_zero_data(eia923_allocated, list(all_data_columns))

# check for missing energy source code
missing_esc_test = validation.test_for_missing_energy_source_code(eia923_allocated)

# check for missing and incorrect prime movers
incorrect_pm_test, missing_pm_test = validation.test_for_missing_incorrect_prime_movers(
    eia923_allocated, year
)

# check for missing subplant ids
eia_missing_subplant_test = validation.test_for_missing_subplant_id(eia923_allocated)

# check heat rates
heat_rate_test = validation.test_for_outlier_heat_rates(eia923_allocated)

In [None]:
# show the result of the outlier heat rate test, sorted by ascending heat rate
heat_rate_test.sort_values(by='heat_rate')

### Run Validation tests on cleaned CEMS data

In [None]:
# fuel consumption, co2 emissions and gross generation should not be negative
cems_columns_to_check = [
    'fuel_consumed_mmbtu',
    'fuel_consumed_for_electricity_mmbtu',
    'co2_mass_lb',
    'co2_mass_lb_adjusted',
    'gross_generation_mwh',
]
cems_negative_test = validation.test_for_negative_values(cems, cems_columns_to_check)

# if gross generation is positive, fuel consumption should be non zero
cems_missing_fuel_test = validation.test_for_missing_fuel(cems, 'gross_generation_mwh')

# fuel consumed for electricity should not exceed total fuel consumed
cems_chp_allocation_test = validation.test_chp_allocation(cems)

# check for missing co2 data
cems_missing_co2_test = validation.test_for_missing_co2(cems)

# check for missing energy source code
cems_missing_esc_test = validation.test_for_missing_energy_source_code(cems)

# test to make sure that there is a complete subplant mapping
cems_missing_subplant_test = validation.test_for_missing_subplant_id(cems)

# test to see if there are any net generation values greater than gross generation
gtn_test = validation.test_gtn_results(cems)


In [None]:
# list the unique plant / unit combinations in the CEMS missing-subplant test result
cems_missing_subplant_test[['plant_id_eia','unitid']].drop_duplicates()

In [None]:
# list the unique plant / unit combinations in the CEMS missing-energy-source-code test result
cems_missing_esc_test[['plant_id_eia','unitid']].drop_duplicates()

# Data Quality Metrics

### Evaluate data source mismatch

In [5]:
# analysis year and the folder holding that year's output tables
year = 2020
output_dir = f'../data/outputs/{year}'

# read the three output tables with the project's column dtypes
cems = pd.read_csv(f'{output_dir}/cems_{year}.csv', dtype=get_dtypes())
partial_cems_scaled = pd.read_csv(
    f'{output_dir}/partial_cems_scaled_{year}.csv', dtype=get_dtypes()
)
eia923_allocated = pd.read_csv(
    f'{output_dir}/eia923_allocated_{year}.csv', dtype=get_dtypes()
)

# attach the static plant attributes to the EIA-923 and CEMS tables
plant_attributes = pd.read_csv(f"{output_dir}/plant_static_attributes_{year}.csv")
eia923_allocated = eia923_allocated.merge(plant_attributes, how="left", on="plant_id_eia")
cems = cems.merge(plant_attributes, how="left", on="plant_id_eia")

In [33]:
# attach the static plant attributes to the scaled partial CEMS data
# NOTE: this overwrites its own input, so running the cell twice merges the attribute
# columns a second time (pandas then suffixes the duplicated column names)
partial_cems_scaled = partial_cems_scaled.merge(plant_attributes, how="left", on="plant_id_eia")

In [34]:
# BA code and fuel category to examine
ba = "CISO"
fuel = "natural_gas"

# boolean masks selecting the chosen BA / fuel rows in each table
eia_rows = (eia923_allocated["ba_code"] == ba) & (eia923_allocated["fuel_category"] == fuel)
cems_rows = (cems["ba_code"] == ba) & (cems["fuel_category"] == fuel)
pc_rows = (partial_cems_scaled["ba_code"] == ba) & (partial_cems_scaled["fuel_category"] == fuel)

test_eia = eia923_allocated[eia_rows]
test_cems = cems[cems_rows]
test_pc = partial_cems_scaled[pc_rows]

In [7]:
# net generation in the filtered EIA-923 data, split by hourly data source
# (select the column before summing so that text columns are never aggregated)
test_eia.groupby('hourly_data_source')['net_generation_mwh'].sum()

hourly_data_source
cems            55542242.5
eia             20288382.6
partial_cems       66089.0
Name: net_generation_mwh, dtype: float64

In [10]:
# total gross and net generation in the filtered CEMS data
test_cems[["gross_generation_mwh",'net_generation_mwh']].sum()

gross_generation_mwh    5.907727e+07
net_generation_mwh      4.246262e+07
dtype: float64

In [38]:
# total net generation in the filtered partial CEMS data
test_pc[['net_generation_mwh']].sum()

net_generation_mwh    51859.426708
dtype: float64

In [42]:
# remove the existing 'source' column; errors='ignore' keeps the cell re-runnable
# once the column has already been dropped
test_pc = test_pc.drop(columns='source', errors='ignore')

In [44]:
# remove the existing 'source' column; errors='ignore' keeps the cell re-runnable
# once the column has already been dropped
test_cems = test_cems.drop(columns='source', errors='ignore')

In [45]:
# subplant-months that appear in the partial CEMS data
subplant_month_keys = ["plant_id_eia", "subplant_id", "report_date"]
partial_cems_subplant_months = test_pc[subplant_month_keys].drop_duplicates()

# anti-join: tag every row with the table(s) its subplant-month came from ...
cems_tagged = test_cems.merge(
    partial_cems_subplant_months,
    how="outer",
    on=subplant_month_keys,
    indicator="source",
)

# ... and keep only the rows that exist in CEMS but not in the partial CEMS data
only_in_cems = cems_tagged["source"] == "left_only"
filtered_cems = cems_tagged[only_in_cems].drop(columns=["source"])

In [47]:
# total gross and net generation in the CEMS rows kept in filtered_cems
filtered_cems[["gross_generation_mwh",'net_generation_mwh']].sum()

gross_generation_mwh    5.902732e+07
net_generation_mwh      4.243357e+07
dtype: float64

In [56]:
subplant_keys = ["plant_id_eia", "subplant_id"]

# subplants that the EIA-923 table labels as 'cems'
labelled_cems = test_eia["hourly_data_source"] == 'cems'
subplants_ided_as_cems = test_eia.loc[labelled_cems, subplant_keys].drop_duplicates()

# subplants that are present in the filtered CEMS data
subplants_in_cems = filtered_cems[subplant_keys].drop_duplicates()

# outer merge with an indicator column showing which side each subplant came from
cems_overlap = subplants_ided_as_cems.merge(
    subplants_in_cems, how="outer", on=subplant_keys, indicator="source"
)
cems_overlap

Unnamed: 0,plant_id_eia,subplant_id,source
0,260,2,both
1,260,3,both
2,315,2,both
3,315,4,both
4,335,1,both
...,...,...,...
138,7449,0,both
139,60698,,both
140,315,,right_only
141,335,,right_only


In [57]:
subplant_keys = ["plant_id_eia", "subplant_id"]

# subplants that the EIA-923 table labels as 'partial_cems'
labelled_partial_cems = test_eia["hourly_data_source"] == 'partial_cems'
subplants_ided_as_pc = test_eia.loc[labelled_partial_cems, subplant_keys].drop_duplicates()

# subplants that are present in the partial CEMS data
subplants_in_pc = test_pc[subplant_keys].drop_duplicates()

# outer merge with an indicator column showing which side each subplant came from
pc_overlap = subplants_ided_as_pc.merge(
    subplants_in_pc, how="outer", on=subplant_keys, indicator="source"
)
pc_overlap

Unnamed: 0,plant_id_eia,subplant_id,source
0,55295,0,both
1,55748,0,left_only
2,56026,0,both
3,55333,0,both
4,55393,0,both


In [61]:
# total net generation in the filtered CEMS data for plant 55748
test_cems.loc[test_cems['plant_id_eia'] == 55748, "net_generation_mwh"].sum()

249125.05209945003

### Compare monthly totals for each plant from each data source

In [None]:
# for plants where there is data reported in cems, see how off it is from data reported in eia
subplant_month_keys = ['plant_id_eia', 'subplant_id', 'report_date']
cems_value_columns = ['gross_generation_mwh', 'net_generation_mwh', 'fuel_consumed_mmbtu', 'fuel_consumed_for_electricity_mmbtu', 'co2_mass_lb', 'co2_mass_lb_adjusted']
columns_to_compare = ['net_generation_mwh', 'fuel_consumed_mmbtu', 'fuel_consumed_for_electricity_mmbtu', 'co2_mass_lb', 'co2_mass_lb_adjusted']

# aggregate each source to subplant-month totals
# Only numeric columns are summed: both tables also carry text columns (e.g. ba_code,
# fuel_category), which pandas >= 2.0 no longer drops silently from groupby().sum().
cems_plant_monthly = cems.groupby(subplant_month_keys, dropna=False)[cems_value_columns].sum().reset_index()
gf_plant_monthly = eia923_allocated.groupby(subplant_month_keys, dropna=False).sum(numeric_only=True).reset_index()

# keep only the subplant-months that are present in both sources
compare_cems_eia = gf_plant_monthly.merge(cems_plant_monthly, how='inner', on=subplant_month_keys, suffixes=("_eia", '_cems'))

# percent difference of the CEMS value relative to the EIA value
# zeros are replaced by 0.1 so that the division never divides by zero
for column in columns_to_compare:
    cems_values = compare_cems_eia[f'{column}_cems'].replace(0, 0.1)
    eia_values = compare_cems_eia[f'{column}_eia'].replace(0, 0.1)
    compare_cems_eia[f'{column}_pctdiff'] = ((cems_values - eia_values) / eia_values).round(3)

# index by subplant-month and order the columns alphabetically so that the
# _cems / _eia / _pctdiff columns of each metric sit next to each other
compare_cems_eia = compare_cems_eia.set_index(subplant_month_keys)
compare_cems_eia = compare_cems_eia.reindex(sorted(compare_cems_eia.columns), axis=1)

In [None]:
# identify where there are differences between reported CEMS and EIA values for the same subplant-month
value = 'net_generation_mwh'

comparison = compare_cems_eia[[f'{value}_cems', f'{value}_eia', f'{value}_pctdiff']]

# show the rows whose percent difference is not within +/- 5%
within_tolerance = comparison[f'{value}_pctdiff'].between(-0.05, 0.05)
comparison[~within_tolerance]

## Compare Our Results to eGRID

In [None]:
# filter the data for which we only have EIA data (and fuel consumption is not missing)
eia_only = eia923_allocated['hourly_data_source'] == 'eia'
has_fuel_data = eia923_allocated['fuel_consumed_mmbtu'].notna()
monthly_eia_data_to_distribute = eia923_allocated[eia_only & has_fuel_data]

# assign ba codes to the data
monthly_eia_data_to_distribute = assign_ba_code_to_plant(monthly_eia_data_to_distribute, year)
cems = assign_ba_code_to_plant(cems, year)

In [None]:
# Aggregate total calculated values
###################################

plant_keys = ['ba_code', 'state', 'plant_id_eia']
cems_annual_columns = ['net_generation_mwh', 'fuel_consumed_mmbtu', 'co2_mass_lb', 'co2_mass_lb_adjusted']
eia_annual_columns = ['net_generation_mwh', 'fuel_consumed_mmbtu', 'fuel_consumed_for_electricity_mmbtu', 'co2_mass_lb', 'co2_mass_lb_adjusted']

# Aggregate cems and eia data by plant id, then combine
# The value columns are selected before summing so that the remaining text and date
# columns of the source tables are never aggregated.
cems_plant_annual = cems.groupby(plant_keys, dropna=False)[cems_annual_columns].sum().reset_index()
eia_plant_annual = monthly_eia_data_to_distribute.groupby(plant_keys, dropna=False)[eia_annual_columns].sum().reset_index()
plant_annual_total = pd.concat([cems_plant_annual, eia_plant_annual], axis=0)
# group any plants that have records from both datasets
plant_annual_total = plant_annual_total.groupby(plant_keys, dropna=False).sum().reset_index()

# add an egrid id
plant_annual_total = validation.add_egrid_plant_id(plant_annual_total, from_id='eia', to_id='egrid')

# Load the eGRID plant table
egrid_plant = validation.load_egrid_plant_file(year)

### Identify plants in eGRID missing from our results

These plants seem to have retired before 2020

In [None]:
# identify any plants that are in egrid but not in our totals
# The id list and the filter below must use the same id column; the eGRID plant id is
# used for both, as in the other eGRID comparisons in this notebook.
plant_not_in_calc = list(set(egrid_plant['plant_id_egrid'].unique()) - set(plant_annual_total['plant_id_egrid'].unique()))

# Which plants are included in eGRID but are missing from our calculations?
missing_from_calc = egrid_plant[egrid_plant['plant_id_egrid'].isin(plant_not_in_calc)]

# see if any of these plants are retired: attach the unique generator retirement dates
# reported for each plant in EIA-860
generators_eia860 = load_data.load_pudl_table('generators_eia860', year=year)
retirement_dates = generators_eia860.groupby('plant_id_eia')['retirement_date'].unique().reset_index()
missing_from_calc.merge(retirement_dates, how='left', on='plant_id_eia')

### Identify plants in our calculations that are missing from eGRID

In [None]:
# Which plants are in our calculations, but are missing from eGRID?
calc_plant_ids = set(plant_annual_total['plant_id_egrid'].unique())
egrid_plant_ids = set(egrid_plant['plant_id_egrid'].unique())
plants_not_in_egrid = list(calc_plant_ids - egrid_plant_ids)

# plant names and sector names used to label the result
plant_names = load_data.load_pudl_table('plants_entity_eia')[['plant_id_eia','plant_name_eia','sector_name_eia']]

in_calc_only = plant_annual_total['plant_id_egrid'].isin(plants_not_in_egrid)
missing_from_egrid = plant_annual_total[in_calc_only].merge(plant_names, how='left', on='plant_id_eia')

missing_from_egrid

In [None]:
# how many of the plants missing from egrid have non-zero data
# ("non-zero" is tested here as fuel_consumed_mmbtu > 1; count() reports the number of
# non-missing values in each column for those plants)
missing_from_egrid[missing_from_egrid['fuel_consumed_mmbtu'] > 1].count()

### Compare whether totals for plants with EPA IDs that differ from EIA IDs match

In [None]:
# identify where there is a single egrid plant id for multiple eia plant ids
double_ids = plant_annual_total[plant_annual_total['plant_id_egrid'].duplicated(keep=False)]
# focus on net generation for now; the column is selected before summing so that
# the text columns (ba_code, state) are never aggregated
double_ids = double_ids.groupby('plant_id_egrid')['net_generation_mwh'].sum().reset_index()
# merge the egrid data
double_ids = double_ids.merge(egrid_plant[['plant_id_egrid','net_generation_mwh']], how='left', on='plant_id_egrid', suffixes=('_calc','_egrid'))
# difference of our total relative to the eGRID value
double_ids['percent_diff'] = ((double_ids['net_generation_mwh_calc'] - double_ids['net_generation_mwh_egrid']) / double_ids['net_generation_mwh_egrid']).round(3)
double_ids

### Identify plants where our BA assignment does not match eGRID

In [None]:
# BA code per plant from eGRID and from our calculations, both indexed by EIA plant id
egrid_ba_codes = egrid_plant.set_index('plant_id_eia')[['plant_name','ba_code']]
calc_ba_codes = plant_annual_total.set_index('plant_id_eia')[['ba_code']]
ba_code_match = egrid_ba_codes.merge(
    calc_ba_codes, how='inner', left_index=True, right_index=True, suffixes=("_egrid",'_calc')
)

# plants with missing ba code
calc_code_missing = ba_code_match['ba_code_calc'].isna()
egrid_code_present = ba_code_match['ba_code_egrid'].notna()
ba_code_match[calc_code_missing & egrid_code_present]

In [None]:
# plants with incorrect ba code: our code is present but differs from the eGRID code
codes_differ = ba_code_match['ba_code_calc'] != ba_code_match['ba_code_egrid']
calc_code_present = ba_code_match['ba_code_calc'].notna()
ba_code_match[codes_differ & calc_code_present]

## Identify where eGRID might be missing data
It seems that there are quite a few generators where fuel consumption data is missing

In [None]:
pudl_out = load_data.initialize_pudl_out(year)

# load the EIA generator fuel data
IDX_PM_ESC = ["report_date", "plant_id_eia", "energy_source_code", "prime_mover_code"]
# explicit copy, so that adding the eGRID id column below only affects this frame
gf = (
    pudl_out.gf_eia923()
    .loc[
        :,
        IDX_PM_ESC
        + [
            "net_generation_mwh",
            "fuel_consumed_mmbtu",
            "fuel_consumed_for_electricity_mmbtu",
        ],
    ]
    .copy()
)

# add egrid plant ids
egrid_crosswalk = pd.read_csv(
    "../data/manual/egrid_static_tables/table_C5_crosswalk_of_EIA_ID_to_EPA_ID.csv"
)
eia_to_egrid_id = dict(
    zip(list(egrid_crosswalk["plant_id_eia"]), list(egrid_crosswalk["plant_id_egrid"]))
)
# Plants listed in the crosswalk get their eGRID id, all others keep their EIA id.
# The column is assigned in one step: updating gf["plant_id_egrid"] in place through
# Series.update() is a chained write that does not reach `gf` under pandas Copy-on-Write.
gf["plant_id_egrid"] = (
    gf["plant_id_eia"]
    .map(eia_to_egrid_id)
    .fillna(gf["plant_id_eia"])
    .astype(gf["plant_id_eia"].dtype)
)

# calculate an annual total for each plant
# (numeric columns only: report_date and the code columns cannot be summed)
gf_total = gf.groupby(["plant_id_egrid"]).sum(numeric_only=True).reset_index()

# choose a metric to compare
metric = "fuel_consumed_mmbtu"

# merge the annual EIA-923 data into the egrid data
# the `source` indicator records whether a plant is in eGRID, in EIA-923, or in both
egrid_eia_comparison = (
    egrid_plant[
        ["plant_id_egrid", "plant_name", "ba_code", "energy_source_code", metric]
    ]
    .merge(
        gf_total[["plant_id_egrid", metric]],
        how="outer",
        on="plant_id_egrid",
        suffixes=("_egrid", "_eia923"),
        indicator="source",
    )
    .round(0)
)
# treat a missing eGRID value as zero
egrid_eia_comparison[f"{metric}_egrid"] = egrid_eia_comparison[
    f"{metric}_egrid"
].fillna(0)
# calculate an absolute difference and percent difference between the two values
# (eGRID minus EIA-923, so negative values mean eGRID reports less than EIA-923)
egrid_eia_comparison["difference"] = (
    egrid_eia_comparison[f"{metric}_egrid"] - egrid_eia_comparison[f"{metric}_eia923"]
)
egrid_eia_comparison["percent_difference"] = (
    egrid_eia_comparison["difference"] / egrid_eia_comparison[f"{metric}_eia923"]
)
# where both values agree the percent difference is zero (this also covers 0 / 0)
egrid_eia_comparison.loc[
    egrid_eia_comparison["difference"] == 0, "percent_difference"
] = 0




In [None]:
# add cems data
cems_total = cems[['plant_id_eia', metric]].copy()
# Plants listed in the crosswalk get their eGRID id, all others keep their EIA id.
# Assigned in one step, because an in-place Series.update() on a selected column is a
# chained write that does not reach the frame under pandas Copy-on-Write.
cems_total["plant_id_egrid"] = (
    cems_total["plant_id_eia"]
    .map(eia_to_egrid_id)
    .fillna(cems_total["plant_id_eia"])
    .astype(cems_total["plant_id_eia"].dtype)
)
# total of the metric for each eGRID plant id
cems_total = cems_total.groupby('plant_id_egrid')[metric].sum().reset_index().rename(columns={metric:f"{metric}_cems"})

# merge cems data into egrid
# a column left over from an earlier run of this cell is dropped first, so that
# re-running the cell does not create suffixed duplicate columns
egrid_eia_comparison = egrid_eia_comparison.drop(columns=f"{metric}_cems", errors='ignore').merge(cems_total, how='outer', on='plant_id_egrid')

In [None]:
# display the combined eGRID / EIA-923 / CEMS comparison table
egrid_eia_comparison

In [None]:
# rows whose merge indicator is 'left_only'
egrid_eia_comparison[egrid_eia_comparison['source'] == 'left_only']

In [None]:
# egrid seems to be missing fuel consumption data for most nuclear power plants
missing_nuclear = egrid_eia_comparison[egrid_eia_comparison['energy_source_code'] == 'NUC']
# total the numeric columns only (the table also holds text and categorical columns)
missing_nuclear.sum(numeric_only=True)

In [None]:
# rows with a percent difference below -1%
egrid_eia_comparison[(egrid_eia_comparison['percent_difference']  < - 0.01)]

In [None]:
# where is egrid missing data?
# rows with a percent difference below -1%, excluding nuclear (NUC)
egrid_eia_comparison[(egrid_eia_comparison['percent_difference'] < -0.01) & (egrid_eia_comparison['energy_source_code'] != 'NUC')]#.sort_values(by='percent_difference').head(20)

In [None]:
# how much emissions does this account for?
# rows with a percent difference below -1%, excluding nuclear
egrid_value_is_lower = (egrid_eia_comparison['percent_difference'] < -0.01) & (egrid_eia_comparison['energy_source_code'] != 'NUC')

# group by fuel code
# (numeric columns only: the table also holds text columns and the categorical merge indicator)
missing_emissions = egrid_eia_comparison[egrid_value_is_lower].groupby('energy_source_code').sum(numeric_only=True).reset_index()

# get emission factors
emission_factors = load_data.load_ghg_emission_factors()[['energy_source_code', 'co2_lb_per_mmbtu']]
missing_emissions = missing_emissions.merge(emission_factors, how='left', on='energy_source_code')
# convert the fuel difference into co2 using the fuel-specific emission factor
missing_emissions['co2_mass_lb'] = missing_emissions['difference'] * missing_emissions['co2_lb_per_mmbtu']
missing_emissions.sum(numeric_only=True)

## Plant Metric

In [None]:
# columns that are compared between our calculations and eGRID
metric_columns = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb_adjusted','co2_mass_lb']

# standardize column names and index so that the two dfs can be divided
# (numeric_only=True keeps non-numeric columns such as ba_code out of the sum)
calculated_to_compare = plant_annual_total.groupby('plant_id_egrid').sum(numeric_only=True).drop(columns=['plant_id_eia'])

# drop the plants that have no data in eGRID
# the list holds eGRID plant ids, so the filter must also test plant_id_egrid
egrid_has_no_data = egrid_plant[metric_columns].sum(axis=1) == 0
plants_with_no_data_in_egrid = list(egrid_plant.loc[egrid_has_no_data, 'plant_id_egrid'])
egrid_plant = egrid_plant[~egrid_plant['plant_id_egrid'].isin(plants_with_no_data_in_egrid)]

egrid_to_compare = egrid_plant.set_index(['plant_id_egrid']).drop(columns=['ba_code','state','plant_name','plant_id_eia'])

# divide calculated value by egrid value
compared = calculated_to_compare.div(egrid_to_compare).merge(egrid_plant[['plant_id_egrid','plant_name','ba_code', 'state']], how='left', left_index=True, right_on='plant_id_egrid').set_index('plant_id_egrid')
compared['plant_name'] = compared['plant_name'].fillna('unknown')

# create a dataframe that merges the two sources of data together
compared_merged = calculated_to_compare.merge(egrid_to_compare, how='outer', on='plant_id_egrid', suffixes=('_calc','_egrid'))

# for each column, change missing values to zero if both values are zero (only nan b/c divide by zero)
for col in metric_columns:
    # identify plants with zero values for both; their ratio is set to 1 (exact match)
    both_zero = (compared_merged[f'{col}_calc'] == 0) & (compared_merged[f'{col}_egrid'] == 0)
    compared.loc[compared.index.isin(compared_merged.index[both_zero]), col] = 1

# for each column, categorize the data based on how far it is off from egrid
for col in metric_columns:
    # add a new column
    compared[f'{col}_status'] = pd.cut(x=compared[col],
                                       bins=[-999999999,0,0.5,0.9,0.99,0.9999,1,1.0001,1.01,1.1,1.5,999999999],
                                       labels=['negative','<50%','+/-50%','+/-10%','+/-1%','!exact','!exact','+/-1%','+/-10%','+/-50%','>50%'],
                                       ordered=False)
    # replace any missing values with missing
    # (astype(str) turns a missing category into the string 'nan')
    compared[f'{col}_status'] = compared[f'{col}_status'].astype(str).replace('nan','missing')
    compared.loc[(compared.index.isin(plants_not_in_egrid)),f'{col}_status'] = 'not_in_egrid'

# identify which plants are missing from egrid vs calculated values
for col in metric_columns:
    calc_value = compared_merged[f'{col}_calc']
    egrid_value = compared_merged[f'{col}_egrid']
    # applied in this order, so a later label overrides an earlier one
    status_overrides = [
        # plants that are missing in egrid
        ('missing_in_egrid', (calc_value > 0) & egrid_value.isna()),
        # plants that are missing from our calculations
        ('missing_in_calc', calc_value.isna() & (egrid_value > 0)),
        # our calculations are missing a zero value
        ('missing_zero_in_calc', calc_value.isna() & (egrid_value == 0)),
        # egrid has a missing value instead of a zero
        ('missing_zero_in_egrid', (calc_value == 0) & egrid_value.isna()),
        # egrid has a zero value where we have a positive value
        ('>50%', (calc_value > 0) & (egrid_value == 0)),
    ]
    for label, condition in status_overrides:
        plant_ids = list(compared_merged.index[condition])
        compared.loc[compared.index.isin(plant_ids), f'{col}_status'] = label

# create a dataframe that counts how many plants are in each category
comparison_count = []
for col in metric_columns:
    count = compared.groupby(f'{col}_status', dropna=False)['plant_name'].count().rename(col)
    count.index = count.index.rename('status')
    comparison_count.append(count)

comparison_count = pd.concat(comparison_count, axis=1).fillna(0).astype(int)
# append a row with the total number of plants per column
comparison_count = pd.concat([comparison_count, pd.DataFrame(comparison_count.sum().rename('Total')).T], axis=0)
comparison_count

In [None]:
# print the plant status counts as a markdown table
print(comparison_count.to_markdown())

## BA Metric

In [None]:
# eGRID BA totals are built by aggregating the eGRID plant table loaded earlier.
# (An earlier version read them from the BA sheet of the eGRID workbook,
# ../data/downloads/egrid/egrid{year}_data.xlsx, columns BAHTIANT / BANGENAN / BACO2AN.)

data_columns = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_adjusted']

# aggregate the plant data up to the BA level
# (the data columns are selected before summing so that text columns are never aggregated)
egrid_ba = egrid_plant.groupby(['ba_code'])[data_columns].sum().reset_index()
calculated_ba = plant_annual_total.groupby('ba_code', dropna=False)[data_columns].sum()

# divide our calculation by the BA totals from eGRID
# if there are 0 values, replace with 0.1, so that div by zero doesn't return missing value
ba_metric = calculated_ba.replace(0,0.1).div(egrid_ba.set_index('ba_code').replace(0,0.1)).sort_values(by='co2_mass_lb').round(3)

# ratio of the overall totals, shown as an extra 'Total' row
total = pd.DataFrame(plant_annual_total[data_columns].sum().div(egrid_ba[data_columns].sum()).rename('Total')).T

# calculate the difference in the number of plants in each region
plant_count = (plant_annual_total.groupby('ba_code', dropna=False)['plant_id_egrid'].count() - egrid_plant.groupby('ba_code', dropna=False)['plant_id_egrid'].count()).rename('num_plants')
ba_metric = ba_metric.merge(plant_count, how='left', left_index=True, right_index=True).sort_index()

ba_metric = pd.concat([ba_metric, total], axis=0).round(2)

ba_metric = ba_metric[data_columns + ['num_plants']]

columns_to_check = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb']

# show only the BAs where at least one of the checked ratios is not exactly 1
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ba_metric[~(ba_metric[columns_to_check] == 1).all(axis=1)])

In [None]:
# how much co2 comes from CEMS vs EIA
# The source label is read from 'hourly_data_source', the column the rest of this
# notebook uses on eia923_allocated.
# NOTE(review): this cell previously pivoted on 'data_source' — confirm that column no longer exists.
data_source_by_ba = pd.pivot_table(eia923_allocated, values='co2_mass_lb', index='ba_code', columns='hourly_data_source', dropna=False, aggfunc='sum').replace(0,0.001)
# convert the totals into each source's share of the BA total
data_source_by_ba = data_source_by_ba.div(data_source_by_ba.sum(axis=1), axis=0)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(data_source_by_ba.round(3).fillna(0).sort_values(by='cems'))

In [None]:
# print the BA metric table as a markdown table
print(ba_metric.to_markdown())

## Explore specific plants

### Notes

Net generation < 50%:
 - 2617 has negative net generation
 - 10444, 59395: some plants are hybrid fossil / solar plants, but the primary energy source code is getting listed as PV for certain generators, which is causing an issue

Net generation > 50%:
 - 335 has two extra units (CT1 and CT2) that report to CEMS but not EIA. Likely correct
 - 7288 reports DFO to CEMS, but NG to EIA - there's probably some generators not reporting
 - 60698 double counting generation from CEMS because of 90% heat threshold in a month - might be fixed once we tackle heat input
 - 58256 is a solar/hybrid and in generators_entity_eia the battery portion is associated with a separate generator id (should only be 1)


Other:
 - plant 3754 has heat input in cems and eia that don't match
 - 2401 has generation in both cems and eia
 - 50933 might have allocation issue (doesn't appear in CEMS)

To check
 - 1404 reports generation to CEMS in December, but not to EIA. This is likely correct
 - plant 2504 has three units (120, 121, 122) that don't appear in EIA, and in CEMS only report steam. 

Fuel > 50%
 - Plant 3116 reports much more heat input to CEMS during ozone months than to EIA

BA Totals
 - TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project belonging to TEPC


In [None]:
# examine specific plants in a category
# `value` is the compared metric and `status` is one of the labels stored in the
# matching `<value>_status` column of `compared`
value = 'fuel_consumed_mmbtu'
status = '>50%'

#compared_merged.loc[64877,:]

# show the plants with that status, ordered by the value column of `compared`
compared[compared[f'{value}_status'] == status].sort_values(by=value)

In [None]:
# look at a random sample of (at most) 10 plants in the selected category
plants_in_category = compared[compared[f'{value}_status'] == status]
# - the sample size is capped so the cell also works when fewer than 10 plants match
# - a fixed random_state makes the sample reproducible
# - sorting happens after sampling, because sampling shuffles the rows
plants_in_category.sample(n=min(10, len(plants_in_category)), random_state=0).sort_values(by=value)

In [None]:
# EIA plant id used by the cells below to inspect a single plant
plant_to_explore = 58380

In [None]:
# eGRID plant table rows for the selected plant
egrid_plant[egrid_plant['plant_id_eia'] == plant_to_explore]

In [None]:
# our calculated annual totals for the selected plant
plant_annual_total[plant_annual_total['plant_id_eia'] == plant_to_explore]

In [None]:
# column sums of the allocated EIA-923 records for the selected plant
eia923_allocated[eia923_allocated['plant_id_eia'] == plant_to_explore].sum()

In [None]:
# CEMS totals for each plant / unit / report date
# (numeric columns only: text columns cannot be meaningfully summed)
cems_unit_monthly = cems.groupby(['plant_id_eia','unitid','report_date']).sum(numeric_only=True).reset_index()


In [None]:
# column sums of the unit-level CEMS totals for the selected plant
cems_unit_monthly[cems_unit_monthly['plant_id_eia'] == plant_to_explore].sum()

In [None]:
# CEMS records for the selected plant with a positive operating time
cems[(cems['plant_id_eia'] == plant_to_explore) & (cems['operating_time_hours'] > 0)]

In [None]:
# annual CEMS totals for the selected plant
cems_plant_annual[cems_plant_annual['plant_id_eia'] == plant_to_explore]

In [None]:
# there are some plants that report heat input and co2 in CEMS, but are missing net generation data
# TODO: we should maybe try and fill net generation data using EIA-923?
# list the plants whose annual CEMS net generation total is exactly zero
cems_plant_annual[cems_plant_annual['net_generation_mwh'] == 0]

## Explore BA Matching
### Notes
TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project belonging to TEPC

In [None]:
# plants without a BA code in our totals, matched to eGRID plants without a BA code
calc_without_ba = plant_annual_total[plant_annual_total['ba_code'].isna()]
egrid_without_ba = egrid_plant[egrid_plant['ba_code'].isna()]
ba_plant_match = calc_without_ba.merge(
    egrid_without_ba, how='left', on='plant_id_egrid', suffixes=("_calc",'_egrid')
)

# show the plants whose net generation (rounded to whole MWh) differs between the two
calc_generation = ba_plant_match['net_generation_mwh_calc'].round(0)
egrid_generation = ba_plant_match['net_generation_mwh_egrid'].round(0)
ba_plant_match[calc_generation != egrid_generation]

In [None]:
# eGRID plants whose BA code is CPLE
egrid_plant[egrid_plant['ba_code'] == 'CPLE']

In [None]:
# plant-level table holding both the eGRID values (suffix _egrid) and our calculated
# values (suffix _calc); the outer merge keeps plants that appear in only one of the two
compare_plants_in_ba = egrid_plant.merge(plant_annual_total, how='outer', on='plant_id_egrid', suffixes=('_egrid','_calc'))

In [None]:
# BA and metric to inspect
ba = 'CPLE'
metric = 'fuel_consumed_mmbtu'

# plants assigned to the BA by either eGRID or our calculation ...
in_selected_ba = (compare_plants_in_ba['ba_code_egrid'] == ba) | (compare_plants_in_ba['ba_code_calc'] == ba)
# ... whose rounded metric value differs between the two sources
egrid_value = compare_plants_in_ba[f'{metric}_egrid'].round(0)
calc_value = compare_plants_in_ba[f'{metric}_calc'].round(0)

compare_plants_in_ba[in_selected_ba & (egrid_value != calc_value)]

In [None]:
# our calculated plant totals for plants whose BA code is AMPL
plant_annual_total[plant_annual_total['ba_code'] == 'AMPL']