In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to library code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import pudl
import sqlalchemy as sa
import logging
import sys
import pathlib
from pudl.analysis.allocate_net_gen import *

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
# Render figures inline in the notebook output.
%matplotlib inline
# Dark theme and a large default canvas for every figure in this notebook.
mpl.style.use('dark_background')
plt.rcParams["figure.figsize"] = (15,10)

In [None]:
# Send root-logger output (INFO and above) to stdout as bare messages so that
# log lines show up directly underneath the cell that produced them.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

formatter = logging.Formatter("%(message)s")
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)
# Replace (rather than append to) the handler list so that re-running this
# cell does not stack up handlers and duplicate every message.
logger.handlers = [handler]

# Never truncate columns when displaying wide dataframes.
pd.set_option("display.max_columns", None)

In [None]:
from pudl.workspace.setup import PudlPaths

# SQLAlchemy engine pointed at the database location reported by PudlPaths().pudl_db.
pudl_engine = sa.create_engine(PudlPaths().pudl_db)

### Explore Solutions

In [None]:
# Output-table accessor used by every cell below; data is limited to report
# dates up to 2019-12-31.
# NOTE(review): freq='AS' presumably requests annual (year-start) aggregation,
# which would match the "year" axis labels used in the plots — confirm against PudlTabl.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine,freq='AS', end_date="2019-12-31")

In [None]:
%%time
# Build the allocated table and time the call (the %%time magic has to stay on
# the first line of the cell).
# NOTE(review): update=True presumably forces a fresh computation instead of
# reusing a cached result — confirm against PudlTabl.
gen_allocated = pudl_out.gen_allocated_eia923(update=True)

In [None]:
# Compare total fuel consumed per report_date in the fuel table against the
# allocated ("reassigned") table; the two lines should sit on top of each other.
#
# The column is selected *before* .sum() so only the plotted column is
# aggregated. Summing the whole frame first is wasted work and, on pandas >= 2.0
# (where numeric_only defaults to False), also attempts to aggregate any
# non-numeric columns the tables carry.
plt.plot(
    pudl_out.gf_eia923().groupby(by='report_date')['fuel_consumed_mmbtu'].sum(),
    label='Fuel Table', linewidth=6, color='turquoise'
)
plt.plot(
    pudl_out.gen_allocated_eia923().groupby(by='report_date')['fuel_consumed_mmbtu'].sum(),
    label='Reassigned', linewidth=6, color='deeppink'
)

plt.legend()
plt.ylabel("Total Fuel Consumed (mmBTU)")
plt.xlabel("year")
plt.title("Reassigned fuel consumed v fuel table", size=14)
plt.show()

In [None]:
# NOTE(review): this cell repeats the fuel-consumed comparison plotted in the
# cell directly above it — consider dropping one of the two.
#
# Total fuel consumed per report_date: fuel table vs. allocated ("reassigned")
# table. The column is selected *before* .sum() so only the plotted column is
# aggregated; summing the whole frame first is wasted work and, on
# pandas >= 2.0 (numeric_only defaults to False), also attempts to aggregate
# any non-numeric columns the tables carry.
plt.plot(
    pudl_out.gf_eia923().groupby(by='report_date')['fuel_consumed_mmbtu'].sum(),
    label='Fuel Table', linewidth=6, color='turquoise'
)
plt.plot(
    pudl_out.gen_allocated_eia923().groupby(by='report_date')['fuel_consumed_mmbtu'].sum(),
    label='Reassigned', linewidth=6, color='deeppink'
)

plt.legend()
plt.ylabel("Total Fuel Consumed (mmBTU)")
plt.xlabel("year")
plt.title("Reassigned fuel consumed v fuel table", size=14)
plt.show()

In [None]:
# Compare total net generation per report_date in the fuel table against the
# allocated ("reassigned") table. dropna=False keeps records with a missing
# report_date as their own group instead of silently discarding them.
#
# The column is selected *before* .sum() so only the plotted column is
# aggregated. Summing the whole frame first is wasted work and, on pandas >= 2.0
# (where numeric_only defaults to False), also attempts to aggregate any
# non-numeric columns the tables carry.
plt.plot(
    pudl_out.gf_eia923()
    .groupby(by='report_date', dropna=False)['net_generation_mwh'].sum(),
    label='Fuel Table Net Gen', linewidth=6, color='turquoise'
)
plt.plot(
    pudl_out.gen_allocated_eia923()
    .groupby(by='report_date', dropna=False)['net_generation_mwh'].sum(),
    label='Reassigned Net Gen', linewidth=6, color='deeppink'
)

plt.legend()
plt.ylabel("Total Net Gen (MWh)")
plt.xlabel("year")
plt.title("Reassigned Net gen v fuel table", size=14)
plt.show()

In [None]:
# NOTE(review): this cell repeats the net-generation comparison plotted in the
# cell directly above it — consider dropping one of the two.
#
# Total net generation per report_date: fuel table vs. allocated ("reassigned")
# table. dropna=False keeps records with a missing report_date as their own
# group. The column is selected *before* .sum() so only the plotted column is
# aggregated; summing the whole frame first is wasted work and, on
# pandas >= 2.0 (numeric_only defaults to False), also attempts to aggregate
# any non-numeric columns the tables carry.
plt.plot(
    pudl_out.gf_eia923()
    .groupby(by='report_date', dropna=False)['net_generation_mwh'].sum(),
    label='Fuel Table Net Gen', linewidth=6, color='turquoise'
)
plt.plot(
    pudl_out.gen_allocated_eia923()
    .groupby(by='report_date', dropna=False)['net_generation_mwh'].sum(),
    label='Reassigned Net Gen', linewidth=6, color='deeppink'
)

plt.legend()
plt.ylabel("Total Net Gen (MWh)")
plt.xlabel("year")
plt.title("Reassigned Net gen v fuel table", size=14)
plt.show()

### Explore Problem

In [None]:
# Pull the input tables once so the cells below can reuse them.
# EIA-860 tables: `gens` (generators) and `own`.
gens, own = pudl_out.gens_eia860(), pudl_out.own_eia860()
# EIA-923 tables: `gen` (the "gen table") and `gf` (the "gen fuel table").
gen, gf = pudl_out.gen_eia923(), pudl_out.gf_eia923()

In [None]:
# Share of the gen fuel table's net generation that is also present in the gen table.
net_gen_diff_val = gen.net_generation_mwh.sum() / gf.net_generation_mwh.sum()
# net_gen_diff_val is the *reported* share (gen / gf), so the share that is
# missing from the gen table is its complement.
logger.info(f"{1 - net_gen_diff_val:.02%} net generation missing from gen table")

idx_cols_plant = ['plant_id_eia','report_date']
net_gen_diff = (
    # Plant/date net generation totals from both tables, side by side
    # (columns net_generation_mwh_gen and net_generation_mwh_gf).
    pd.merge(
        gen.groupby(by=idx_cols_plant)[['net_generation_mwh']].sum(),
        gf.groupby(by=idx_cols_plant)[['net_generation_mwh']].sum(),
        right_index=True,
        left_index=True,
        suffixes=('_gen','_gf'),
        how='outer'
    )
    .assign(
        # Absolute gap (gf - gen) and ratio (gen / gf); a total that is absent
        # from one table counts as 0. A zero gf total therefore yields an
        # inf/NaN ratio.
        net_gen_diff_num=lambda x: (
            x.net_generation_mwh_gf.fillna(0) - x.net_generation_mwh_gen.fillna(0)
        ),
        net_gen_diff_rate=lambda x: (
            x.net_generation_mwh_gen.fillna(0) / x.net_generation_mwh_gf.fillna(0)
        ),
    )
    # Number of distinct generators / prime movers / fuel types per plant and date.
    .merge(
        gens.groupby(by=idx_cols_plant)
        [['generator_id','prime_mover_code', 'fuel_type_code_pudl']]
        .nunique()
        .add_suffix('_count'),
        right_index=True,
        left_index=True,
        how='outer'
    )
    .reset_index()
)

# Records with a real discrepancy: a non-null, nonzero gap whose ratio is
# neither ~1 (tables agree) nor ~0 in magnitude.
diff_mask = (net_gen_diff.net_gen_diff_num.notnull()
    & (net_gen_diff.net_gen_diff_num != 0)
    & (~np.isclose(abs(net_gen_diff.net_gen_diff_rate), 1))
    & (~np.isclose(abs(net_gen_diff.net_gen_diff_rate), 0)))

# Record counts with a negative gap, a non-negative gap, and no gap value at all.
logger.info(len(net_gen_diff[net_gen_diff.net_gen_diff_num < 0]))
logger.info(len(net_gen_diff[net_gen_diff.net_gen_diff_num >= 0]))
logger.info(len(net_gen_diff[net_gen_diff.net_gen_diff_num.isnull()]))

diff_w_many_pms = net_gen_diff[(net_gen_diff.prime_mover_code_count > 1) & diff_mask]
diff_w_one_pm = net_gen_diff[(net_gen_diff.prime_mover_code_count == 1) & diff_mask]
logger.info(
    f"{len(diff_w_many_pms)/len(net_gen_diff[diff_mask]):.02%} of records w/ gen v gen_fuel diff that have >1 prime mover")
logger.info(
    f"{len(net_gen_diff[net_gen_diff.net_gen_diff_rate > 1.001])/len(net_gen_diff):.02%} of records that have more net gen in net gen table"
)

In [None]:
# Per-report_date totals over every record, and over only those records that
# have a net generation value from the gen table.
diff_totals = net_gen_diff.groupby(by='report_date', dropna=False).sum()
has_gen_value = net_gen_diff.net_generation_mwh_gen.notnull()
diff_totals_co_reported = (
    net_gen_diff[has_gen_value]
    .groupby(by='report_date', dropna=False)
    .sum()
)

# Both lines are normalized by the gen fuel table total of *all* records.
gf_total = diff_totals.net_generation_mwh_gf
plt.plot(
    diff_totals.net_gen_diff_num / gf_total,
    label='all generators', linewidth=6, color='turquoise'
)
plt.plot(
    diff_totals_co_reported.net_gen_diff_num / gf_total,
    label='only co-reported*', linewidth=6, color='deeppink'
)

plt.legend()
plt.ylabel("Normalized Net Gen Difference")
plt.xlabel("*this does not include the plants that don't report to gen at all")
plt.title("Net gen from gen v gen fuel table", size=14)
plt.show()

In [None]:
# Overlay one histogram per report year (2009-2014) of the gen / gen-fuel net
# generation ratio, restricted to records flagged by diff_mask and weighted by
# the magnitude of the gen fuel table's net generation.
report_years = net_gen_diff.report_date.dt.year
for year in range(2009, 2015):
    records_in_year = net_gen_diff[diff_mask & (report_years == year)]
    plt.hist(
        records_in_year.net_gen_diff_rate,
        weights=records_in_year.net_generation_mwh_gf.abs(),
        bins=150,
        range=(-.25, 1.5),
        label=year,
    )
plt.title("Pre-2015 ratio of net gen from gen table vs gen fuel table", size=16)
plt.legend()
plt.xlabel(
    "ratio\n"
    "note: All records that had nearly identical net generation has been removed"
)
plt.show()

In [None]:
# Overlay one histogram per report year after 2014 of the gen / gen-fuel net
# generation ratio, restricted to records flagged by diff_mask and weighted by
# the magnitude of the gen fuel table's net generation.
#
# The final year is taken from the data rather than hard-coded: the previous
# range(2015, 2019) stopped at 2018 and so left the last loaded year (the data
# runs through 2019-12-31) out of a plot titled "Post-2014".
last_year = int(net_gen_diff.report_date.dt.year.max())
for year in range(2015, last_year + 1):
    df = net_gen_diff[diff_mask & (net_gen_diff.report_date.dt.year == year)]
    plt.hist(
        df.net_gen_diff_rate,
        bins=150,
        range=(-.25,1.5),
        label=year,
        weights=abs(df.net_generation_mwh_gf)
    )
plt.title("Post-2014 ratio of net gen from gen table vs gen fuel table", size=16)
plt.legend()
plt.xlabel("""ratio
note: All records that had nearly identical net generation has been removed""")
plt.show()

In [None]:
# Zoom in on the "bad" records — those where the gen table reports more net
# generation than the gen fuel table (ratio between 1.001 and 2) — with one
# histogram per report year, weighted by the magnitude of the gen fuel table's
# net generation.
#
# The final year is taken from the data rather than hard-coded: the previous
# range(2009, 2019) stopped at 2018 and so left out the last loaded year (the
# data runs through 2019-12-31).
last_year = int(net_gen_diff.report_date.dt.year.max())
for year in range(2009, last_year + 1):
    df = net_gen_diff[diff_mask & (net_gen_diff.report_date.dt.year == year)]
    plt.hist(
        df.net_gen_diff_rate,
        bins=200,
        range=(1.001,2),
        label=year,
        weights=abs(df.net_generation_mwh_gf)
    )
plt.title("Bad Ratios of net gen from gen table vs gen fuel table", size=16)
plt.legend()
plt.xlabel("""ratio
note: All records that had nearly identical net generation has been removed""")
plt.show()

In [None]:
# Line up the generators listed in `gens` with the generator records in `gen`;
# the indicator column (_merge) records which side each row came from.
idx_cols_gen = [*idx_cols_plant, 'generator_id']
gens_to_gen = gen[idx_cols_gen].merge(
    gens[[*idx_cols_gen, 'prime_mover_code']],
    on=idx_cols_gen,
    how='outer',
    indicator=True,
)

# Generators that appear only in `gens`, i.e. are missing from the gen table:
# count the distinct prime movers among them for each plant and report date.
only_in_gens = gens_to_gen[gens_to_gen['_merge'] == 'right_only']
pms_missing = only_in_gens.groupby(by=idx_cols_plant)[['prime_mover_code']].nunique()

# Share of those plant/date groups whose missing generators all share one prime mover.
single_prime_mover = pms_missing['prime_mover_code'] == 1
pm_perc = len(pms_missing[single_prime_mover]) / len(pms_missing)

logger.info("Portion of plants for which the generators missing "
            f"from the gen table are all only one prime_move_code: {pm_perc:.02%}")

In [None]:
# NOTE(review): `pre` is not used by any cell visible here; kept in case a
# later cell reads it.
pre = "> 2014"

# Ratio of gen-table to gen-fuel-table net generation (gen / gf), computed
# separately for report years up to 2014 and for report years after 2014.
net_gen_diff_val_pre2014 = (
    gen[gen.report_date.dt.year <= 2014].net_generation_mwh.sum()
    / gf[gf.report_date.dt.year <= 2014].net_generation_mwh.sum()
)

net_gen_diff_val_post2014 = (
    gen[gen.report_date.dt.year > 2014].net_generation_mwh.sum()
    / gf[gf.report_date.dt.year > 2014].net_generation_mwh.sum()
)

# The variables above hold the *reported* share; the share missing from the
# gen table is the complement, which is what the messages describe.
logger.info(f"Missing net generation from gen table pre-2015: {1 - net_gen_diff_val_pre2014:.02%}")
logger.info(f"Missing net generation from gen table post-2014: {1 - net_gen_diff_val_post2014:.02%}")