In [None]:
# Reload imported modules before each cell runs, so edits to project code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import pudl
import sqlalchemy as sa
import logging
import sys
import pathlib

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
# Render figures inline in the cell output.
%matplotlib inline
# Apply matplotlib's 'dark_background' style sheet to every figure below.
mpl.style.use('dark_background')

In [None]:
# Emit log records on stdout (so they land in the cell output) as bare
# messages, without level or timestamp prefixes.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Configure the root logger. The handler list is assigned outright (rather
# than extended with addHandler) so re-running this cell does not stack up
# duplicate handlers.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]

In [None]:
# Load the default PUDL workspace settings and build a SQLAlchemy engine from
# the value stored under the "pudl_db" key (presumably the database URL).
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings["pudl_db"])

### Explore Solutions

In [None]:
# Output-table helper used by every cell below.
# NOTE(review): freq='AS' presumably requests annual (year-start) aggregation
# and end_date caps the data at the end of 2019 -- confirm against PudlTabl.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine,freq='AS', end_date="2019-12-31")

In [None]:
def _stack_generators(pudl_out, idx_gens, cols_to_stack, cat_col='energy_source_code_num', stacked_col='fuel_type'):
    """
    Stack the generator table with a set of columns.
    
    Args:
        pudl_out
        idx_gens (iterable): list of columns. index to stack based on
        cols_to_stack (iterable): list of columns to stack
        cat_col (string): name of category column which will end up having the column names of cols_to_stack
        stacked_col (string): name of column which will end up with the stacked data from cols_to_stack

    Returns:
        pandas.DataFrame: a dataframe with these columns: idx_gens, cat_col, stacked_col
    """
    gens =pudl_out.gens_eia860()
    gens_stack_prep = (
        pd.DataFrame(gens.set_index(idx_gens)[cols_to_stack].stack(level=0))
        .reset_index()
        .rename(columns={'level_3': cat_col, 0: stacked_col})
    )
    # merge the stacked df back onto the gens table
    # we first drop the cols_to_stack so we don't duplicate data
    gens_stack = pd.merge(
        gens.drop(columns=cols_to_stack),
        gens_stack_prep,
        how='outer'
    )
    return gens_stack

def associate_gen_tables(pudl_out, idx_pm_fuel):
    """
    Merge the stacked generators with the generation and generation_fuel tables.

    Args:
        pudl_out: object providing ``gens_eia860()``, ``gen_eia923()`` and
            ``gf_eia923()``.
        idx_pm_fuel (list): columns used both to aggregate the
            generation_fuel table and to merge it onto the generators.

    Returns:
        pandas.DataFrame: outer merge of the three tables. Columns present on
        both sides of the final merge carry a ``_gen`` suffix (generator side)
        or a ``_gf`` suffix (generation_fuel side).
    """
    generator_keys = ['plant_id_eia','generator_id', 'report_date']
    energy_source_cols = [f'energy_source_code_{n}' for n in range(1, 7)]

    # one row per generator and energy source code
    stacked_gens = _stack_generators(
        pudl_out,
        idx_gens=generator_keys,
        cols_to_stack=energy_source_cols,
        cat_col='energy_source_code_num',
        stacked_col='fuel_type',
    )
    # attach the generator-level generation records
    gens_w_generation = pd.merge(
        stacked_gens,
        pudl_out.gen_eia923(),
        on=generator_keys,
        how='outer',
    )
    # generation_fuel totals for each idx_pm_fuel group
    gf_totals = (
        pudl_out.gf_eia923()
        .groupby(by=idx_pm_fuel)
        .sum(min_count=1)
        .reset_index()
    )
    return gens_w_generation.merge(
        gf_totals,
        on=idx_pm_fuel,
        suffixes=('_gen', '_gf'),
        how='outer',
    )

def _associate_unconnected_records(eia_generators_merged):
    """
    Associate unassocaited gen_fuel table records on 
    """
    idx_pm = ['plant_id_eia', 'prime_mover_code', 'energy_source_code_num', 'report_date',]
    # we're going to only associate these unconnected fuel records w/
    # the primary fuel so we don't have to deal w/ double counting
    connected_mask = eia_generators_merged.generator_id.notnull()
    eia_generators_connected = (
        eia_generators_merged[connected_mask]
    )
    eia_generators_unconnected = (
        eia_generators_merged[~connected_mask]
        .dropna(axis='columns', how='all')
        .rename(columns={'fuel_type': 'fuel_type_unconnected'})
        .assign(energy_source_code_num='fuel_type')
        .groupby(by=idx_pm).sum(min_count=1)
        .reset_index()
    )

    eia_generators = (
        pd.merge(
            eia_generators_connected,
            eia_generators_unconnected,
            on=idx_pm,
            suffixes=('', '_unconnected'),
            how='outer'
        )
        .assign(net_generation_mwh_gf=lambda x: x.net_generation_mwh_gf.fillna(0) + x.net_generation_mwh_gf_unconnected.fillna(0),
                fuel_consumed_mmbtu=lambda x: x.fuel_consumed_mmbtu.fillna(
                    0) + x.fuel_consumed_mmbtu_unconnected.fillna(0)
                )
    )
    return eia_generators

def _test_generator_output(eia_generators, pudl_out, idx_pm_fuel):
    """
    Add debugging columns to the allocated table and log summary ratios.

    This is just for testing/debugging.

    Args:
        eia_generators (pandas.DataFrame): allocated generator table with at
            least the ``idx_pm_fuel`` columns, ``net_generation_mwh``,
            ``net_generation_mwh_gf``, ``net_generation_mwh_gen``,
            ``capacity_mw`` and ``fuel_consumed_mmbtu``.
        pudl_out: object providing ``gf_eia923()`` and ``gen_eia923()``.
        idx_pm_fuel (list): columns defining the allocation groups.

    Returns:
        pandas.DataFrame: ``eia_generators`` with ``net_generation_mwh_test``
        (allocated net generation summed per group) and
        ``net_generation_mwh_diff`` (rounded generation_fuel value minus the
        rounded group sum) added.
    """
    eia_generators = (
        pd.merge(
            eia_generators,
            eia_generators.groupby(by=idx_pm_fuel)
            [['net_generation_mwh']].sum().add_suffix('_test').reset_index(),
            on=idx_pm_fuel,
            how='outer'
        )
        .assign(net_generation_mwh_diff=lambda x:
                x.net_generation_mwh_gf.round() - x.net_generation_mwh_test.round()))
    no_cap_gen = eia_generators[
        (eia_generators.capacity_mw.isnull())
        & (eia_generators.net_generation_mwh_gen.isnull())
    ]
    if len(no_cap_gen) > 15:
        logger.info(
            f'Warning: Many records have no capacity or net gen ({len(no_cap_gen)})')
    gen_fuel = pudl_out.gf_eia923()
    gen = pudl_out.gen_eia923()
    # remove the junk/corrective plants
    # The ID is compared numerically so the filter works whether plant_id_eia
    # is stored as an integer or as a string; comparing against the string
    # '99999' silently filters nothing when the column is numeric.
    is_junk_plant = (
        pd.to_numeric(gen_fuel.plant_id_eia, errors='coerce')
        .eq(99999)
        .fillna(False)
        .astype(bool)
    )
    gen_fuel_real = gen_fuel[~is_junk_plant]
    fuel_net_gen = gen_fuel_real.net_generation_mwh.sum()
    fuel_consumed = gen_fuel_real.fuel_consumed_mmbtu.sum()
    # NOTE: the values logged below are ratios against the fuel table totals
    # (100% means a perfect match), even though the labels say "diff".
    logger.info(
        "gen v fuel table net gen diff:      "
        f"{(gen.net_generation_mwh.sum())/fuel_net_gen:.1%}")
    logger.info(
        "new v fuel table net gen diff:      "
        f"{(eia_generators.net_generation_mwh.sum())/fuel_net_gen:.1%}")
    logger.info(
        "new v fuel table fuel (mmbtu) diff: "
        f"{(eia_generators.fuel_consumed_mmbtu.sum())/fuel_consumed:.1%}")
    return eia_generators

def allocate_gen_fuel_by_gens(pudl_out):
    """
    Allocate generation_fuel net generation and fuel consumption to generators.

    Two main steps here:
     * associated gen_fuel data w/ generators
     * allocate gen_fuel data proportionally

    Within each plant / prime mover / fuel type / report date group, a
    generator's share (``gen_ratio``) is its share of the group's net
    generation from the generation table when that ratio can be computed, and
    its share of the group's capacity otherwise.

    Args:
        pudl_out: object providing ``gens_eia860()``, ``gen_eia923()`` and
            ``gf_eia923()``.

    Returns:
        pandas.DataFrame: one record per generator and energy source code with
        the allocated ``net_generation_mwh`` and ``fuel_consumed_mmbtu``, the
        unallocated ``*_gf`` values, the intermediate ratio columns and the
        debugging columns added by ``_test_generator_output``.
    """
    idx_pm_fuel = ['plant_id_eia','prime_mover_code', 'fuel_type', 'report_date']
    gens_asst = associate_gen_tables(pudl_out, idx_pm_fuel)
    # get the total values for the merge group
    eia_generators = (
        pd.merge(
            gens_asst,
            gens_asst.groupby(by=idx_pm_fuel)
            [['capacity_mw', 'net_generation_mwh_gen']].sum(min_count=1)
            .add_suffix('_pm_fuel_total')
            .reset_index(),
            on=idx_pm_fuel,
        )
        .pipe(_associate_unconnected_records)
    )

    # do the allocating-ing!
    eia_generators = (
        eia_generators.assign(
            # we could condense these remaining cols into one... but let's keep it for debuging for now
            gen_ratio_net_gen=lambda x: (
                x.net_generation_mwh_gen / x.net_generation_mwh_gen_pm_fuel_total
            ),
            gen_ratio_cap=lambda x: x.capacity_mw / x.capacity_mw_pm_fuel_total,
            # Use the net generation ratio wherever it exists and fall back
            # to the capacity ratio otherwise. The previous condition,
            # `notnull() | ratio != 0`, parsed as `(notnull() | ratio) != 0`
            # because `|` binds tighter than `!=`. A ratio of exactly 0 is
            # kept: replacing it with a capacity share would push the group's
            # shares above 1.
            gen_ratio=lambda x: np.where(
                x.gen_ratio_net_gen.notnull(),
                x.gen_ratio_net_gen,
                x.gen_ratio_cap,
            ),
            net_generation_mwh=lambda x: x.net_generation_mwh_gf * x.gen_ratio,
            # keep the unallocated total before it is overwritten on the next line
            fuel_consumed_mmbtu_gf=lambda x: x.fuel_consumed_mmbtu,
            fuel_consumed_mmbtu=lambda x: x.fuel_consumed_mmbtu * x.gen_ratio
        )
    )

    eia_generators = _test_generator_output(eia_generators, pudl_out, idx_pm_fuel)
    return eia_generators

In [None]:
# Run the association + allocation pipeline defined above; the summary ratios
# from _test_generator_output are logged as a side effect.
eia_generators = allocate_gen_fuel_by_gens(pudl_out)

In [None]:
# Annual fuel consumption: generation_fuel table v the allocated generators.
# Only the plotted column is aggregated, rather than summing every column of
# both tables and then discarding all but one.
plt.plot(
    pudl_out.gf_eia923()
    .groupby(by='report_date')['fuel_consumed_mmbtu'].sum(),
    label='Fuel Table'
)
plt.plot(
    eia_generators
    .groupby(by='report_date')['fuel_consumed_mmbtu'].sum(),
    label='Reassigned',
)

plt.legend()
plt.ylabel("Total Fuel Consumed (mmBTU)")
plt.xlabel("year")
plt.title("Reassigned fuel consumed v fuel table", size=14)
plt.show()

In [None]:
# Annual net generation: generation_fuel table v the allocated generators.
# Only the plotted column is aggregated, rather than summing every column of
# both tables and then discarding all but one.
plt.plot(
    pudl_out.gf_eia923()
    .groupby(by='report_date', dropna=False)['net_generation_mwh'].sum(),
    label='Fuel Table Net Gen'
)
plt.plot(
    eia_generators
    .groupby(by='report_date', dropna=False)['net_generation_mwh'].sum(),
    label='Reassigned Net Gen',
)

plt.legend()
plt.ylabel("Total Net Gen (MWh)")
plt.xlabel("year")
plt.title("Reassigned Net gen v fuel table", size=14)
plt.show()

### Explore Problem

In [None]:
# Pull the output tables used throughout the problem exploration below.
gens = pudl_out.gens_eia860()  # generator attributes
own = pudl_out.own_eia860()  # NOTE(review): presumably generator ownership; not used in the cells shown here
gen = pudl_out.gen_eia923()  # net generation reported per generator
gf = pudl_out.gf_eia923()  # net generation / fuel reported in the generation_fuel table

In [None]:
# Overall ratio of the net generation in the generation table to the net
# generation in the generation_fuel table.
# NOTE(review): the message says "Missing", but the value logged is the
# gen / gf ratio (the share that is present), not 1 - ratio -- confirm intent.
net_gen_diff_val = gen.net_generation_mwh.sum() / gf.net_generation_mwh.sum()
logger.info(f"Missing net generation from gen table: {net_gen_diff_val:.02%}")

idx_cols_plant = ['plant_id_eia','report_date']
# Plant / report_date level comparison of the two tables, plus per-plant
# counts of distinct generators, prime movers and fuel types.
net_gen_diff = (
    pd.merge(
        gen.groupby(by=idx_cols_plant).agg({'net_generation_mwh':'sum'}),
        gf.groupby(by=idx_cols_plant)[['net_generation_mwh']].sum(),
        right_index=True,
        left_index=True,
        suffixes=('_gen','_gf'),
        how='outer'
    )
    .assign(
        # fillna(0) treats a plant missing from one table as reporting 0 there
        net_gen_diff_num=lambda x: x.net_generation_mwh_gf.fillna(0) - x.net_generation_mwh_gen.fillna(0),
        # gen / gf ratio; a gf total of 0 yields inf or NaN here
        net_gen_diff_rate=lambda x: x.net_generation_mwh_gen.fillna(0)/ x.net_generation_mwh_gf.fillna(0)
    )
    .merge(
        gens.groupby(by=idx_cols_plant)
        [['generator_id','prime_mover_code', 'fuel_type_code_pudl']]
        .nunique()
        .add_suffix('_count'),
        right_index=True,
        left_index=True,
        how='outer'
    )
    .reset_index()
)

# Records where the two tables disagree: a non-zero difference whose ratio is
# close to neither 1 (same value in both) nor 0 (gen reports nothing).
diff_mask = (net_gen_diff.net_gen_diff_num.notnull()
    & (net_gen_diff.net_gen_diff_num != 0)
    & (~np.isclose(abs(net_gen_diff.net_gen_diff_rate), 1))
    & (~np.isclose(abs(net_gen_diff.net_gen_diff_rate), 0)))

# Record counts: gen > gf, gen <= gf, and no difference computed.
logger.info(len(net_gen_diff[net_gen_diff.net_gen_diff_num < 0]))
logger.info(len(net_gen_diff[net_gen_diff.net_gen_diff_num >= 0]))
logger.info(len(net_gen_diff[net_gen_diff.net_gen_diff_num.isnull()]))

diff_w_many_pms = net_gen_diff[(net_gen_diff.prime_mover_code_count > 1) & diff_mask]
diff_w_one_pm = net_gen_diff[(net_gen_diff.prime_mover_code_count == 1) & diff_mask]
# NOTE(review): raises ZeroDivisionError if no record passes diff_mask.
logger.info(
    f"% of records w/ gen v gen_fuel diff that have >1 prime mover: {len(diff_w_many_pms)/len(net_gen_diff[diff_mask]):.02%}")

In [None]:
# Total generation_fuel net generation per report_date. Both series below
# are normalized by it, so it is computed once (and only for the column that
# is needed) instead of re-summing every column of net_gen_diff for each use.
net_gen_gf_by_year = (
    net_gen_diff.groupby(by='report_date', dropna=False)['net_generation_mwh_gf'].sum()
)

# difference over all plants
plt.plot(
    net_gen_diff.groupby(by='report_date', dropna=False)['net_gen_diff_num'].sum()
    / net_gen_gf_by_year,
    label='evvvveerrything'
)
# difference over plants that have a net generation value from the gen table
plt.plot(
    net_gen_diff[net_gen_diff.net_generation_mwh_gen.notnull()]
    .groupby(by='report_date', dropna=False)['net_gen_diff_num'].sum()
    / net_gen_gf_by_year,
    label='only co-reported*'
)

plt.legend()
plt.ylabel("Normalized Net Gen Difference")
plt.xlabel("*this does not include the plants that don't report to gen at all")
plt.title("Net gen from gen v gen fuel table", size=14)
plt.show()

In [None]:
# Overlay one histogram per report year (2009-2014) of the gen / gf net
# generation ratio, restricted to the records selected by diff_mask.
# Each record counts once here (the weights argument is commented out).
for year in range(2009,2015):
    df = net_gen_diff[diff_mask & (net_gen_diff.report_date.dt.year == year)]
    plt.hist(
        df.net_gen_diff_rate,
        bins=75,
        range=(-.5,1.5),  # ratios outside this window are not drawn
        label=year, #stacked=True,
        #weights=abs(df.net_generation_mwh_gf)
    )
plt.title("Pre-2015 ratio of net gen from gen table vs gen fuel table", size=16)
plt.legend()
plt.xlabel("""ratio
note: All records that had nearly identical net generation has been removed""")
plt.show()

In [None]:
# Overlay one histogram per report year of the gen / gf net generation ratio,
# restricted to the records selected by diff_mask. Each record is weighted by
# the absolute generation_fuel net generation, so large plants dominate.
# NOTE(review): range(2015,2019) stops at 2018, while pudl_out was built with
# end_date="2019-12-31" -- confirm whether 2019 is meant to be left out.
for year in range(2015,2019):
    df = net_gen_diff[diff_mask & (net_gen_diff.report_date.dt.year == year)]
    plt.hist(
        df.net_gen_diff_rate,
        bins=75,
        range=(-.5,1.5),  # ratios outside this window are not drawn
        label=year, #stacked=True,
        weights=abs(df.net_generation_mwh_gf)
    )
plt.title("Post-2014 ratio of net gen from gen table vs gen fuel table", size=16)
plt.legend()
plt.xlabel("""ratio
note: All records that had nearly identical net generation has been removed""")
plt.show()

In [None]:
# Match generation-table records to the generators table per generator; the
# _merge indicator column records which side each row came from.
idx_cols_gen = idx_cols_plant + ['generator_id']
gens_to_gen = pd.merge(
    gen[idx_cols_gen],
    gens[idx_cols_gen + ['prime_mover_code']],
    right_on=idx_cols_gen,
    left_on=idx_cols_gen,
    how='outer',
    indicator=True
)

# Generators found only in the generators table ('right_only'), i.e. missing
# from the generation table: count distinct prime movers per plant and date.
pms_missing = (gens_to_gen[gens_to_gen._merge == 'right_only']
               .groupby(by=idx_cols_plant)[['prime_mover_code']].nunique())
# Share of those plant/date groups whose missing generators all have the same
# prime mover.
# NOTE(review): raises ZeroDivisionError if pms_missing is empty.
pm_perc = len(pms_missing[pms_missing.prime_mover_code == 1])/len(pms_missing)

logger.info("Portion of plants for which the generators missing "
            f"from the gen table are all only one prime_move_code: {pm_perc:.02%}")

In [None]:
# Ratio of gen-table to generation_fuel-table net generation, split into
# report years through 2014 and report years after 2014.
# NOTE(review): `pre` is assigned but not used in the cells shown here.
pre= "> 2014"
net_gen_diff_val_pre2014 = (
    gen[gen.report_date.dt.year <= 2014].net_generation_mwh.sum() 
    / gf[gf.report_date.dt.year <= 2014].net_generation_mwh.sum()
)

net_gen_diff_val_post2014 = (
    gen[gen.report_date.dt.year >2014].net_generation_mwh.sum() 
    / gf[gf.report_date.dt.year >2014].net_generation_mwh.sum()
)
# NOTE(review): the messages say "Missing", but the values logged are the
# gen / gf ratios (the share that is present) -- confirm intent.
logger.info(f"Missing net generation from gen table pre-2015: {net_gen_diff_val_pre2014:.02%}")
logger.info(f"Missing net generation from gen table post-2014: {net_gen_diff_val_post2014:.02%}")