# FERC <> EIA Granular Connections

Notes on the type of problem we are trying to solve:
- A classification problem
    - A Multi-Class Classification problem*
- A deterministic problem
- A record linkage problem

Right now, we are using the `recordlinkage` package. We're using a logistic regression classifier because it fits all of the above.

To consider:
- Maybe we want to run the records with fuel cost data through a different matching model...

In [None]:
# Load IPython's autoreload extension and set it to mode 2, so imported
# modules are reloaded before each cell executes (edits to pudl / pudl_rmi
# source are picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import pudl
import sqlalchemy as sa
import logging
import sys

import pudl_rmi
from pudl_rmi.connect_ferc1_to_eia import *

import recordlinkage as rl
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

In [None]:
# Route root-logger output to stdout, printing only the message text.
formatter = logging.Formatter(fmt='%(message)s')
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Assigning the list replaces any handlers already attached to the root
# logger, so re-running this cell does not stack duplicate handlers.
logger.handlers = [handler]

In [None]:

# Read the default PUDL workspace settings and build a SQLAlchemy engine from
# the value stored under the "pudl_db" key.
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_db_location = pudl_settings["pudl_db"]
pudl_engine = sa.create_engine(pudl_db_location)

# Do not truncate the column list when DataFrames are displayed.
pd.set_option("display.max_columns", None)

## Make outputs via rmi_out

In [None]:
# pudl output object for ferc data
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine,freq='AS',
    fill_fuel_cost=True,
    roll_fuel_cost=True,
    fill_net_gen=True
)

In [None]:
# Wrap the PUDL output object in the pudl_rmi coordinator; the next cell asks
# it for the FERC1-to-EIA connection.
rmi_out = pudl_rmi.coordinate.Output(pudl_out)

In [None]:
# Build the FERC1-to-EIA connection table through the coordinator.
# NOTE(review): clobber=True presumably forces regeneration instead of reusing
# a previously saved result — confirm against pudl_rmi.coordinate.Output.
connects_ferc1_eia = rmi_out.ferc1_to_eia(clobber=True)

## Make outputs directly

In [None]:
# Build `connects_ferc1_eia` step by step so that the intermediate objects
# (inputs, features, tuner, matcher, best matches) stay available for the
# exploration cells further down.
# NOTE(review): `plant_parts_eia` is not assigned in any cell shown in this
# notebook — it must already exist when this cell runs; confirm its source.
inputs = InputManager(pudl_rmi.TRAIN_FERC1_EIA_CSV, pudl_out, plant_parts_eia)

# Feature tables for the "all" and the "training" feature types.
all_feature_builder = Features(feature_type="all", inputs=inputs)
features_all = all_feature_builder.get_features(clobber=False)
train_feature_builder = Features(feature_type="training", inputs=inputs)
features_train = train_feature_builder.get_features(clobber=False)

# Tune on the training features, then hand the best-fit model to the matcher.
train_index = inputs.get_train_index()
tuner = ModelTuner(features_train, train_index, n_splits=10)
best_model = tuner.get_best_fit_model()
matcher = MatchManager(best=best_model, inputs=inputs)
matches_best = matcher.get_best_matches(features_train, features_all)

connects_ferc1_eia = prettyify_best_matches(
    matches_best,
    train_df=inputs.train_df,
    plant_parts_true_df=inputs.plant_parts_true_df,
    plants_ferc1_df=inputs.plants_ferc1_df,
)
# add capex (this should be moved into pudl_out.plants_steam_ferc1)
connects_ferc1_eia = calc_annual_capital_additions_ferc1(connects_ferc1_eia)

### Explore Classification Model

In [None]:
# Summary statistics of the training features, transposed so each feature
# column becomes a row.
features_train.describe().T

In [None]:
# Summary statistics of the full feature table, transposed so each feature
# column becomes a row.
features_all.describe().T

In [None]:
# Display the matcher's `best` attribute.
# NOTE(review): presumably the best-fit model passed as `best=` when the
# matcher was constructed — confirm against MatchManager.
matcher.best

In [None]:
# Put each feature column name next to the coefficient exposed by the matcher.
# Assumes matcher.coefs is ordered like features_all.columns — TODO confirm.
weights = pd.DataFrame(
    {
        "feature": features_all.columns,
        "weight": matcher.coefs,
    }
)
weights

In [None]:
def plot_all_hist(all_df, results_df, murk_df, matches_best_df, range):
    """Overlay histograms of the ``score`` column of up to four dataframes.

    Every dataframe that is not ``None`` is drawn with 100 bins in its own
    colour and legend label, then the figure is titled, labelled and shown
    with ``plt.show()``.

    Args:
        all_df: Dataframe with a ``score`` column; drawn in pink and labelled
            "all options". Skipped when ``None``.
        results_df: Dataframe with a ``score`` column; drawn in purple and
            labelled "all model matches". Skipped when ``None``.
        murk_df: Dataframe with a ``score`` column; drawn in grey and labelled
            "murky wins". Skipped when ``None``.
        matches_best_df: Dataframe with a ``score`` column; drawn in turquoise
            and labelled "winning options". Skipped when ``None``.
        range: Forwarded unchanged as the ``range`` argument of ``plt.hist``.
    """
    # Listed in drawing order, which differs from the parameter order: the
    # murky wins are drawn last, on top of the other layers.
    layers = (
        (all_df, "pink", "all options"),
        (results_df, "purple", "all model matches"),
        (matches_best_df, "turquoise", "winning options"),
        (murk_df, "grey", "murky wins"),
    )
    for frame, colour, legend_label in layers:
        if frame is None:
            continue
        plt.hist(
            frame["score"],
            range=range,
            bins=100,
            color=colour,
            label=legend_label,
        )

    plt.title("weighted score of comparision features")
    plt.xlabel("weighted sum")
    plt.ylabel(None)
    plt.legend()
    plt.show()

In [None]:
# Compare score distributions: match stats computed over all features, the
# model's matches, the murky wins and the best matches. range=None leaves the
# histogram range to matplotlib.
plot_all_hist(
    all_df=matcher.calc_match_stats(features_all),
    results_df=matcher.matches_model,
    murk_df=matcher.murk_df,
    matches_best_df=matches_best,
    range=None,
)

In [None]:
# Tally the best matches by EIA plant part.
# After the aggregation `capacity_mw` is the summed capacity per plant part
# and `score` is the number of matched records (a count, not a score).
wins = (
    matches_best.reset_index()
    .merge(
        inputs.plant_parts_df.reset_index()[
            ["record_id_eia", "plant_part", "capacity_mw"]
        ],
        on=["record_id_eia"],
        how="left",
        # Columns that clash keep their plain name from the plant-parts side;
        # the copy coming from the match table gets the "_feature" suffix.
        suffixes=("_feature", ""),
    )
    .groupby(["plant_part"])
    # Use the string name "sum" rather than the builtin `sum`: pandas 2.1+
    # deprecates builtin callables in agg, and this notebook silences all
    # warnings, so the deprecation would otherwise go unnoticed.
    .agg({"capacity_mw": "sum", "score": "count"})
    .assign(
        count_w=lambda x: x["capacity_mw"] * x["score"],
        # Divide by the total of the one column needed instead of summing
        # every column of the frame and then picking one entry.
        percent_w=lambda x: x["count_w"] / x["count_w"].sum(),
        percent=lambda x: x["score"] / x["score"].sum(),
    )
)

wins.round(2)