# FERC <> EIA Granular Connections

Notes on the type of problem we are trying to solve:
- A classification problem
    - A Multi-Class Classification problem*
- A deterministic problem
- A record linkage problem

Right now, we are using the recordlinkage package. We're using a logistic regression classifier because it fits all of the above.

To consider:
- Maybe we want to run the records with fuel cost data through a different matching model...

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import pudl
import pudl.constants as pc
import pudl.extract.ferc1
import sqlalchemy as sa
import logging
import sys
import copy
from copy import deepcopy
import scipy
import statistics
import yaml
import pathlib

import recordlinkage as rl
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

In [None]:
import pudl.helpers

In [None]:
# Root-logger configuration for the notebook: INFO and above, message text
# only, written to stdout.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Assign the handler list outright (instead of addHandler) so that this
# handler is the only one attached after the cell runs.
logger.handlers = [handler]

In [None]:
sys.path.append("../")
from pudl.output.ferc1 import *
from pudl_rmi.connect_ferc1_to_eia import *
from pudl_rmi.make_plant_parts_eia import *
import pudl_rmi.connect_ferc1_to_eia
# Never truncate the column list when a dataframe is displayed.
pd.set_option("display.max_columns", None)

# Load the default PUDL workspace settings and build a SQLAlchemy engine from
# their "pudl_db" entry.
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings["pudl_db"])

### Pull in data (EIA, FERC and training)

In [None]:
# Every file lives in an `inputs/` or `outputs/` folder next to the directory
# this notebook is run from.
project_dir = pathlib.Path.cwd().parent
dir_inputs = project_dir / 'inputs'
dir_outputs = project_dir / 'outputs'

# Paths under inputs/
file_path_training = dir_inputs / 'train_ferc1_to_eia.csv'
file_path_deprish = dir_inputs / 'depreciation_rmi.xlsx'

# Paths under outputs/
file_path_mul = dir_outputs / 'master_unit_list.pkl.gz'
file_path_deprish_eia = dir_outputs / 'deprish_to_eia.pkl.gz'
file_path_ferc1_eia = dir_outputs / 'ferc1_to_eia.pkl.gz'
file_path_deprish_ferc1 = dir_outputs / 'deprish_ferc1.pkl.gz'

# PUDL output object that the FERC and EIA tables below are pulled from.
# freq='AS' is the pandas alias for annual (year-start) frequency; fuel costs
# are filled and rolled, net generation is not filled.
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine,
    freq='AS',
    fill_fuel_cost=True,
    roll_fuel_cost=True,
    fill_net_gen=False,
)

In [None]:
# Collect every file location handed to the RMI output coordinator, then pass
# them together with the PUDL output object as keyword arguments.
rmi_file_paths = {
    'file_path_mul': file_path_mul,
    'file_path_deprish': file_path_deprish,
    'file_path_deprish_eia': file_path_deprish_eia,
    'file_path_training': file_path_training,
    'file_path_ferc1_eia': file_path_ferc1_eia,
    'file_path_deprish_ferc1': file_path_deprish_ferc1,
}
rmi_out = pudl_rmi.coordinate.Output(pudl_out=pudl_out, **rmi_file_paths)

In [None]:
# 1. Inputs: training file, PUDL output object and the plant-part list.
inputs = InputManager(
    file_path_training=file_path_training,
    pudl_out=pudl_out,
    plant_parts_df=rmi_out.get_plant_part_list(),
)

# 2. Comparison features, once for all candidates and once for the training
#    set. clobber=False is passed through unchanged (presumably reuses any
#    cached features -- confirm against Features.get_features).
features_all = Features(
    feature_type='all', inputs=inputs
).get_features(clobber=False)
features_train = Features(
    feature_type='training', inputs=inputs
).get_features(clobber=False)

# 3. Tune on the training features / training index (n_splits=10), then use
#    the best-fit model to pick the best matches.
tuner = ModelTuner(features_train, inputs.get_train_index(), n_splits=10)
matcher = MatchManager(best=tuner.get_best_fit_model(), inputs=inputs)
matches_best = matcher.get_best_matches(features_train, features_all)

# 4. Combine the best matches with the plant-part and steam tables held by
#    `inputs` into the final connection table.
connects_ferc1_eia = prettyify_best_matches(
    matches_best,
    plant_parts_true_df=inputs.plant_parts_true_df,
    steam_df=inputs.steam_df,
)

In [None]:
# Write the FERC1-to-EIA connection table to outputs/ferc1_to_eia.pkl.gz
# (pandas infers gzip compression from the file extension by default).
file_path_ferc1_eia = pathlib.Path.cwd().parent.joinpath(
    'outputs', 'ferc1_to_eia.pkl.gz'
)
connects_ferc1_eia.to_pickle(file_path_ferc1_eia)

### Dominion Exploration

In [None]:
# Pull two tables from the PUDL output object for ad-hoc inspection.
# Judging by the method names these are the FERC1 steam plants table and the
# EIA plant-parts table -- contents not verified here. Neither variable is
# referenced again in the visible part of this notebook.
steam = pudl_out.plants_steam_ferc1()
ppe = pudl_out.plant_parts_eia()

In [None]:
# Dominion exploration: subset the FERC1<>EIA connections to a fixed list of
# plants and years, derive utility-level capex weights, merge with the
# Dominion depreciation workbook and write the result to dom_squish.csv.
#
# NOTE(review): `data_col` and `pis_out` are used below but are not defined
# anywhere in the visible part of this notebook. Unless another cell defines
# them first, this cell raises NameError on a fresh kernel -- confirm where
# they are meant to come from.

#pudl 349
#ferc 186
# (presumably the utility_id_pudl / utility_id_ferc1 values matching the
# commented-out filters further down -- confirm)
dom_years = [2015, 2016, 2017, 2018, 2019, 2020]
dom_plants = [
   943,  1052,  1169,  1053,    74,   111,   715,   339,   430,   106,   494,
   574,   661,   429,   424,   235,   148,   118,  1201,   489,   313,  1148,
    15,   549,   517,    45,   365,   177,   454,   230,   618,    37,   610,
  8454, 10701, 10784,  9457, 10616,  9448, 10496,  9913,  9956,  9955,   397,
 10856, 11108]
# [514, 9997, 15, 658, 147, 1049, 1050, 546, 37, 421, 426, 427, 940, 45, 176, 947, 311, 571, 452, 712, 73, 337, 9946, 9948, 607, 228, 486, 615, 105, 233, 362, 491, 110, 117, 1147]
util_col = 'utility_id_ferc1'
weight_col = 'capex_total'
idx_plant = ['plant_id_pudl', 'report_year']
idx_util = ['report_year', util_col]

# Columns carried from the FERC/EIA side into the final merge.
dom_ferc_eia_cols = idx_plant + [util_col] + [
    'record_id_ferc1',
    'record_id_eia',
    'plant_name_new',
    'plant_part',
    'capex_total',
    'capex_annual_addt',
    'single_plant_part',
    f'{weight_col}_util',
    'weight',
    f'retirements_util_{data_col}_from_steam',
]

# Filter to the selected plants/years, compute annual capital additions, and
# keep only the identifying columns plus every column whose name contains
# 'capex' (the set_index / filter / reset_index round trip does that).
dom_ferc_eia = (
    connects_ferc1_eia[
        connects_ferc1_eia.plant_id_pudl.isin(dom_plants)
        # & (connects_ferc1_eia.utility_id_pudl == 349)
        # & (connects_ferc1_eia.utility_id_ferc1 == 186)
        & (connects_ferc1_eia.report_date.dt.year.isin(dom_years))
    ]
    .rename(columns={
        'net_generation_mwh_ferc1': 'net_generation_mwh',
        'capacity_mw_ferc1': 'capacity_mw',
    })
    .pipe(pudl.analysis.mcoe.calc_annual_capital_addts_ferc1)
    .set_index([
        'record_id_ferc1', 'report_year', util_col, 'record_id_eia',
        'plant_name_new', 'plant_part', 'plant_id_pudl',
    ])
    .filter(like='capex')
    .reset_index()
)

# Flag plant-years that have exactly one record in the subset.
test = (
    dom_ferc_eia.assign(single_plant_part=1)
    .groupby(idx_plant)
    [['single_plant_part']].count()
    # the comparison is already boolean; no np.where(..., True, False) needed
    .assign(single_plant_part=lambda x: x.single_plant_part == 1)
    .reset_index()
)

dom_ferc_eia = pd.merge(
    dom_ferc_eia, test,
    on=idx_plant,
    how='left',
)

# Utility-year totals and each record's share of them. The aggregation is
# named with the string 'sum' (passing the builtin `sum` is deprecated in
# pandas) and selected as a Series so a single column is assigned directly.
dom_ferc_eia[f'{weight_col}_util'] = (
    dom_ferc_eia.groupby(idx_util)[weight_col].transform('sum')
)
dom_ferc_eia['weight'] = (
    dom_ferc_eia[weight_col] / dom_ferc_eia[f'{weight_col}_util']
)
# Sum of the negative annual additions per utility-year.
# NOTE(review): the sum is computed on the negative rows only and aligned
# back by index, so rows with non-negative additions get NaN here rather than
# the utility-year total -- confirm that is intended.
dom_ferc_eia[f'retirements_util_{data_col}_from_steam'] = (
    dom_ferc_eia[dom_ferc_eia['capex_annual_addt'] < 0]
    .groupby(idx_util)['capex_annual_addt']
    .transform('sum')
)
# dom_ferc_eia.to_csv('dominion.csv')

dom_deprish = pd.read_excel(
    pathlib.Path().cwd().parent /'inputs'/"Dominion Depreciation Plant Balances.xlsx",
    sheet_name="dom_only"
)

# Outer-merge depreciation with the FERC/EIA subset on plant-year-utility,
# attach `pis_out` per utility-year, and fall back to the FERC/EIA plant name
# where the depreciation name is missing.
dom_squish = (
    pd.merge(
        dom_deprish,
        dom_ferc_eia[dom_ferc_eia_cols],
        on=idx_plant + [util_col],
        how='outer'
    )
    .merge(
        pis_out,
        on=idx_util,
        how='left'
    )
    .sort_values(idx_plant)
    .assign(plant_name_deprish=lambda x: x.plant_name_deprish.fillna(x.plant_name_new))
)
dom_squish.to_csv('dom_squish.csv', index=False)

### Explore Classification Model

In [None]:
# Summary statistics of the training features, transposed for display
# (assumes features_train is a DataFrame with one column per feature).
features_train.describe().T

In [None]:
# Same summary for the full candidate feature set, for comparison with the
# training features (assumes features_all is a DataFrame).
features_all.describe().T

In [None]:
# Display the matcher's `best` attribute (presumably the best-fit model that
# was passed to MatchManager as `best=` -- confirm).
matcher.best

In [None]:
# Line up each feature column name with the matching entry of matcher.coefs
# (presumably the fitted model coefficients -- confirm) and display the table.
weights = pd.DataFrame({
    'feature': features_all.columns,
    'weight': matcher.coefs,
})
weights

In [None]:
def plot_all_hist(all_df, results_df, murk_df, matches_best_df, range):
    """Overlay histograms of the ``score`` column of up to four tables.

    Each table argument is either ``None`` (that layer is skipped) or an
    object indexable with ``['score']``. Layers are drawn in a fixed order:
    all options, all model matches, winning options, murky wins.

    Args:
        all_df: scores drawn in pink, labelled 'all options'.
        results_df: scores drawn in purple, labelled 'all model matches'.
        murk_df: scores drawn in grey, labelled 'murky wins' (drawn last).
        matches_best_df: scores drawn in turquoise, labelled
            'winning options'.
        range: forwarded unchanged to ``plt.hist(range=...)``. The name
            shadows the builtin inside this function; it is kept because
            callers pass it by keyword.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # (table, colour, legend label) in drawing order.
    layers = (
        (all_df, "pink", 'all options'),
        (results_df, "purple", 'all model matches'),
        (matches_best_df, "turquoise", 'winning options'),
        (murk_df, "grey", 'murky wins'),
    )
    for table, colour, legend_label in layers:
        if table is None:
            continue
        plt.hist(
            table['score'],
            range=range,
            bins=100,
            color=colour,
            label=legend_label,
        )

    plt.title('weighted score of comparision features')
    plt.xlabel('weighted sum')
    plt.ylabel(None)
    plt.legend()
    plt.show()

In [None]:
# Plot the score distributions of every candidate, the model's matches, the
# murky wins and the best matches on one figure, with no explicit x-range.
plot_all_hist(
    all_df=matcher.calc_match_stats(features_all),
    results_df=matcher.matches_model,
    murk_df=matcher.murk_df,
    matches_best_df=matches_best,
    range=None,
)

In [None]:
# Tally the best matches by EIA plant part: total capacity, number of matches,
# and each plant part's share both by count and weighted by capacity.
wins = (
    matches_best.reset_index()
    .merge(
        inputs.plant_parts_df.reset_index()[
            ['record_id_eia', 'plant_part', 'capacity_mw']
        ],
        on=['record_id_eia'],
        how='left',
        suffixes=('_feature', ''),
    )
    .groupby(['plant_part'])
    # String aggregation names: passing the builtin `sum` is deprecated in
    # pandas. After this step `score` holds the number of matches per plant
    # part, not a score.
    .agg({'capacity_mw': 'sum', 'score': 'count'})
    .assign(
        count_w=lambda x: x.capacity_mw * x.score,
        # Sum only the column needed instead of the whole frame.
        percent_w=lambda x: x.count_w / x.count_w.sum(),
        percent=lambda x: x.score / x.score.sum(),
    )
)

wins.round(2)