# Depreciation to FERC 1 Connection

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import pudl
import pudl.constants as pc
import sqlalchemy as sa
import logging
import sys
import copy
import pathlib
import random
import warnings
from copy import deepcopy

sys.path.append("../")
import connect_deprish_to_eia
import make_plant_parts_eia
import connect_deprish_to_ferc1
import deprish

In [None]:
# Send every INFO-or-higher record from the root logger to stdout as the bare
# message text, replacing whatever handlers were attached to it before.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]
# Display every column, and up to 100 rows, when a DataFrame is shown.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

### Prepare the inputs

In [None]:
# Every pickled input sits in the ``outputs`` directory next to the current
# working directory, i.e. <cwd>/../outputs/<file name>.
file_path_mul = pathlib.Path.cwd().parent.joinpath(
    'outputs', 'master_unit_list.pkl.gz')
file_path_steam_ferc1 = pathlib.Path.cwd().parent.joinpath(
    'outputs', 'steam_ferc1.pkl.gz')
file_path_ferc1_eia = pathlib.Path.cwd().parent.joinpath(
    'outputs', 'ferc1_to_eia.pkl.gz')
file_path_deprish_eia = pathlib.Path.cwd().parent.joinpath(
    'outputs', 'deprish_to_eia.pkl.gz')

In [None]:
%%time
# Build the InputsCompiler from the four input file paths defined in the
# previous cell. The resulting `inputs` object is what MatchMaker is
# constructed from below.
# NOTE(review): whether the files are read here or lazily on first use is not
# visible from this notebook -- check connect_deprish_to_ferc1.InputsCompiler.
inputs = connect_deprish_to_ferc1.InputsCompiler(
    file_path_mul=file_path_mul,
    file_path_steam_ferc1=file_path_steam_ferc1,
    file_path_ferc1_eia=file_path_ferc1_eia,
    file_path_deprish_eia=file_path_deprish_eia
)

### Generate the options and connections!

In [None]:
# Wrap the compiled inputs in a MatchMaker and run its match() step.
# NOTE(review): the Scaler below is constructed from `match_maker`, not from
# this `matches_df`, and the name `matches_df` is rebound by the later
# pd.merge cell before it is read again in this notebook.
match_maker = connect_deprish_to_ferc1.MatchMaker(inputs)
matches_df = match_maker.match()

In [None]:
# Build a Scaler from the match maker and run its scale() step.
scaler = connect_deprish_to_ferc1.Scaler(match_maker)
scaled_df = scaler.scale()
# Sanity check, shown as this cell's output: the scaled frame is expected to
# have exactly as many rows as scaler.matches_df, so this should display True.
# It is only displayed, not enforced -- a False here will not stop the run.
len(scaled_df) == len(scaler.matches_df)

In [None]:
# TODO: find a cleaner/automatic way to derive this list. For now the columns
# to keep from scaler.matches_df are enumerated by hand, one per line.
cols_to_keep = [
    "plant_part_deprish",
    "plant_part_ferc1",
    "record_id_eia_deprish",
    "record_id_eia_ferc1",
    "plant_part_name",
    "plant_name_match",
    "fraction_owned_deprish",
    "fraction_owned_ferc1",
    "record_count_deprish",
    "record_count_ferc1",
    "plant_part_name_match",
    "state",
    "utility_name_ferc1_deprish",
    "plant_id_pudl",
    "utility_id_ferc1_deprish",
    "true_gran_name_match",
    "report_date",
    "plant_name_new_deprish",
    "report_year_deprish",
    "ownership_deprish",
    "plant_name_eia_deprish",
    "plant_id_eia_deprish",
    "generator_id_deprish",
    "unit_id_pudl_deprish",
    "prime_mover_code_deprish",
    "energy_source_code_1_deprish",
    "technology_description_deprish",
    "ferc_acct_name_deprish",
    "utility_id_eia_deprish",
    "utility_id_pudl",
    "true_gran_deprish",
    "appro_part_label_deprish",
    "appro_record_id_eia_deprish",
    "ownership_dupe_deprish",
    "total_fuel_cost_deprish",
    "net_generation_mwh_deprish",
    "capacity_mw_deprish",
    "record_id_ferc1",
    "utility_id_ferc1_ferc1",
    "utility_name_ferc1_ferc1",
    "plant_id_ferc1",
    "plant_name_ferc1",
    "asset_retirement_cost",
    "avg_num_employees",
    "capacity_factor",
    "capacity_mw_ferc1",
    "capex_equipment",
    "capex_land",
    "capex_per_mw",
    "capex_structures",
    "capex_total",
    "construction_type",
    "construction_year",
    "installation_year",
    "net_generation_mwh_ferc1",
    "not_water_limited_capacity_mw",
    "opex_allowances",
    "opex_boiler",
    "opex_coolants",
    "opex_electric",
    "opex_engineering",
    "opex_fuel",
    "fuel_cost_per_mwh",
    "opex_misc_power",
    "opex_misc_steam",
    "opex_nonfuel_per_mwh",
    "opex_operations",
    "opex_per_mwh",
    "opex_plants",
    "opex_production_total",
    "opex_rents",
    "opex_steam",
    "opex_steam_other",
    "opex_structures",
    "opex_transfer",
    "peak_demand_mw",
    "plant_capability_mw",
    "plant_hours_connected_while_generating",
    "plant_type",
    "water_limited_capacity_mw",
    "total_fuel_cost_ferc1",
    "total_mmbtu",
    "fuel_type_code_pudl",
    "fuel_cost_per_mmbtu",
    "heat_rate_mmbtu_mwh",
    "plant_id_report_year",
    "plant_id_report_year_util_id",
    "opex_nonfuel",
    "opex_nonfuel_own_frac",
    "net_generation_mwh_ferc1_own_frac",
]

In [None]:
# Load the depreciation data so that the saved output can carry ALL of the
# depreciation columns alongside the matched FERC1 columns.
file_path_deprish = pathlib.Path().cwd().parent/'depreciation_rmi.xlsx'
# presumably selects the first sheet of the workbook -- confirm against
# deprish.Extractor
sheet_name_deprish=0
# Extract the sheet, run the deprish Transformer on the extracted result, then
# drop rows that have a null in any of
# connect_deprish_to_eia.RESTRICT_MATCH_COLS.
deprish_df = (
    deprish.Transformer(
        deprish.Extractor(file_path=file_path_deprish,
                          sheet_name=sheet_name_deprish).execute())
    .execute()
    .dropna(subset=connect_deprish_to_eia.RESTRICT_MATCH_COLS)
)

# Outer-merge the depreciation records with the selected columns of
# scaler.matches_df, keeping rows that appear in only one of the two frames.
# This rebinds `matches_df`, which an earlier cell set from match_maker.match().
# NOTE(review): no `on=` is given, so pandas joins on every column name the
# two frames have in common -- confirm that set is the intended key.
# NOTE(review): the merge uses scaler.matches_df rather than scaled_df --
# confirm which frame is meant to feed the saved output.
matches_df = pd.merge(
    deprish_df,
    scaler.matches_df[cols_to_keep],
    how='outer'
)

In [None]:
# Write the merged frame to <cwd>/../outputs/deprish_to_ferc.csv.gz.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra unnamed leading column -- confirm downstream readers
# expect that. Compression is presumably inferred from the ".gz" suffix.
file_path_deprish_ferc = pathlib.Path().cwd().parent / 'outputs' /'deprish_to_ferc.csv.gz'
matches_df.to_csv(file_path_deprish_ferc)

In [None]:
# Testing the scaling: grab the results of two Scaler steps so they can be
# inspected next to matches_df.
# NOTE(review): what these two methods return is not visible from this
# notebook; both results are later indexed by column, so presumably DataFrames.
same_smol = scaler.split_ferc1_data_cols()
same_beeg = scaler.agg_ferc_data_cols()

# Columns to move to the front, in this order, when the frames are reordered
# below. Names that a given frame does not have are skipped by reorder_cols.
first_cols = ['plant_part_deprish', 'plant_part_ferc1',
              'record_id_eia_deprish', 'record_id_eia_ferc1',
              'plant_name', 'plant_name_match', 'plant_name_ferc1', 'plant_name_new_ferc1',
              'fraction_owned_deprish', 'fraction_owned_ferc1',
              'record_count_deprish', 'record_count_ferc1',
              'record_count_matches_deprish'
              ]
def reorder_cols(df, first_cols):
    """Return ``df`` with the ``first_cols`` it contains moved to the front.

    This only exists to make the frames easier to eyeball and may be removed
    later.

    Args:
        df: DataFrame whose columns should be reordered.
        first_cols: column names to place first, in the order given. Names
            that are not columns of ``df`` are ignored.

    Returns:
        A selection of ``df`` holding the same columns: those named in
        ``first_cols`` first, followed by the remaining columns in their
        existing order.
    """
    leading = [col for col in first_cols if col in df.columns]
    trailing = [col for col in df.columns if col not in first_cols]
    return df[leading + trailing]
# Move the first_cols to the front of each frame we want to inspect; each name
# is rebound to the reordered result.
matches_df = reorder_cols(matches_df, first_cols)
same_smol = reorder_cols(same_smol, first_cols)
same_beeg = reorder_cols(same_beeg, first_cols)