# Validate & Integrate FERC1-EIA Manual Matches into the Training Data

In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# PUDL source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Standard libraries
import importlib
import importlib.resources
import logging
import os
import sys
from importlib import resources
from pathlib import Path

# 3rd party libraries
import pandas as pd
import sqlalchemy as sa

# Local libraries
import pudl
from pudl.workspace.setup import PudlPaths
from pudl.analysis.ferc1_eia_train import *

In [None]:
# Connect to the PUDL database at the location reported by PudlPaths.
pudl_engine = sa.create_engine(PudlPaths().pudl_db)
# Output-table accessor used by the cells below to load tables.
# NOTE(review): freq='AS' is presumably annual aggregation and fill_net_gen
# presumably fills in missing net generation — confirm against PudlTabl's docs.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine, freq='AS', fill_net_gen=True)

In [None]:
# Load the tables used throughout this notebook.
# Plant Parts EIA table (PPE); reset_index() moves its index into regular columns
# so the record IDs can be filtered and displayed like any other column.
ppe = pudl_out.plant_parts_eia().reset_index()
# EIA-860 utilities, used below to look up a utility by utility_id_eia.
utils_eia860 = pudl_out.utils_eia860()
# FERC1 plant records, used below to look up plant_id_pudl by record_id.
plants_all_ferc1 = pudl_out.plants_all_ferc1()
# FERC1-EIA connections, passed to the validation functions below.
ferc1_eia = pudl_out.ferc1_eia()

## 1. Manual Mapping 
Use the Manual Mapping Spreadsheet and the Plant Parts EIA table (PPE) to check the AI matches and add your own. It's helpful to go plant by plant and read the [Override Instructions](https://docs.google.com/document/d/1nJfmUtbSN-RT5U2Z3rJKfOIhWsRFUPNxs9NKTes0SRA/edit#) to learn how to begin fixing/verifying the FERC-EIA connections.


Double check `plant_id_pudl` by copying the `record_id_ferc1` value from the manual mapping spreadsheet into the code below. You can then use plant_id_pudl to sort the PPE.

In [None]:
# FERC1 record whose PUDL plant ID you want to look up.
record_id_ferc1 = "f1_steam_2005_12_145_1_4"

# Select the matching row(s) and show only their plant_id_pudl.
is_requested_record = plants_all_ferc1["record_id"] == record_id_ferc1
plants_all_ferc1.loc[is_requested_record, "plant_id_pudl"]

While looking through the PPE table, you might find possible matches that come from a different utility. Use the `utils_eia860` table to find that utility's name and location to see if it's a reasonable match.

In [None]:
# EIA utility whose name and location you want to inspect.
utility_id_eia = 15466

# Show every EIA-860 row reported for that utility.
is_requested_utility = utils_eia860["utility_id_eia"].eq(utility_id_eia)
utils_eia860.loc[is_requested_utility]

Here's how to navigate the PPE. Add, comment, or uncomment these filters as necessary to find what you need. We recommend keeping `true_gran=True` and `ownership_dupe=False` for the most streamlined mapping experience.

In [None]:
# Row filters: add, comment out, or uncomment conditions to narrow the search.
ppe_row_filter = (
    (ppe["plant_id_eia"]==1393)
    #(ppe["plant_name_eia"].str.contains("Pleasant Va"))
    #(ppe["utility_id_eia"]==13781)
    #& (ppe["report_year"]==2020)
    #& (ppe["capacity_mw"]<10)
    & (ppe["true_gran"]==True)
    & (ppe["ownership_dupe"]==False)
)

# Columns that are most useful when comparing PPE records against FERC1 records.
ppe_display_cols = [
    "record_id_eia",
    "plant_id_eia",
    "true_gran",
    "report_year",
    "technology_description",
    "utility_id_eia",
    "capacity_mw",
    "net_generation_mwh",
    "installation_year",
    "plant_name_eia",
    "plant_name_ppe",
]

# Largest plant parts first.
ppe.loc[ppe_row_filter, ppe_display_cols].sort_values("capacity_mw", ascending=False)

## 2. Validate Manual Matches
Once you've finished checking the maps, make sure everything you want to validate is set to `verified=TRUE`. Then, move the file into the `devtools/ferc1-eia-glue/training_data/add_to_training` directory and run the following functions.

In [None]:
# Load the training data currently shipped with the PUDL package. `as_file`
# hands pandas a real filesystem path, which is how the other packaged CSVs
# are read further down in this notebook.
training_csv = importlib.resources.files("pudl.package_data.glue").joinpath(
    "ferc1_eia_train.csv"
)
with importlib.resources.as_file(training_csv) as training_csv_path:
    current_training_df = pd.read_csv(training_csv_path)

# Directory (relative to this notebook) holding the override spreadsheets.
# Keep the trailing slash: later cells build file paths by string concatenation.
path_to_overrides = "./add_to_training/"

# Excel files to validate. os.listdir() returns names in arbitrary order, so
# sort them to make the validation output reproducible between runs. Names
# starting with "~$" are the temporary lock files Excel creates while a
# workbook is open; they are not real spreadsheets and are skipped.
override_files = sorted(
    file
    for file in os.listdir(path_to_overrides)
    if file.endswith(".xlsx") and not file.startswith("~$")
)

Validate files in the `add_to_training` directory where `verified=TRUE`.

In [None]:
# Arguments shared by every validation call in this cell.
validation_kwargs = dict(
    ppe=ppe,
    ferc1_eia=ferc1_eia,
    training_data=current_training_df,
    expect_override_overrides=True,
    allow_mismatched_utilities=True,
)

# Validate each override spreadsheet in turn, with a banner line per file.
for override_name in override_files:
    print(f"VALIDATING {override_name} ************** ")
    overrides_df = pd.read_excel(f"{path_to_overrides}{override_name}")
    validate_override_fixes(validated_connections=overrides_df, **validation_kwargs)
    print(" ")

Next, handle 1:m matches in the FERC-EIA manual mapping. Begin by making a dataframe of only 1:m matches, melting it, and running it through the validation to check that all row values are valid.

In [None]:
# Columns holding a second or third EIA match; a value in any of them marks
# the row as a 1:m match.
extra_match_cols = [f"record_id_eia_override_{i}" for i in range(2,4)]

for override_name in override_files:
    print(f"VALIDATING {override_name} ************** ")
    overrides_df = pd.read_excel(f"{path_to_overrides}{override_name}")

    # Split the columns into the EIA override columns and everything else.
    match_cols = [
        col for col in overrides_df.columns if 'record_id_eia_override_' in col
    ]
    id_cols = [col for col in overrides_df.columns if col not in match_cols]

    # Keep only 1:m rows, then melt so each EIA match gets its own row under
    # the column name the validation function expects.
    has_extra_match = overrides_df[extra_match_cols].notna().any(axis=1)
    multimatch_df = (
        overrides_df[has_extra_match]
        .melt(
            id_vars=id_cols,
            var_name='match_number',
            value_name='record_id_eia_override',
        )
        .dropna(subset=['record_id_eia_override'])
        .rename(columns={'record_id_eia_override': 'record_id_eia_override_1'})
    )

    validate_override_fixes(
        validated_connections=multimatch_df,
        ppe=ppe,
        ferc1_eia=ferc1_eia,
        training_data=current_training_df,
        expect_override_overrides=True,
        allow_mismatched_utilities=True,
    )

## 3. Add Manual Matches to Training Data
When you've finished editing the `<UTILITY>_fix_FERC-EIA_overrides.xlsx` and want to add your changes to the official override csv, move your file to the directory called `add_to_training` and then run the following function. 

**Note:** If you have changed or marked TRUE any records that have already been overridden and included in the training data, you will want to set `expect_override_overrides = True`. Otherwise, the function will check to see if you have accidentally tampered with values that have already been matched.

If you wish to ignore 1:m matches, set `one_to_many = False`; the training data csv will then only include primary matches. If `one_to_many = True`, the multiple EIA IDs that match a single FERC ID will be assigned a fabricated plant parts list (PPL) record to use for matching.

In [None]:
# Validate the override spreadsheets found in `input_dir_path` and add the
# verified matches to the training data.
# NOTE(review): presumably this rewrites the packaged glue CSVs in place —
# confirm in pudl.analysis.ferc1_eia_train before running.
validate_and_add_to_training(
    utils_eia860=utils_eia860,
    ppe=ppe,
    ferc1_eia=ferc1_eia,
    expect_override_overrides=True,  # records already in the training data may be overridden again
    allow_mismatched_utilities=True,
    input_dir_path="./add_to_training/",
    one_to_many=True,  # also handle 1:m matches; False keeps primary matches only
)

Some of these 1:m matches may not be able to be reduced to a single 'faked' plant part, such as plant parts that include operating & retired parts or parts from more than one EIA plant ID. To finish integration, re-run the plant part list generation. If any parts produce an error, move these parts out of the one_to_many csv and into the null csv, and re-run.

In [None]:
# Get paths to the packaged CSVs edited by the cells below. Each path is bound
# to exactly one name: `one_to_many` for the 1:m matches and `nulls` for the
# records that have no usable match.
glue_data_dir = resources.files("pudl.package_data.glue")
one_to_many = glue_data_dir.joinpath("ferc1_eia_one_to_many.csv")
nulls = glue_data_dir.joinpath("ferc1_eia_null.csv")

In [None]:
# Re-run plant parts list with new faked records. This will take ~15 min. Time for a snack break.
# NOTE(review): update=True presumably forces regeneration rather than reusing
# the table already cached on pudl_out — confirm against PudlTabl.
new_ppe = pudl_out.plant_parts_eia(update = True).reset_index()

In [None]:
# Paste any problematic records provided by the error message here.
problem_record_ids = [
    'f1_steam_2013_12_27_0_2',
    'f1_steam_2014_12_27_0_2',
    'f1_steam_2007_12_166_2_5',
    'f1_steam_2008_12_166_2_5',
    'f1_steam_2009_12_166_2_5',
    'f1_steam_2010_12_166_2_5',
    'f1_steam_2014_12_57_1_4',
    'f1_steam_2015_12_57_1_4',
    'f1_steam_2005_12_210_0_2',
    'f1_steam_2006_12_210_0_2',
    'f1_steam_2007_12_210_0_2',
    'f1_steam_2008_12_210_0_2',
    'f1_steam_2009_12_210_0_2',
    'f1_steam_2010_12_210_0_2',
    'f1_hydro_2005_12_70_1_2',
    'f1_hydro_2006_12_70_1_2',
    'f1_hydro_2007_12_70_1_2',
    'f1_hydro_2008_12_70_1_2',
    'f1_hydro_2009_12_70_1_2',
    'f1_hydro_2010_12_70_1_2',
    'f1_hydro_2011_12_70_1_2',
    'f1_hydro_2012_12_70_1_2',
    'f1_hydro_2013_12_70_1_2',
    'f1_hydro_2014_12_70_1_2',
    'f1_hydro_2015_12_70_1_2',
    'f1_hydro_2016_12_70_1_2',
    'f1_hydro_2017_12_70_1_2',
    'f1_hydro_2018_12_70_1_2',
    'f1_hydro_2019_12_70_1_2',
    'f1_hydro_2005_12_70_2_1',
    'f1_hydro_2006_12_70_2_1',
    'f1_hydro_2007_12_70_2_1',
    'f1_hydro_2008_12_70_2_1',
    'f1_hydro_2009_12_70_2_1',
    'f1_hydro_2010_12_70_2_1',
    'f1_hydro_2011_12_70_2_1',
    'f1_hydro_2012_12_70_2_1',
    'f1_hydro_2013_12_70_2_1',
    'f1_hydro_2014_12_70_2_1',
    'f1_hydro_2015_12_70_2_1',
    'f1_hydro_2016_12_70_2_1',
    'f1_hydro_2017_12_70_2_1',
    'f1_hydro_2018_12_70_2_1',
    'f1_hydro_2019_12_70_2_1',
    'f1_steam_2014_12_8_0_4',
    'f1_steam_2015_12_8_0_4',
    'f1_steam_2013_12_87_0_5',
    'f1_steam_2014_12_87_0_5',
    'f1_steam_2016_12_454_2_4',
    'f1_steam_2017_12_454_2_3',
    'f1_steam_2018_12_454_2_3',
    'f1_steam_2019_12_454_2_3',
    'f1_steam_2017_12_454_3_1',
    'f1_steam_2018_12_454_3_1',
]

# One row per problematic FERC1 record, with a Notes column explaining why it
# is being moved to the null csv.
nulls_to_add = pd.DataFrame({'record_id_ferc1': problem_record_ids}).assign(
    Notes="More than 1 1:m plant part."
)

In [None]:
# Read the current contents of the nulls csv.
with resources.as_file(nulls) as nulls_csv_path:
    nulls_csv = pd.read_csv(nulls_csv_path)

# Append only the records that are not already listed, then save the csv back
# to the same location.
already_listed = nulls_to_add["record_id_ferc1"].isin(nulls_csv["record_id_ferc1"])
nulls_new = nulls_to_add.loc[~already_listed]
nulls_csv = pd.concat([nulls_csv, nulls_new])
nulls_csv.to_csv(nulls, index=False)

In [None]:
# Read the current contents of the one_to_many csv.
with resources.as_file(one_to_many) as one_to_many_csv_path:
    one_to_many_csv = pd.read_csv(one_to_many_csv_path)

# Drop every record that now appears in the nulls csv, then save the csv back
# to the same location.
is_null_record = one_to_many_csv["record_id_ferc1"].isin(nulls_csv["record_id_ferc1"])
one_to_many_csv = one_to_many_csv.loc[~is_null_record]
one_to_many_csv.to_csv(one_to_many, index=False)

In [None]:
# Run again now that the problematic records have been moved out of the
# one_to_many csv and into the null csv.
new_ppe = pudl_out.plant_parts_eia(update = True).reset_index()