# Validate & Integrate FERC1-EIA Manual Matches into the Training Data

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Standard libraries
import importlib
import importlib.resources
import logging
import os
import sys
from pathlib import Path

# 3rd party libraries
import pandas as pd
import sqlalchemy as sa

# Local libraries
import pudl
from pudl.analysis.ferc1_eia_train import *

In [None]:
# Look up the default PUDL workspace settings; the database URLs used below
# are read from this mapping.
pudl_settings = pudl.workspace.setup.get_defaults()

# One SQLAlchemy engine per database, created in the same order as the names
# on the left-hand side.
ferc1_db_engine, ferc1_xbrl_engine, pudl_engine = (
    sa.create_engine(pudl_settings[db_key])
    for db_key in ("ferc1_db", "ferc1_xbrl_db", "pudl_db")
)

# Output object that every table in this notebook is pulled from.
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine,
    freq="AS",
    fill_net_gen=True,
)

In [None]:
# Pull the output tables used throughout this notebook from ``pudl_out``.
# The plant-parts table has its index reset so the index levels become
# ordinary columns that can be filtered on.
ppe = (
    pudl_out
    .plant_parts_eia()
    .reset_index()
)
utils_eia860 = pudl_out.utils_eia860()
plants_all_ferc1 = pudl_out.plants_all_ferc1()
ferc1_eia = pudl_out.ferc1_eia()

## 1. Manual Mapping 
Use the Manual Mapping Spreadsheet and the Plant Parts EIA table (PPE) to check the AI matches and add your own. It's helpful to go plant by plant and read the [Override Instructions](https://docs.google.com/document/d/1nJfmUtbSN-RT5U2Z3rJKfOIhWsRFUPNxs9NKTes0SRA/edit#) to learn how to begin fixing/verifying the FERC-EIA connections.


Double check `plant_id_pudl` by copying the `record_id_ferc1` value from the manual mapping spreadsheet into the code below. You can then use `plant_id_pudl` to filter the PPE.

In [None]:
# FERC1 record to look up: paste the ``record_id_ferc1`` value you want to test.
record_id_ferc1 = "f1_steam_2005_12_145_1_4"

# Show the ``plant_id_pudl`` of every row whose ``record_id`` matches.
plants_all_ferc1.loc[
    plants_all_ferc1["record_id"].eq(record_id_ferc1),
    "plant_id_pudl",
]

While looking through the PPE table, you might find possible matches that come from a different utility. Use the `utils_eia860` table to find that utility's name and location to see if it's a reasonable match.

In [None]:
# EIA utility to look up: replace with the utility ID you want to test.
utility_id_eia = 15466

# Show every row of the utilities table for that utility.
utils_eia860.loc[utils_eia860["utility_id_eia"].eq(utility_id_eia)]

Here's how to navigate the PPE. Add, comment, or uncomment these filters as necessary to find what you need. We recommend keeping `true_gran=True` and `ownership_dupe=False` for the most streamlined mapping experience.

In [None]:
# Columns shown in the filtered view, in display order.
ppe_columns_to_show = [
    "record_id_eia",
    "plant_id_eia",
    "true_gran",
    "report_year",
    "technology_description",
    "utility_id_eia",
    "capacity_mw",
    "net_generation_mwh",
    "installation_year",
    "plant_name_eia",
    "plant_name_ppe",
]

# Row filter: comment or uncomment conditions to narrow the search. The first
# three lines are alternatives for the leading condition; the rest are ANDed on.
ppe_row_filter = (
    (ppe["plant_id_pudl"] == 659)
    #(ppe["plant_name_eia"].str.contains("Pleasant Va"))
    #(ppe["utility_id_eia"] == 13781)
    #& (ppe["report_year"] == 2015)
    #& (ppe["capacity_mw"] < 10)
    & (ppe["true_gran"] == True)
    & (ppe["ownership_dupe"] == False)
)

# Largest-capacity records first.
ppe.loc[ppe_row_filter, ppe_columns_to_show].sort_values(
    "capacity_mw", ascending=False
)

## 2. Validate Manual Matches
Once you've finished checking the maps, make sure everything you want to validate is set to `verified=TRUE`. Then, move the file into the `devtools/ferc1-eia-glue/training_data/add_to_training` directory and run the following functions.

In [None]:
# Load the training data that ships inside the ``pudl.package_data.glue``
# package. ``importlib.resources.path`` returns a context manager, not a path,
# so it has to be entered to obtain something ``pd.read_csv`` can open.
with importlib.resources.path(
    "pudl.package_data.glue", "ferc1_eia_train.csv"
) as training_csv_path:
    current_training_df = pd.read_csv(training_csv_path)

# Directory holding the override spreadsheets. Kept as a string with a
# trailing slash because file names are concatenated onto it directly.
path_to_overrides = "./add_to_training/"

# Names of the ``.xlsx`` workbooks in that directory, skipping names that
# start with "~$" (presumably Excel's temporary lock files -- confirm).
# ``os.listdir`` returns entries in arbitrary order, so sort them to keep the
# processing order stable between runs.
override_files = sorted(
    file_name
    for file_name in os.listdir(path_to_overrides)
    if file_name.endswith(".xlsx") and not file_name.startswith("~$")
)

Validate the files in the `add_to_training` directory where `verified=TRUE`.

In [None]:
# Arguments that are identical for every workbook being validated.
shared_validation_kwargs = dict(
    utils_eia860=utils_eia860,
    ppe=ppe,
    ferc1_eia=ferc1_eia,
    training_data=current_training_df,
    expect_override_overrides=True,
    allow_mismatched_utilities=True,
)

# Run the validation once per override workbook, with a banner before each
# one and a spacer line after it so the output is easy to scan.
for file in override_files:
    print(f"VALIDATING {file} ************** ")
    file_df = pd.read_excel(path_to_overrides + file)
    validate_override_fixes(validated_connections=file_df, **shared_validation_kwargs)
    print(" ")

## 3. Add Manual Matches to Training Data
When you've finished editing the `<UTILITY>_fix_FERC-EIA_overrides.xlsx` and want to add your changes to the official override csv, move your file to the directory called `add_to_training` and then run the following function. 

**Note:** If you have changed or marked TRUE any records that have already been overridden and included in the training data, you will want to set `expect_override_overrides = True`. Otherwise, the function will check to see if you have accidentally tampered with values that have already been matched.

In [None]:
# Hand the loaded tables and the override directory to the helper imported
# from ``pudl.analysis.ferc1_eia_train``. Both checks are relaxed here:
# overriding existing overrides and mismatched utilities are allowed.
validate_and_add_to_training(
    input_dir_path="./add_to_training/",
    utils_eia860=utils_eia860,
    ppe=ppe,
    ferc1_eia=ferc1_eia,
    allow_mismatched_utilities=True,
    expect_override_overrides=True,
)