In [None]:
%load_ext autoreload
%autoreload 3

In [None]:
import logging
import os

import numpy as np
import pandas as pd
import sqlalchemy as sa

import ferc1_eia_match
from ferc1_eia_match.candidate_set_creation import DataframeEmbedder, SimilaritySearcher
import pudl

In [None]:
# Logger for this notebook, named after the current module (`__name__`).
logger = logging.getLogger(__name__)

In [None]:
# SQLAlchemy engine for the PUDL database; the database location is taken from the
# `pudl_db` entry of the PUDL workspace defaults.
pudl_engine = sa.create_engine(
    pudl.workspace.setup.get_defaults()['pudl_db']
)

# EIA and FERC 1 Inputs

Generate the FERC and EIA inputs or read in a pickled dataframe.

In [None]:
# Report-year range for the experiment; both values are handed to the InputManager
# that builds the FERC and EIA inputs.
start_report_year, end_report_year = 2018, 2020

In [None]:
# Input manager bound to the PUDL engine and the report-year range chosen above;
# the following cells ask it for the EIA and FERC input dataframes.
inputs = ferc1_eia_match.inputs.InputManager(
    pudl_engine=pudl_engine,
    start_report_year=start_report_year,
    end_report_year=end_report_year,
)

In [None]:
# EIA input records from the InputManager; later cells treat this frame's index as
# the `record_id_eia` of each record.
eia_df = inputs.get_eia_input()

In [None]:
# FERC input records from the InputManager; later cells treat this frame's index as
# the `record_id_ferc1` of each record.
ferc_df = inputs.get_ferc_input()

Alternatively, read in pickled dataframes

In [None]:
# Uncomment to load previously pickled inputs instead of regenerating them.
# NOTE: only unpickle files you created yourself - loading a pickle can execute arbitrary code.
# ferc_df = pd.read_pickle("../ferc_18_20.pkl")
# eia_df = pd.read_pickle("../plant_parts_18_20.pkl")

TODO: more feature engineering - use something besides fuel_type_code_pudl?

In [None]:
# Columns present in both the EIA and FERC1 dataframes that feed the similarity search.
# `report_year` is included because it serves as the manual blocking column.
# More elaborate feature engineering can be layered on later.
matching_cols = [
    "plant_name",
    "utility_name",
    "installation_year",
    "construction_year",
    "fuel_type_code_pudl",
    "capacity_mw",
    "report_year",
]

In [None]:
# Keep only the matching columns on each side. reset_index() moves the existing index
# into a regular column (presumably the record id, which later cells merge on) and
# leaves a positional RangeIndex, so a row's label equals its position.
ferc_left, eia_right = (
    source_df[matching_cols].reset_index() for source_df in (ferc_df, eia_df)
)

# Blocking Experiments

Goal: Place all the correctly matching record pairs into the candidate set of matches, while making the size of the candidate set as small as possible. This allows for a more efficient subsequent linkage phase.

## TF-IDF + Equal Weight Aggregation
- Attribute vectorization: TF-IDF
- Tuple aggregation: equal weight concatenation
- Similarity search: L2 distance and cosine distance (separate)

In [None]:
# Maps each column to the name of the embedding method applied to it, grouped here by
# method: text-like columns use TF-IDF, numeric columns are min-max scaled.
# Key order is plant_name, utility_name, fuel_type_code_pudl, then the numeric columns.
col_embedding_dict = {
    column: method
    for method, columns in (
        ("tfidf", ("plant_name", "utility_name", "fuel_type_code_pudl")),
        ("min_max_scale", ("installation_year", "construction_year", "capacity_mw")),
    )
    for column in columns
}
# FERC records are the "left" side and EIA records the "right" side of the embedder.
embedder = DataframeEmbedder(
    left_df=ferc_left,
    right_df=eia_right,
    col_embedding_dict=col_embedding_dict,
    pudl_engine=pudl_engine,
)

In [None]:
# Embed both dataframes, blocking on report_year. The following cells read the
# embedding matrices and the per-block index dicts from the embedder.
embedder.embed_dataframes(blocking_col="report_year")

In [None]:
# Shapes of the left (FERC) and right (EIA) embedding matrices.
embedder.left_embedding_matrix.shape, embedder.right_embedding_matrix.shape

In [None]:
# Number of blocks on each side: there should be one key per distinct value of the
# blocking column (report_year).
len(embedder.left_blocks_dict), len(embedder.right_blocks_dict)

In [None]:
def run_search(embedder, k, metric="ip"):
    """Find the ``k`` best right-hand candidates for every left-hand record, block by block.

    For each key in ``embedder.left_blocks_dict`` a ``SimilaritySearcher`` is built from
    the left rows (queries) and right rows (menu) of that block, and its result is
    written into the output rows belonging to the block's left records.

    Args:
        embedder: Object exposing ``left_df``, ``left_embedding_matrix``,
            ``right_embedding_matrix``, ``left_blocks_dict`` and ``right_blocks_dict``.
            Each block dict maps a block key to an object whose ``to_numpy()`` yields
            row positions into the corresponding embedding matrix.
        k: Number of candidates to keep per left record.
        metric: ``"l2"`` for L2-distance search; ``"ip"`` (the default) or ``"cosine"``
            for cosine-similarity search.

    Returns:
        Integer array of shape ``(len(embedder.left_df), k)``. Row ``i`` holds the
        searcher's result for left record ``i`` (presumably positions into the right
        dataframe, since the right positions are passed as ``base_index`` - confirm
        against ``SimilaritySearcher``). Rows that belong to no block are filled
        with ``-1``.

    Raises:
        ValueError: If ``metric`` is not a supported name.
        KeyError: If a left block key is missing from ``embedder.right_blocks_dict``.
    """
    # Fail loudly on a typo such as "L2" instead of silently running a cosine search.
    if metric not in ("l2", "ip", "cosine"):
        raise ValueError(
            f"Unsupported metric {metric!r}; expected 'l2', 'ip' or 'cosine'."
        )
    # The entries are used as positional indices, so store them as integers. Start
    # from -1 rather than uninitialised memory so uncovered rows are recognisable.
    candidate_set = np.full((len(embedder.left_df), k), -1, dtype=np.int64)
    for block_key, left_block in embedder.left_blocks_dict.items():
        print(f"Conducting search on the {block_key} block")
        left_idx = left_block.to_numpy()
        right_idx = embedder.right_blocks_dict[block_key].to_numpy()
        sim_searcher = SimilaritySearcher(
            query_embedding_matrix=embedder.left_embedding_matrix[left_idx],
            menu_embedding_matrix=embedder.right_embedding_matrix[right_idx],
            base_index=right_idx,
        )
        if metric == "l2":
            block_candidate_set = sim_searcher.l2_distance_search(k)
        else:
            block_candidate_set = sim_searcher.cosine_similarity_search(k)
        # Scatter the block's results back to the rows of its left records.
        candidate_set[left_idx] = block_candidate_set
    return candidate_set

In [None]:
%%time
# Top-10 EIA candidates for every FERC record, searched per report-year block with L2 distance.
candidate_set = run_search(embedder, k=10, metric="l2")

In [None]:
# Quick sanity check: look at the first FERC record; the next cell shows the EIA
# candidates found for it.
ferc_left.iloc[0]

In [None]:
# EIA candidates proposed for the first FERC record. Cast to int so the positional
# lookup does not depend on the dtype candidate_set happens to be stored in.
eia_right.iloc[candidate_set[0].astype(int)]

In [None]:
# Distinct EIA positions that appear anywhere in the candidate set. They are cast to
# integers because they are used for positional (`iloc`) lookups. Negative values are
# dropped: they can never be real candidates, but `iloc` would read them as offsets
# from the end of the frame.
eia_candidate_idx = np.unique(candidate_set).astype(np.int64)
eia_candidate_idx = eia_candidate_idx[eia_candidate_idx >= 0]

In [None]:
# eia_right was built from eia_df with reset_index(), so row positions agree and the
# candidate positions can be used on the full eia_df directly.
eia_cands = eia_df.iloc[eia_candidate_idx]

In [None]:
# Fraction of all EIA records that appear in at least one candidate list.
len(eia_cands)/len(eia_right)

In [None]:
# Uncomment to save the candidate EIA records to a pickle file in the working directory.
# eia_cands.to_pickle("eia_candidates_18_20_0.pkl")

# Measure Performance

How many of the known training matches does the candidate set capture (recall@k), and how small can the candidate set be kept while still capturing them?

In [None]:
# Read in the training data: labeled FERC1-to-EIA record pairs. Later cells use its
# `record_id_ferc1`, `record_id_eia` and `notes` columns. The path is relative to the
# kernel's working directory.
train_df = pd.read_csv("inputs/ferc1_eia_train.csv")

In [None]:
# Keep only training pairs whose FERC record and EIA record are both present in the
# loaded inputs; pairs outside the loaded data cannot be evaluated.
train_df = train_df[
    train_df.record_id_ferc1.isin(ferc_df.index)
    & train_df.record_id_eia.isin(eia_df.index)
]

In [None]:
# Number of training pairs available for evaluation.
len(train_df)

In [None]:
# Attach to every training pair the row position of its FERC record in ferc_left
# (`ferc_index`) and of its EIA record in eia_right (`eia_index`). Both frames carry a
# positional index after their earlier reset_index(), so resetting again with a name
# exposes that position as a column. Later cells use these positions to index
# candidate_set and the left/right frames.
train_df_with_idx = (
    train_df
    .merge(
        ferc_left.reset_index(names="ferc_index")[["record_id_ferc1", "ferc_index"]],
        how="inner",
        on="record_id_ferc1",
    )
    .merge(
        eia_right.reset_index(names="eia_index")[["record_id_eia", "eia_index"]],
        how="inner",
        on="record_id_eia",
    )
)
ferc_idx = train_df_with_idx["ferc_index"]
eia_idx = train_df_with_idx["eia_index"]
train_df_with_idx

Do all of the EIA records in the training data show up in the candidate set?

In [None]:
# Set-level coverage: the share of distinct training EIA records that occur anywhere
# in the candidate rows of the training FERC records (not necessarily in the row of
# their own FERC partner).
eia_cands_idx_in_train = np.unique(candidate_set[ferc_idx])
len(set(eia_idx).intersection(eia_cands_idx_in_train)) / len(set(eia_idx))

What percentage of FERC to EIA pairs show up in the candidate set?

In [None]:
# For each training pair, check whether its EIA position appears in the candidate row
# of its *own* FERC record. The comparison has to be row-wise: np.in1d flattens its
# second argument, so it would count a pair as found whenever the EIA record is a
# candidate of any training FERC record, overstating the pairwise recall.
pair_is_correct = (candidate_set[ferc_idx] == np.asarray(eia_idx)[:, None]).any(axis=1)
n_correct_pairs = np.sum(pair_is_correct)
n_correct_pairs

In [None]:
# Fraction of training pairs counted as captured by the candidate set.
n_correct_pairs/len(ferc_idx)

What type of records are we not getting correct?

Problems/Fixes:
- Often they have one or more columns that are null.
- There are clues for `fuel_type_code_pudl` (FTCP) in the plant name, e.g. "southwestern diesel"; try to impute the fuel type from the name
  - An easy first approach is to search the name for an FTCP keyword, e.g. "moline conventional hydro (4 units @ 900kw each)"
- No white space around special characters - "south carolina electric&gas company"
- If one of `installation_year` or `construction_year` is missing, fill in with the other? Makes more sense than imputing from an average
- strip apostrophes? "queen's creek"
- Need to take into account … (TODO: this note was left unfinished)

In [None]:
# Row positions (within train_df_with_idx) of the training pairs that were not captured.
wrong_idx = np.where(~pair_is_correct)[0]

In [None]:
# Number of training pairs that were not captured.
len(wrong_idx)

In [None]:
# Which missed pair to inspect (a position within wrong_idx); change the value and
# re-run the cells below to browse other misses.
i = 20

In [None]:
# Training row of the i-th missed pair.
train_rec = train_df_with_idx.iloc[wrong_idx[i]]

In [None]:
# Value of the training data's `notes` column for this pair.
train_rec.notes

In [None]:
# FERC record of the missed pair (the double brackets keep it a one-row DataFrame).
ferc_left.iloc[[train_rec.ferc_index]]

In [None]:
# EIA record the FERC record should have been matched to (one-row DataFrame).
eia_right.iloc[[train_rec.eia_index]]

In [None]:
# EIA candidates that were proposed for this FERC record instead of the true match.
# The positions get their own name so the `eia_cands` dataframe built earlier in the
# notebook is not overwritten with an index array (re-running the candidate-fraction
# cell would otherwise give a wrong ratio). They are cast to int for the positional lookup.
rec_candidate_idx = candidate_set[train_rec.ferc_index].astype(int)
eia_right.iloc[rec_candidate_idx]