In [None]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 3

In [None]:
import logging
import os
import time

import numpy as np
import pandas as pd
import sqlalchemy as sa

import ferc1_eia_match
from ferc1_eia_match import metrics, config
from ferc1_eia_match.candidate_set_creation import DataframeEmbedder, SimilaritySearcher
import pudl

In [None]:
# Logger for this notebook, named after `__name__`.
logger = logging.getLogger(__name__)

In [None]:
# Read the PUDL database connection string from the PUDL workspace defaults
# and open a SQLAlchemy engine on it.
pudl_db_url = pudl.workspace.setup.get_defaults()["pudl_db"]
pudl_engine = sa.create_engine(pudl_db_url)

# EIA and FERC 1 Inputs

Generate the FERC and EIA inputs or read in a pickled dataframe.

In [None]:
# Model configuration: the report-year range for the inputs, the per-column
# embedding choices, and the distance metric used by the similarity search.
# The sections are passed straight to config.Model as keyword arguments.
model_config = config.Model(
    inputs={
        "start_year": 2019,
        "end_year": 2020,
    },
    embedding={
        "embedding_map": {
            "plant_name": {"embedding_type": "tfidf_vectorize"},
            "utility_name": {"embedding_type": "tfidf_vectorize"},
            "fuel_type_code_pudl": {"embedding_type": "tfidf_vectorize"},
            "installation_year": {"embedding_type": "min_max_scale"},
            "construction_year": {"embedding_type": "min_max_scale"},
            "capacity_mw": {"embedding_type": "min_max_scale"},
        },
        "matching_cols": [
            "plant_name",
            "utility_name",
            "installation_year",
            "construction_year",
            "fuel_type_code_pudl",
            "capacity_mw",
            "report_year",
        ],
        "blocking_col": "report_year",
    },
    similarity_search={"distance_metric": "l2_distance_search"},
)

In [None]:
# Build the input manager for the configured report-year range.
# NOTE(review): the configuration cell sets `start_year` / `end_year` under
# "inputs", but this cell reads `start_report_year` / `end_report_year` —
# confirm which field names config.Model actually exposes.
inputs_config = model_config.inputs
inputs = ferc1_eia_match.inputs.InputManager(
    pudl_engine=pudl_engine,
    start_report_year=inputs_config.start_report_year,
    end_report_year=inputs_config.end_report_year,
)

In [None]:
# FERC-side input records produced by the input manager.
ferc_df = inputs.get_ferc_input()

In [None]:
# EIA-side input records produced by the input manager.
eia_df = inputs.get_eia_input()

Alternatively, read in pickled dataframes

In [None]:
# Optional shortcut: uncomment to load previously pickled inputs instead of
# regenerating them. Only unpickle files that come from a trusted source.
# ferc_df = pd.read_pickle("../ferc1_18_20.pkl")
# eia_df = pd.read_pickle("../plant_parts_eia.pkl")

TODO: more feature engineering - use something besides fuel_type_code_pudl?

In [None]:
# Columns present in both the EIA and FERC1 dataframes that feed the
# similarity search. `report_year` is listed because it is the manual blocking
# column; more complex feature engineering could be added later.
matching_cols = [
    "plant_name",
    "utility_name",
    "installation_year",
    "construction_year",
    "fuel_type_code_pudl",
    "capacity_mw",
    "report_year",
]

In [None]:
# Restrict both inputs to the matching columns. reset_index() is called without
# drop, so each frame's previous index is kept as column(s) and the rows get a
# fresh positional index.
ferc_left = ferc_df.loc[:, matching_cols].reset_index()
eia_right = eia_df.loc[:, matching_cols].reset_index()

# Blocking Experiments

Goal: Place all the correctly matching record pairs into the candidate set of matches, while making the size of the candidate set as small as possible. This allows for a more efficient subsequent linkage phase.

## TF-IDF + Equal Weight Aggregation
- Attribute vectorization: TF-IDF
- Tuple aggregation: equal weight concatenation
- Similarity search: L2 distance and cosine distance (separate)

In [None]:
# Name and fuel-type columns are TF-IDF vectorized; year and capacity columns
# are min-max scaled. Each column maps to a list of embedding step names.
tfidf_cols = ("plant_name", "utility_name", "fuel_type_code_pudl")
scaled_cols = ("installation_year", "construction_year", "capacity_mw")
col_embedding_dict = {
    **{col: ["tfidf_vectorize"] for col in tfidf_cols},
    **{col: ["min_max_scale"] for col in scaled_cols},
}

# FERC records are the left side and EIA records the right side.
embedder = DataframeEmbedder(
    left_df=ferc_left,
    right_df=eia_right,
    col_embedding_dict=col_embedding_dict,
)

In [None]:
# Embed both dataframes, using report_year as the blocking column.
embedder.embed_dataframes(blocking_col="report_year")

In [None]:
# Shapes of the left (FERC) and right (EIA) embedding matrices.
embedder.left_embedding_matrix.shape, embedder.right_embedding_matrix.shape

In [None]:
# Number of blocks on each side; there should be one key for each value of the
# blocking column (report_year).
len(embedder.left_blocks_dict), len(embedder.right_blocks_dict)

In [None]:
# Similarity searcher: the left (FERC) embeddings are the queries and the
# right (EIA) embeddings are the menu, each with its blocks dict.
searcher = SimilaritySearcher(
    query_embedding_matrix=embedder.left_embedding_matrix,
    menu_embedding_matrix=embedder.right_embedding_matrix,
    query_blocks_dict=embedder.left_blocks_dict,
    menu_blocks_dict=embedder.right_blocks_dict,
)

In [None]:
# Load the training data, then measure blocking performance of the candidate
# pair search for each value in `ks`.
train_df = pd.read_csv("inputs/ferc1_eia_train.csv")
ks = [5, 10, 15, 20, 25, 30, 40, 50]
metrics.blocking.measure_blocking(
    searcher.run_candidate_pair_search,
    ks,
    train_df,
    ferc_left,
    eia_right,
    model_config,
)

# Cross Validation

The time goes down at k=10 because FAISS switches similarity search implementations to a multithreaded version.

In [None]:
# For each k: time the candidate search, then report the fraction of training
# pairs whose true EIA record appears among the k candidates retrieved for its
# FERC record.
#
# NOTE(review): `run_search`, `ferc_train_idx`, `eia_train_idx` and
# `train_df_with_idx` are not defined in any visible cell of this notebook —
# confirm they are created before this cell so Restart & Run All works.
ks = [5, 10, 15, 20, 25, 30, 40, 50]

# Column vector of true EIA indices, so each one is compared only against the
# candidates of its own FERC record (broadcast across the k candidates).
eia_train_col = np.asarray(eia_train_idx).reshape(-1, 1)

for k in ks:
    print(f"k={k}")
    t0 = time.perf_counter()
    candidate_set = run_search(embedder, k=k, metric="l2")
    print(time.perf_counter() - t0)
    # Assumes candidate_set is a 2-D array with one row of candidate EIA
    # indices per FERC record — TODO confirm. The previous np.in1d call
    # flattened all rows together, so a pair counted as correct whenever its
    # EIA index appeared in *any* training record's candidates.
    train_candidates = np.asarray(candidate_set[ferc_train_idx])
    pair_is_correct = (train_candidates == eia_train_col).any(axis=1)
    n_correct_pairs = np.sum(pair_is_correct)
    print(n_correct_pairs / len(train_df_with_idx))

What type of records are we not getting correct?

Problems/Fixes:
- Often they have one or more columns that are null.
- There are clues for `fuel_type_code_pudl` (FTCP) in the name, e.g. "southwestern diesel"; try to impute it from the name.
  - An easy approach is to search for an FTCP in the name, e.g. "moline conventional hydro (4 units @ 900kw each)".
- No white space around special characters, e.g. "south carolina electric&gas company".
- If one of `installation_year` or `construction_year` is missing, fill it in with the other? That makes more sense than imputing from an average.
- Strip apostrophes? E.g. "queen's creek".

In [None]:
# Positions of the training pairs that were missed by the last search run.
wrong_idx = np.flatnonzero(~pair_is_correct)

In [None]:
# Number of training pairs that were missed.
len(wrong_idx)

In [None]:
# Position within wrong_idx of the missed training pair to inspect below.
i = 20

In [None]:
# Training record for the i-th missed pair.
train_rec = train_df_with_idx.iloc[wrong_idx[i]]

In [None]:
# The `notes` field of the selected training record.
train_rec.notes

In [None]:
# FERC row of the missed pair, selected by position via its `ferc_index`.
ferc_left.iloc[[train_rec.ferc_index]]

In [None]:
# EIA row of the missed pair, selected by position via its `eia_index`.
eia_right.iloc[[train_rec.eia_index]]

In [None]:
# EIA rows that the most recent search run returned as candidates for this
# FERC record, for comparison with the true EIA match.
eia_cands = candidate_set[train_rec.ferc_index]
eia_right.iloc[eia_cands]