In [None]:
# Reload imported modules automatically so edits to the project packages are
# picked up without restarting the kernel.
# NOTE(review): autoreload mode 3 is only available in recent IPython releases
# — confirm the environment supports it.
%load_ext autoreload
%autoreload 3

In [None]:
import logging

import numpy as np
import pandas as pd
import sqlalchemy as sa

import ferc1_eia_match
from ferc1_eia_match.candidate_set_creation import DataframeEmbedder, SimilaritySearcher
import pudl

In [None]:
# Logger for this notebook; the cells visible here do not log through it yet.
logger = logging.getLogger(__name__)

In [None]:
# Open a SQLAlchemy engine on the database URL stored under "pudl_db" in the
# PUDL workspace defaults.
pudl_defaults = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_defaults['pudl_db'])

In [None]:
from pudl.etl import defs, default_assets, AssetKey
from pudl.helpers import get_asset_group_keys

In [None]:
# Load a single PUDL asset by name; the bare last expression displays the
# loaded value in the cell output.
table_name = "denorm_plants_all_ferc1"
asset_key = AssetKey(table_name)
defs.load_asset_value(asset_key)

# EIA and FERC 1 Inputs

Generate the FERC and EIA inputs or read in a pickled dataframe.

In [None]:
# Report years covered by the experiment. The manual year loop further down
# iterates from start_report_year through end_report_year inclusive.
start_report_year = 2018
end_report_year = 2020

In [None]:
# Input manager used below to build the EIA and FERC input frames for the
# chosen report years.
inputs = ferc1_eia_match.inputs.InputManager(
    pudl_engine=pudl_engine,
    start_report_year=start_report_year,
    end_report_year=end_report_year,
)

In [None]:
# EIA side of the match, produced by the InputManager configured above.
eia_df = inputs.get_eia_input()

In [None]:
# FERC 1 side of the match, produced by the same InputManager.
ferc_df = inputs.get_ferc_input()

Alternatively, read in pickled dataframes

In [None]:
# Optional shortcut: uncomment to load previously pickled inputs instead of
# regenerating them with the InputManager.
# NOTE: unpickling can execute arbitrary code — only load pickle files you
# created yourself.
# ferc_df = pd.read_pickle("../ferc_18_20.pkl")
# eia_df = pd.read_pickle("../plant_parts_18_20.pkl")

In [None]:
# Columns shared by the EIA and FERC1 dataframes that feed the similarity search.
# More elaborate feature engineering can be layered on later.
matching_cols = [
    # free-text names
    "plant_name",
    "utility_name",
    # plant vintage
    "installation_year",
    "construction_year",
    # fuel category and size
    "fuel_type_code_pudl",
    "capacity_mw",
    # manual blocking column
    "report_year",
]

In [None]:
# Keep only the matching columns; reset_index moves each frame's existing index
# into a regular column and leaves a fresh 0..n-1 row index behind.
ferc_left = ferc_df.loc[:, matching_cols].reset_index()
eia_right = eia_df.loc[:, matching_cols].reset_index()

# Blocking Experiments

Goal: Place all the correctly matching record pairs into the candidate set of matches, while making the size of the candidate set as small as possible. This allows for a more efficient subsequent linkage phase.

## TF-IDF + Equal Weight Aggregation
- Attribute vectorization: TF-IDF
- Tuple aggregation: equal weight concatenation
- Similarity search: L2 distance and cosine distance (separate)

In [None]:
# Embedding strategy per column: TF-IDF for the string columns, min-max scaling
# for the numeric ones. Key order is kept as string columns first, then numeric.
col_embedding_dict = {
    **dict.fromkeys(
        ("plant_name", "utility_name", "fuel_type_code_pudl"), "tfidf"
    ),
    **dict.fromkeys(
        ("installation_year", "construction_year", "capacity_mw"), "min_max_scale"
    ),
}
# Embedder over the FERC (left) and EIA (right) frames, using the per-column
# strategies in col_embedding_dict.
embedder = DataframeEmbedder(
    left_df=ferc_left,
    right_df=eia_right,
    col_embedding_dict=col_embedding_dict,
)

In [None]:
# Embed both dataframes, blocking on report_year.
# NOTE(review): presumably this populates the embedding matrices and the
# per-block index dicts inspected in the following cells — confirm in
# DataframeEmbedder.
embedder.embed_dataframes(blocking_col="report_year")

In [None]:
# shapes of the FERC (left) and EIA (right) embedding matrices
embedder.left_embedding_matrix.shape, embedder.right_embedding_matrix.shape

In [None]:
# should be a key for each value in blocking_col
# (number of blocks on the left and on the right side)
len(embedder.left_blocks_dict), len(embedder.right_blocks_dict)

In [None]:
# Scratch check: which type does groupby(...).groups store for each group's
# row labels? run_l2_search calls .to_numpy() on the values of the embedder's
# blocks dicts, which presumably come from a groupby like this — TODO confirm.
df = pd.DataFrame({"a": [1, 2, 1]})
type(df.groupby("a").groups[1])

In [None]:
def run_l2_search(embedder, k):
    """Find the ``k`` nearest right records (L2 distance) for each left record.

    The search is run separately inside every block of ``embedder.left_blocks_dict``
    so that left records are only compared with right records sharing the same
    block key.

    Args:
        embedder: object exposing ``left_df``, ``left_embedding_matrix``,
            ``right_embedding_matrix``, ``left_blocks_dict`` and
            ``right_blocks_dict``. Each blocks dict maps a block key to the row
            indices of that block (values must offer ``.to_numpy()``).
        k: number of candidates to keep per left record.

    Returns:
        Integer array of shape ``(len(embedder.left_df), k)``. Row ``i`` holds
        row indices into the right embedding matrix for left record ``i``.
        Rows that were never searched (their block has no right-hand
        counterpart, or they belong to no block) are filled with ``-1``.
    """
    # The array stores row indices, so use an integer dtype. np.full (rather
    # than np.empty) guarantees that rows which are never written hold the -1
    # sentinel instead of uninitialised memory.
    candidate_set = np.full((len(embedder.left_df), k), -1, dtype=np.int64)
    for block_key in embedder.left_blocks_dict:
        if block_key not in embedder.right_blocks_dict:
            # No right records share this block key, so there is nothing to
            # search against; leave the -1 sentinel in place.
            print(f"Skipping the {block_key} block: no right records to search")
            continue
        print(f"Conducting search on the {block_key} block")
        left_idx = embedder.left_blocks_dict[block_key].to_numpy()
        right_idx = embedder.right_blocks_dict[block_key].to_numpy()
        sim_searcher = SimilaritySearcher(
            query_embedding_matrix=embedder.left_embedding_matrix[left_idx],
            index_embedding_matrix=embedder.right_embedding_matrix[right_idx],
        )
        # The search result indexes into the block; map it back to row indices
        # of the full right embedding matrix.
        block_candidate_set = right_idx[sim_searcher.l2_distance_search(k)]
        candidate_set[left_idx] = block_candidate_set
    return candidate_set

In [None]:
%%time
# five nearest EIA candidates (L2 distance) for every FERC record
candidate_set = run_l2_search(embedder, k=5)

In [None]:
%%time
# block on report year by hand, running the L2 search one year at a time
# this manual blocking will get moved into the SimilaritySearch module later
# we want k best matches for each ferc record
k = 5
# The array stores row positions in eia_right, so it is an integer array; -1
# marks FERC rows that never receive candidates.
candidate_set = np.full((len(ferc_left), k), -1, dtype=np.int64)
for year in np.arange(start_report_year, end_report_year + 1):
    print(year)
    ferc_idx = ferc_left[ferc_left.report_year == year].index.to_numpy()
    eia_idx = eia_right[eia_right.report_year == year].index.to_numpy()
    if len(ferc_idx) == 0 or len(eia_idx) == 0:
        # nothing to search for (or against) in this year
        continue
    # NOTE(review): assumes row i of each embedding matrix corresponds to row i
    # of ferc_left / eia_right — confirm in DataframeEmbedder.
    year_searcher = SimilaritySearcher(
        query_embedding_matrix=embedder.left_embedding_matrix[ferc_idx],
        index_embedding_matrix=embedder.right_embedding_matrix[eia_idx],
    )
    year_candidate_set = year_searcher.l2_distance_search(k)
    # map back from the year index to the full embedding index
    candidate_set[ferc_idx] = eia_idx[year_candidate_set]

In [None]:
# quick sanity check: look at the first FERC record
ferc_left.iloc[0]

In [None]:
# EIA records in the candidate list of the first FERC record (row 0)
eia_right.iloc[candidate_set[0]]

# Measure Performance

How many of the training matches does the candidate set capture, given that we also want to keep the candidate set as small as possible?

In [None]:
# read in training data: pairs of record_id_ferc1 / record_id_eia
# (the path is relative to the notebook's working directory)
train_df = pd.read_csv("../ferc1_eia_train.csv")

In [None]:
# Keep only training pairs whose FERC and EIA records are both present in the
# input frames built for the chosen report years.
has_ferc_record = train_df.record_id_ferc1.isin(ferc_df.index)
has_eia_record = train_df.record_id_eia.isin(eia_df.index)
train_df = train_df[has_ferc_record & has_eia_record]

In [None]:
# number of training pairs left after filtering to records present in both inputs
len(train_df)

In [None]:
# Count how many training pairs are captured by the candidate set.
# Each record-ID -> row-index lookup is built once, instead of filtering the
# full frames for every training pair. setdefault keeps the first occurrence of
# an ID, which matches taking `.index[0]` of the filtered frame.
ferc_pos_by_id = {}
for pos, rec_id in zip(ferc_left.index, ferc_left["record_id_ferc1"]):
    ferc_pos_by_id.setdefault(rec_id, pos)
eia_pos_by_id = {}
for pos, rec_id in zip(eia_right.index, eia_right["record_id_eia"]):
    eia_pos_by_id.setdefault(rec_id, pos)

successes = 0
# (ferc_idx, eia_idx) for every training pair missing from the candidate set
failures = []
for ferc_id, eia_id in zip(train_df["record_id_ferc1"], train_df["record_id_eia"]):
    ferc_idx = ferc_pos_by_id[ferc_id]
    eia_idx = eia_pos_by_id[eia_id]
    if eia_idx in candidate_set[ferc_idx]:
        successes += 1
    else:
        failures.append((ferc_idx, eia_idx))

In [None]:
# training pairs whose EIA record appears in the FERC record's candidate list
successes

In [None]:
# training pairs whose EIA record is missing from the candidate list
len(failures)

In [None]:
# share of training pairs captured by the candidate set
successes/len(train_df)

Is there an overemphasis on string columns? Should `fuel_type_code_pudl` (FTCP) not be embedded in this way?

In [None]:
# position in `failures` of the missed pair to inspect below
i = 2

In [None]:
# take a look at some failures: unpack the (FERC row, correct EIA row) pair
# selected by `i`, then show the FERC record
ferc_fail_idx, eia_correct_idx = failures[i]
ferc_left.iloc[ferc_fail_idx]

In [None]:
# the EIA record the training data pairs with that FERC record
eia_right.iloc[eia_correct_idx]

In [None]:
# the EIA candidates that were actually returned for that FERC record
eia_right.iloc[candidate_set[ferc_fail_idx]]