In [1]:
%load_ext autoreload
%autoreload 3

In [2]:
import logging
import os
import time

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
import sqlalchemy as sa

import ferc1_eia_match
from ferc1_eia_match import config
from ferc1_eia_match.metrics import blocking
from ferc1_eia_match.candidate_set_creation import DataframeEmbedder, SimilaritySearcher
import pudl

In [3]:
logger = logging.getLogger(__name__)

In [4]:
pudl_engine = sa.create_engine(pudl.workspace.setup.PudlPaths().pudl_db)

# EIA and FERC 1 Inputs

Generate the FERC and EIA inputs or read in a pickled dataframe.

In [79]:
# set configuration for model
model_config = {
    "inputs": {
        "start_year": 2011,
        "end_year": 2021,
    },
    "embedding": {
        "column_transformers": [
            ("plant_name", TfidfVectorizer(), "plant_name"),
            ("utility_name", TfidfVectorizer(), "utility_name"),
            ("fuel_type_code_pudl", TfidfVectorizer(), "fuel_type_code_pudl"),
            ("installation_year", MinMaxScaler(), ["installation_year"]),
            ("construction_year", MinMaxScaler(), ["construction_year"]),
            ("capacity_mw", MinMaxScaler(), ["capacity_mw"]),
        ],
        "matching_cols": [
            "plant_name",
            "utility_name",
            "installation_year",
            "construction_year",
            "fuel_type_code_pudl",
            "capacity_mw",
            "report_year"
        ],
        "blocking_col": "report_year"
    },
    "similarity_search": {"distance_metric": "l2_distance_search"},
}
model_config = config.Model(**model_config)

In [80]:
inputs = ferc1_eia_match.inputs.InputManager(
    pudl_engine=pudl_engine,
    start_report_year=model_config.inputs.start_year,
    end_report_year=model_config.inputs.end_year)

In [81]:
ferc_df = inputs.get_ferc_input()

INFO:ferc1_eia_match.inputs:Creating FERC plants input.


In [None]:
eia_df = inputs.get_eia_input()

TODO: more feature engineering - use something besides fuel_type_code_pudl?

In [9]:
ferc_left = ferc_df[model_config.embedding.matching_cols].reset_index()
eia_right = eia_df[["record_id_eia"] + model_config.embedding.matching_cols]

# Blocking Experiments

Goal: Place all the correctly matching record pairs into the candidate set of matches, while making the size of the candidate set as small as possible. This allows for a more efficient subsequent linkage phase.

## Conduct Experiment
Example:
- Column vectorization: TF-IDF
- Tuple aggregation: equal weight concatenation
- Similarity search: L2 distance

In [41]:
embedder = DataframeEmbedder(left_df=ferc_left,
                             right_df=eia_right,
                             column_transformers=model_config.embedding.column_transformers,
                             blocking_col=model_config.embedding.blocking_col)

In [42]:
embedder.embed_dataframes()

In [45]:
# probably makes sense to try PCA
embedder.left_embedding_matrix.shape, embedder.right_embedding_matrix.shape

((20466, 16567), (454198, 16567))

In [46]:
# there should be a key for each unique value in blocking_col
# does it make sense that there's FERC data for 1994-2000 but no EIA data?
len(embedder.left_blocks_dict), len(embedder.right_blocks_dict)

(11, 11)

Seems like the full embedding matrices can't fit into memory for the similarity search step (or maybe that's just my computer), try using PCA?

In [21]:
left_m = np.asarray(embedder.left_embedding_matrix.todense())

In [None]:
left_m = PCA(copy=False, n_components=None).fit_transform(left_m)

In [None]:
right_m = PCA(copy=False, n_components=None).fit_transform(left_m)

In [47]:
searcher = SimilaritySearcher(query_embedding_matrix=embedder.left_embedding_matrix,
                   menu_embedding_matrix=embedder.right_embedding_matrix,
                   query_blocks_dict=embedder.left_blocks_dict,
                   menu_blocks_dict=embedder.right_blocks_dict)

In [48]:
k = 10

In [49]:
# generate candidate set
candidate_set, distances = searcher.run_candidate_pair_search(k=k)

Conducting search for candidate pairs on the 2016 block
Conducting search for candidate pairs on the 2017 block
Conducting search for candidate pairs on the 2018 block
Conducting search for candidate pairs on the 2019 block
Conducting search for candidate pairs on the 2020 block
Conducting search for candidate pairs on the 2021 block
Conducting search for candidate pairs on the 2011 block
Conducting search for candidate pairs on the 2012 block
Conducting search for candidate pairs on the 2013 block
Conducting search for candidate pairs on the 2014 block
Conducting search for candidate pairs on the 2015 block


In [None]:
train_df = pudl.analysis.ferc1_eia_record_linkage.prep_train_connections(eia_df,
                                                                         start_report_year,
                                                                         end_report_year)

In [None]:
# run experiment
ks = [5, 10, 15, 20, 25, 30, 40, 50]
blocking.measure_blocking(searcher.run_candidate_pair_search, ks, train_df, ferc_left, eia_right, model_config, mlruns="../mlruns")

In [50]:
u, c = np.unique(candidate_set, return_counts=True)

In [58]:
len(ferc_df), len(eia_df), candidate_set.shape, len(u)

(54415, 883801, (20466, 5), 72321)

# Create candidate dataframe

In [78]:
# this can be reused/generalized from the FERC to FERC classifier
# but for now, copy and pasting until that gets merged in
def revert_filled_in_string_nulls(df: pd.DataFrame) -> pd.DataFrame:
    """Revert the filled nulls from string columns.
    Many columns that are used for the classification in
    :func:`plants_steam_assign_plant_ids` have many nulls. The classifier can't handle
    nulls well, so we filled in nulls with empty strings for string columns. This
    function replaces empty strings with null values for specific columns that are known
    to contain empty strings introduced for the classifier.
    """
    for col in [
        "plant_type",
        "construction_type",
        "fuel_type_code_pudl",
        "primary_fuel_by_cost",
        "primary_fuel_by_mmbtu",
    ]:
        if col in df.columns:
            # the replace to_replace={column_name: {"", pd.NA}} mysteriously doesn't work.
            df[col] = df[col].replace(
                to_replace=[""],
                value=pd.NA,
            )
    return df

# not sure it's actually a good idea to replace all 0's with nulls
def revert_filled_in_float_nulls(df: pd.DataFrame) -> pd.DataFrame:
    """Revert the filled nulls from float columns.
    Many columns that are used for the classification in
    :func:`plants_steam_assign_plant_ids` have many nulls. The classifier can't handle
    nulls well, so we filled in nulls with zeros for float columns. This function
    replaces zeros with nulls for all float columns.
    """
    float_cols = list(df.select_dtypes(include=[float]))
    if float_cols:
        df.loc[:, float_cols] = df.loc[:, float_cols].replace(0, np.nan)
    return df

In [None]:
ferc_df = revert_filled_in_string_nulls(ferc_df)
eia_df = revert_filled_in_float_nulls(eia_df)

In [56]:
ferc_cands = ferc_df.reset_index(names="block_num")

In [59]:
eia_cands = eia_df.iloc[candidate_set.flatten()]
block_nums = np.repeat(np.arange(len(ferc_df)), k)
eia_cands.loc[:, "block_num"] = block_nums

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eia_cands.loc[:, "block_num"] = block_nums


In [63]:
eia_cands.to_parquet("inputs/eia_candidates_11_21_k_5.parquet")

In [64]:
ferc_cands.to_parquet("inputs/ferc_candidates_11_21_k_5.parquet")

# Cross Validation

The time goes down at k=10 because FAISS switches similarity search implementations to a multithreaded version.

What type of records are we not getting correct?

Problems/Fixes:
- Often they have one or more column that are null.
- There are clues for FTCP in the name - "southwestern diesel", try to impute in that way
  - Easy idea is to search for a FTCP in the name - "moline conventional hydro (4 units @ 900kw each)"
- No white space around special characters - "south carolina electric&gas company"
- If one of `installation_year` or `construction_year` is missing, fill in with the other? Makes more sense than imputing from an average
- strip apostrophes? "queen's creek" 

In [None]:
wrong_idx = np.where(~pair_is_correct)[0]

In [None]:
len(wrong_idx)

In [None]:
i = 20

In [None]:
train_rec = train_df_with_idx.iloc[wrong_idx[i]]

In [None]:
train_rec.notes

In [None]:
ferc_left.iloc[[train_rec.ferc_index]]

In [None]:
eia_right.iloc[[train_rec.eia_index]]

In [None]:
eia_cands = candidate_set[train_rec.ferc_index]
eia_right.iloc[eia_cands]