In [1]:
%load_ext autoreload
%autoreload 3

In [2]:
import logging
import os
import time

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
import sqlalchemy as sa

import ferc1_eia_match
from ferc1_eia_match import config
from ferc1_eia_match.metrics import blocking
from ferc1_eia_match.candidate_set_creation import DataframeEmbedder, SimilaritySearcher
import pudl


* 'schema_extra' has been renamed to 'json_schema_extra'


In [3]:
logger = logging.getLogger(__name__)

In [4]:
pudl_engine = sa.create_engine(pudl.workspace.setup.PudlPaths().pudl_db)

# EIA and FERC 1 Inputs

Generate the FERC and EIA inputs or read in a pickled dataframe.

In [12]:
# set configuration for model
model_config = {
    "inputs": {
        "start_year": 2001,
        "end_year": 2011,
    },
    "embedding": {
        "column_transformers": [
            ("plant_name", TfidfVectorizer(), "plant_name"),
            ("utility_name", TfidfVectorizer(), "utility_name"),
            ("fuel_type_code_pudl", TfidfVectorizer(), "fuel_type_code_pudl"),
            ("installation_year", MinMaxScaler(), ["installation_year"]),
            ("construction_year", MinMaxScaler(), ["construction_year"]),
            ("capacity_mw", MinMaxScaler(), ["capacity_mw"]),
        ],
        "matching_cols": [
            "plant_name",
            "utility_name",
            "installation_year",
            "construction_year",
            "fuel_type_code_pudl",
            "capacity_mw",
            "report_year"
        ],
        "blocking_col": "report_year"
    },
    "similarity_search": {"distance_metric": "l2_distance_search"},
}
model_config = config.Model(**model_config)

In [13]:
inputs = ferc1_eia_match.inputs.InputManager(
    pudl_engine=pudl_engine,
    start_report_year=model_config.inputs.start_year,
    end_report_year=model_config.inputs.end_year)

In [14]:
ferc_df = inputs.get_ferc_input()

INFO:ferc1_eia_match.inputs:Creating FERC plants input.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> ec80dd91891a
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> ec80dd91891a
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> ec80dd91891a

In [15]:
eia_df = inputs.get_eia_input()

INFO:ferc1_eia_match.inputs:Creating the EIA plant parts list input.


In [16]:
train_df = inputs.get_training_data()

INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> ec80dd91891a
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> ec80dd91891a
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> ec80dd91891a
INFO:alembic.runtime.migration:Context impl SQLiteImpl.

In [17]:
ferc_df.report_year.max(), ferc_df.report_year.min(), eia_df.report_year.max(), eia_df.report_year.min()

(2011, 2001, 2011, 2001)

TODO: more feature engineering - use something besides fuel_type_code_pudl?

In [18]:
ferc_left = ferc_df[model_config.embedding.matching_cols].reset_index()
eia_right = eia_df[["record_id_eia"] + model_config.embedding.matching_cols].reset_index(drop=True)

# Blocking Experiments

Goal: Place all the correctly matching record pairs into the candidate set of matches, while making the size of the candidate set as small as possible. This allows for a more efficient subsequent linkage phase.

## Conduct Experiment
Example:
- Column vectorization: TF-IDF
- Tuple aggregation: equal weight concatenation
- Similarity search: L2 distance

In [19]:
embedder = DataframeEmbedder(left_df=ferc_left,
                             right_df=eia_right,
                             column_transformers=model_config.embedding.column_transformers,
                             blocking_col=model_config.embedding.blocking_col)

In [20]:
embedder.embed_dataframes()

In [21]:
# probably makes sense to try PCA
embedder.left_embedding_matrix.shape, embedder.right_embedding_matrix.shape

((19614, 10236), (374839, 10236))

In [22]:
# there should be a key for each unique value in blocking_col
# does it make sense that there's FERC data for 1994-2000 but no EIA data?
len(embedder.left_blocks_dict), len(embedder.right_blocks_dict)

(11, 11)

Seems like the full embedding matrices can't fit into memory for the similarity search step, try using PCA?

In [23]:
searcher = SimilaritySearcher(query_embedding_matrix=embedder.left_embedding_matrix,
                   menu_embedding_matrix=embedder.right_embedding_matrix,
                   query_blocks_dict=embedder.left_blocks_dict,
                   menu_blocks_dict=embedder.right_blocks_dict)

In [24]:
k = 25

In [25]:
# generate candidate set
candidate_set, distances = searcher.run_candidate_pair_search(k=k)

Conducting search for candidate pairs on the 2001 block
Conducting search for candidate pairs on the 2002 block
Conducting search for candidate pairs on the 2003 block
Conducting search for candidate pairs on the 2004 block
Conducting search for candidate pairs on the 2005 block
Conducting search for candidate pairs on the 2006 block
Conducting search for candidate pairs on the 2007 block
Conducting search for candidate pairs on the 2008 block
Conducting search for candidate pairs on the 2009 block
Conducting search for candidate pairs on the 2010 block
Conducting search for candidate pairs on the 2011 block


In [26]:
# temp while experiment tracking is broken
train_df_with_idx = train_df.merge(
    ferc_left.reset_index(names="ferc_index")[["record_id_ferc1", "ferc_index"]],
    how="inner",
    on="record_id_ferc1",
)
train_df_with_idx = train_df_with_idx.merge(
    eia_right.reset_index(names="eia_index")[["record_id_eia", "eia_index"]],
    how="inner",
    on="record_id_eia",
)
ferc_train_idx = train_df_with_idx.ferc_index
eia_train_idx = train_df_with_idx.eia_index

# Compute % that capture match in candidate set
pair_is_correct = np.in1d(eia_train_idx, candidate_set[ferc_train_idx])
n_correct_pairs = np.sum(pair_is_correct)

print("percent_capture", n_correct_pairs / len(train_df_with_idx))

# Compute % that predict match as first value in candidate set
first_match_is_correct = np.in1d(
    eia_train_idx, candidate_set[ferc_train_idx][:, 0]
)
n_first_match_correct = np.sum(first_match_is_correct)

print("percent_first_match", n_first_match_correct / len(train_df_with_idx))

percent_capture 0.963888201830324
percent_first_match 0.7195152114766262


In [None]:
# run experiment
# ks = [5, 10, 15, 20, 25, 30, 40, 50]
ks = [5, 10]
blocking.measure_blocking(searcher.run_candidate_pair_search, ks, train_df, ferc_left, eia_right, model_config, mlruns="../mlruns")

In [34]:
u, c = np.unique(candidate_set, return_counts=True)

In [35]:
pd.DataFrame({"val": u, "count": c}).sort_values(by="count", ascending=False)

Unnamed: 0,val,count
18858,62683.0,312
102053,374834.0,299
102054,374835.0,289
20043,65647.0,262
102055,374836.0,229
...,...,...
54197,189067.0,1
54198,189068.0,1
54199,189069.0,1
54200,189070.0,1


In [36]:
len(ferc_df), len(eia_df), candidate_set.shape, len(u)

(19614, 374839, (19614, 25), 102058)

# Create candidate dataframe

In [27]:
# this can be reused/generalized from the FERC to FERC classifier
# but for now, copy and pasting until that gets merged in
def revert_filled_in_string_nulls(df: pd.DataFrame) -> pd.DataFrame:
    """Revert the filled nulls from string columns.
    Many columns that are used for the classification in
    :func:`plants_steam_assign_plant_ids` have many nulls. The classifier can't handle
    nulls well, so we filled in nulls with empty strings for string columns. This
    function replaces empty strings with null values for specific columns that are known
    to contain empty strings introduced for the classifier.
    """
    for col in [
        "plant_type",
        "construction_type",
        "fuel_type_code_pudl",
        "primary_fuel_by_cost",
        "primary_fuel_by_mmbtu",
    ]:
        if col in df.columns:
            # the replace to_replace={column_name: {"", pd.NA}} mysteriously doesn't work.
            df[col] = df[col].replace(
                to_replace=[""],
                value=pd.NA,
            )
    return df

# not sure it's actually a good idea to replace all 0's with nulls
def revert_filled_in_float_nulls(df: pd.DataFrame) -> pd.DataFrame:
    """Revert the filled nulls from float columns.
    Many columns that are used for the classification in
    :func:`plants_steam_assign_plant_ids` have many nulls. The classifier can't handle
    nulls well, so we filled in nulls with zeros for float columns. This function
    replaces zeros with nulls for all float columns.
    """
    float_cols = list(df.select_dtypes(include=[float]))
    if float_cols:
        df.loc[:, float_cols] = df.loc[:, float_cols].replace(0, np.nan)
    return df

def revert_nulls_custom_cols(df: pd.DataFrame, column_names: list[str], null_value = 0):
    df.loc[:, column_names] = df.loc[:, column_names].replace(null_value, np.nan)
    return df

In [28]:
ferc_df_og = revert_filled_in_string_nulls(ferc_df)
ferc_df_og = revert_filled_in_float_nulls(ferc_df_og)
ferc_df_og = revert_nulls_custom_cols(ferc_df_og, ["installation_year", "construction_year"], 0)
eia_df_og = revert_filled_in_string_nulls(eia_df)
eia_df_og = revert_filled_in_float_nulls(eia_df_og)
eia_df_og = revert_nulls_custom_cols(eia_df_og, ["installation_year", "construction_year"], 0)

In [29]:
ferc_cands = ferc_df_og.reset_index(names="record_id_ferc1")

In [30]:
ferc_cands = ferc_cands.reset_index(names="block_num")

In [31]:
eia_cands = eia_df_og.iloc[candidate_set.flatten()].copy()
block_nums = np.repeat(np.arange(len(ferc_df)), k)
eia_cands.loc[:, "block_num"] = block_nums

In [32]:
eia_cands.to_parquet(f"inputs/eia_candidates_01_11_k_{k}.parquet")

In [33]:
ferc_cands.to_parquet(f"inputs/ferc_candidates_01_11_k_{k}.parquet")

# Cross Validation

The time goes down at k=10 because FAISS switches similarity search implementations to a multithreaded version.

What type of records are we not getting correct?

Problems/Fixes:
- Often they have one or more column that are null.
- There are clues for FTCP in the name - "southwestern diesel", try to impute in that way
  - Easy idea is to search for a FTCP in the name - "moline conventional hydro (4 units @ 900kw each)"
- No white space around special characters - "south carolina electric&gas company"
- If one of `installation_year` or `construction_year` is missing, fill in with the other? Makes more sense than imputing from an average
- strip apostrophes? "queen's creek" 

In [None]:
wrong_idx = np.where(~pair_is_correct)[0]

In [None]:
len(wrong_idx)

In [None]:
i = 20

In [None]:
train_rec = train_df_with_idx.iloc[wrong_idx[i]]

In [None]:
train_rec.notes

In [None]:
ferc_left.iloc[[train_rec.ferc_index]]

In [None]:
eia_right.iloc[[train_rec.eia_index]]

In [None]:
eia_cands = candidate_set[train_rec.ferc_index]
eia_right.iloc[eia_cands]