In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 3

In [14]:
import logging
import os
import time

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
import sqlalchemy as sa

import ferc1_eia_match
from ferc1_eia_match import config
from ferc1_eia_match.metrics import blocking
from ferc1_eia_match.candidate_set_creation import DataframeEmbedder, SimilaritySearcher
import pudl

In [3]:
# Logger for this notebook, named after the current module (`__name__`).
logger = logging.getLogger(__name__)

In [4]:
# Open a SQLAlchemy engine on the PUDL database location reported by PudlPaths.
pudl_db_location = pudl.workspace.setup.PudlPaths().pudl_db
pudl_engine = sa.create_engine(pudl_db_location)

# EIA and FERC 1 Inputs

Generate the FERC and EIA inputs or read in a pickled dataframe.

In [5]:
# Model configuration for this experiment: which report years to load, how each
# column is embedded, and which distance metric the similarity search uses.
model_config = config.Model(
    inputs={
        # NOTE(review): presumably None leaves the year range unbounded — confirm.
        "start_year": None,
        "end_year": None,
    },
    embedding={
        # (name, transformer, column(s)) triples: text columns use TF-IDF,
        # numeric columns are min-max scaled.
        "column_transformers": [
            ("plant_name", TfidfVectorizer(), "plant_name"),
            ("utility_name", TfidfVectorizer(), "utility_name"),
            ("fuel_type_code_pudl", TfidfVectorizer(), "fuel_type_code_pudl"),
            ("installation_year", MinMaxScaler(), ["installation_year"]),
            ("construction_year", MinMaxScaler(), ["construction_year"]),
            ("capacity_mw", MinMaxScaler(), ["capacity_mw"]),
        ],
        # Columns selected from both input tables before embedding.
        "matching_cols": [
            "plant_name",
            "utility_name",
            "installation_year",
            "construction_year",
            "fuel_type_code_pudl",
            "capacity_mw",
            "report_year",
        ],
        "blocking_col": "report_year",
    },
    similarity_search={"distance_metric": "l2_distance_search"},
)

In [6]:
# Input manager bound to the PUDL database and the configured report-year range.
year_cfg = model_config.inputs
inputs = ferc1_eia_match.inputs.InputManager(
    pudl_engine=pudl_engine,
    start_report_year=year_cfg.start_year,
    end_report_year=year_cfg.end_year,
)

In [7]:
# Build the FERC plants input table (the left side of the match).
ferc_df = inputs.get_ferc_input()

INFO:ferc1_eia_match.inputs:Creating FERC plants input.


In [8]:
# Build the EIA plant parts list input table (the right side of the match).
eia_df = inputs.get_eia_input()

INFO:ferc1_eia_match.inputs:Creating the EIA plant parts list input.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> ec80dd91891a
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> ec80dd91891a
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  ->

TODO: more feature engineering - use something besides fuel_type_code_pudl?

In [9]:
# Restrict both inputs to the matching columns.
# FERC: the existing index is moved into a column by reset_index().
# EIA: the record_id_eia column is kept alongside the matching columns.
matching_cols = model_config.embedding.matching_cols
ferc_left = ferc_df[matching_cols].reset_index()
eia_keep_cols = ["record_id_eia"] + matching_cols
eia_right = eia_df[eia_keep_cols]

# Blocking Experiments

Goal: Place all the correctly matching record pairs into the candidate set of matches, while making the size of the candidate set as small as possible. This allows for a more efficient subsequent linkage phase.

## Conduct Experiment
Example:
- Column vectorization: TF-IDF
- Tuple aggregation: equal weight concatenation
- Similarity search: L2 distance

In [10]:
# Embedder over the FERC (left) and EIA (right) frames, using the configured
# per-column transformers and blocking column.
embedding_cfg = model_config.embedding
embedder = DataframeEmbedder(
    left_df=ferc_left,
    right_df=eia_right,
    column_transformers=embedding_cfg.column_transformers,
    blocking_col=embedding_cfg.blocking_col,
)

In [11]:
# Run the embedding step.
# NOTE(review): presumably this populates the embedding matrices and blocks
# dicts read by the cells below — confirm in DataframeEmbedder.
embedder.embed_dataframes()

In [12]:
# Compare the shapes of the left (FERC) and right (EIA) embedding matrices.
# The feature dimension is large, so it probably makes sense to try PCA.
embedder.left_embedding_matrix.shape, embedder.right_embedding_matrix.shape

((54415, 18675), (883801, 18675))

In [21]:
# Densify the FERC (left) embedding matrix into a plain ndarray.
# NOTE(review): at the (54415, 18675) shape displayed above this is ~1e9 values,
# roughly 8 GB if the dtype is float64 (TODO confirm) — check available memory.
left_m = np.asarray(embedder.left_embedding_matrix.todense())

In [None]:
# Project the dense FERC embeddings onto their principal components.
# copy=False allows PCA to overwrite `left_m` in place rather than copying it.
# NOTE(review): n_components=None keeps every component, so this does not reduce
# dimensionality — consider an explicit target size if a smaller embedding is the goal.
m = PCA(copy=False, n_components=None).fit_transform(left_m)

In [None]:
# Display the PCA-transformed matrix.
m

In [13]:
# Number of blocks on each side; there should be one key for each unique value
# in blocking_col.
# TODO: does it make sense that there's FERC data for 1994-2000 but no EIA data?
len(embedder.left_blocks_dict), len(embedder.right_blocks_dict)

(21, 21)

In [None]:
# Similarity searcher: FERC (left) embeddings are the queries and EIA (right)
# embeddings are the menu, each with its own blocks dict.
searcher = SimilaritySearcher(
    query_embedding_matrix=embedder.left_embedding_matrix,
    menu_embedding_matrix=embedder.right_embedding_matrix,
    query_blocks_dict=embedder.left_blocks_dict,
    menu_blocks_dict=embedder.right_blocks_dict,
)

In [None]:
# k passed to the candidate pair search below.
# NOTE(review): presumably the number of EIA candidates retrieved per FERC record — confirm.
k = 5

In [None]:
# Generate the candidate set by running the candidate pair search with the
# chosen k.
candidate_set = searcher.run_candidate_pair_search(k=k)

In [None]:
# Run the blocking experiment over a range of k values against the labelled
# training pairs.
# TODO: generate training data from inputs module
train_df = pd.read_csv("inputs/ferc1_eia_true_gran_train_18_20.csv")
ks = [5, 10, 15, 20, 25, 30, 40, 50]
blocking.measure_blocking(
    searcher.run_candidate_pair_search,
    ks,
    train_df,
    ferc_left,
    eia_right,
    model_config,
    mlruns="../mlruns",
)

In [None]:
# u: the distinct values appearing anywhere in candidate_set (sorted);
# c: how many times each of those values occurs.
u, c = np.unique(candidate_set, return_counts=True)

In [None]:
# Size summary: FERC rows, EIA rows, candidate-set shape, and the number of
# distinct values in the candidate set.
len(ferc_df), len(eia_df), candidate_set.shape, len(u)

# Create candidate dataframe

In [None]:
# Move the FERC frame's index into a column named "block_num".
# NOTE(review): the EIA candidates' block_num is assigned from row positions
# (np.arange(len(ferc_df))); these only line up if ferc_df's index is 0..n-1 —
# confirm before joining on block_num.
ferc_cands = ferc_df.reset_index(names="block_num")

In [None]:
# Gather the EIA candidate rows for every FERC record and tag each row with the
# position of the FERC record it was retrieved for.

# One block number per FERC record, repeated once for each of its k candidates.
block_nums = np.repeat(np.arange(len(ferc_df)), k)

# The flattened candidate set must line up one-to-one with the block numbers;
# a mismatch means k changed since candidate_set was generated.
assert candidate_set.size == len(block_nums), (
    "candidate_set does not hold k candidates per FERC record; "
    "re-run the candidate search with the current k"
)

# .assign returns a new frame, which avoids writing into the .iloc selection
# (SettingWithCopyWarning) and keeps this cell safe to re-run.
eia_cands = eia_df.iloc[candidate_set.flatten()].assign(block_num=block_nums)

# Cross Validation

The time goes down at k=10 because FAISS switches similarity search implementations to a multithreaded version.

What type of records are we not getting correct?

Problems/Fixes:
- Often they have one or more columns that are null.
- There are clues for the fuel type (`fuel_type_code_pudl`, FTCP) in the name, e.g. "southwestern diesel"; try imputing it from the name
  - Easy idea is to search for a FTCP in the name - "moline conventional hydro (4 units @ 900kw each)"
- No white space around special characters - "south carolina electric&gas company"
- If one of `installation_year` or `construction_year` is missing, fill in with the other? Makes more sense than imputing from an average
- strip apostrophes? "queen's creek" 

In [None]:
# Positions where pair_is_correct is False.
# NOTE(review): `pair_is_correct` is not defined in any visible cell — it must
# come from a deleted cell or an earlier kernel session; define it above so the
# notebook survives Restart & Run All.
wrong_idx = np.where(~pair_is_correct)[0]

In [None]:
# Number of pairs flagged as not correct.
len(wrong_idx)

In [None]:
# Which entry of wrong_idx to inspect in the cells below.
i = 20

In [None]:
# Select the i-th flagged training record for inspection.
# NOTE(review): `train_df_with_idx` is not defined in any visible cell —
# presumably train_df with ferc_index / eia_index columns added; confirm and
# define it above.
train_rec = train_df_with_idx.iloc[wrong_idx[i]]

In [None]:
# Show the `notes` field of the selected training record.
train_rec.notes

In [None]:
# FERC row at the training record's ferc_index position (list index keeps the
# result a one-row DataFrame).
ferc_left.iloc[[train_rec.ferc_index]]

In [None]:
# EIA row at the training record's eia_index position (list index keeps the
# result a one-row DataFrame).
eia_right.iloc[[train_rec.eia_index]]

In [None]:
# Row positions (into eia_right) of the candidates retrieved for this FERC record.
# A dedicated name is used so the `eia_cands` candidate DataFrame built earlier
# in the notebook is not overwritten by an index array.
eia_cand_positions = candidate_set[train_rec.ferc_index]
eia_right.iloc[eia_cand_positions]