In [None]:
# Enable IPython's autoreload extension so edits to imported project modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 3

In [None]:
import pandas as pd
import numpy as np
import sqlalchemy as sa

from dagster import AssetKey
from ferc1_eia_match.candidate_set_creation import DataframeEmbedder, SimilaritySearcher
from ferc1_eia_match.name_cleaner import CompanyNameCleaner
from ferc1_eia_match import config
import pudl
from pudl.etl import defs
from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_template_library as ctl

In [None]:
# Build a SQLAlchemy engine for the PUDL database whose location is reported by
# the local PudlPaths configuration.
pudl_db_url = pudl.workspace.setup.PudlPaths().pudl_db
pudl_engine = sa.create_engine(pudl_db_url)

# Get and Clean Inputs

FERC: get the FERC plant and utility information

EIA: get the plant and utility information for unmapped EIA utilities

- There is no FERC plant location data and no FERC utility address information, so the match is done on plant name and utility name only.

In [None]:
# FERC side of the match: read the denormalized FERC1 plants/utilities table.
ferc_df = pd.read_sql(
    sql="denorm_plants_utilities_ferc1",
    con=pudl_engine,
)

In [None]:
# EIA side of the match: the denormalized EIA-860 ownership table, plus the list
# of EIA utility IDs that are not mapped yet.
eia_orig = pd.read_sql(
    sql="denorm_ownership_eia860",
    con=pudl_engine,
)
unmapped_df = pd.read_csv(
    filepath_or_buffer="inputs/missing_utility_id_eia_in_utilities_eia.csv",
)

In [None]:
# Keep only ownership records whose owner is one of the unmapped utilities, and
# only the columns needed for matching.
# NOTE(review): the selected columns may contain repeated rows (e.g. one per
# generator or report year) — confirm whether duplicates should be dropped.
is_unmapped_owner = eia_orig["owner_utility_id_eia"].isin(unmapped_df["utility_id_eia"])
eia_match_cols = [
    "owner_utility_id_eia",
    "plant_name_eia",
    "plant_id_pudl",
    "owner_name",
]
eia_df = eia_orig.loc[is_unmapped_owner, eia_match_cols].reset_index(drop=True)

In [None]:
# Clean the utility names on both sides with the cleaner's default rules.
# Presumably the third argument names the output column; later cells read
# "utility_name" from both frames.
utility_name_cleaner = CompanyNameCleaner()
clean_utility_col = "utility_name"
eia_df = utility_name_cleaner.get_clean_df(
    eia_df, "owner_name", clean_utility_col
)
ferc_df = utility_name_cleaner.get_clean_df(
    ferc_df, "utility_name_ferc1", clean_utility_col
)

In [None]:
# Plant names use the default cleaning rules, except that words inside
# parentheses are kept (only the bracket characters themselves are removed).
# The rule identifiers are looked up by name by the cleaner, so their spelling
# must stay exactly as written here.
plant_name_rules = [
    "replace_amperstand_between_space_by_AND",
    "replace_hyphen_between_spaces_by_single_space",
    "replace_underscore_by_space",
    "replace_underscore_between_spaces_by_single_space",
    "remove_text_puctuation_except_dot",
    "remove_math_symbols",
    "add_space_before_opening_parentheses",
    "add_space_after_closing_parentheses",
    "remove_parentheses",
    "remove_brackets",
    "remove_curly_brackets",
    "enforce_single_space_between_words",
]
plant_name_cleaner = CompanyNameCleaner(cleaning_rules_list=plant_name_rules)

clean_plant_col = "plant_name"
eia_df = plant_name_cleaner.get_clean_df(eia_df, "plant_name_eia", clean_plant_col)
ferc_df = plant_name_cleaner.get_clean_df(ferc_df, "plant_name_ferc1", clean_plant_col)

In [None]:
# Preview the cleaned EIA records.
eia_df.head(n=3)

In [None]:
# Preview the cleaned FERC records.
ferc_df.head(n=3)

# Run Blocking

In [None]:
# Blocking configuration: both name columns are embedded with the
# "tfidf_vectorize" embedding type and both are used as matching columns.
blocking_cols = ["plant_name", "utility_name"]
embedding_config_dict = {
    "embedding_map": {
        col: {"embedding_type": "tfidf_vectorize"} for col in blocking_cols
    },
    "matching_cols": list(blocking_cols),
}
embedding_config = config.EmbeddingConfig(**embedding_config_dict)

In [None]:
# EIA records are the "left" frame and FERC records the "right" frame.
embedder = DataframeEmbedder(
    left_df=eia_df,
    right_df=ferc_df,
    embedding_map=embedding_config.embedding_map,
)

In [None]:
# Compute the embeddings; the following cells read the resulting
# left/right embedding matrices and blocks dicts off `embedder`.
embedder.embed_dataframes()

In [None]:
# Sanity check: shapes of the left (EIA) and right (FERC) embedding matrices.
embedder.left_embedding_matrix.shape, embedder.right_embedding_matrix.shape

In [None]:
# EIA embeddings are the queries; FERC embeddings are the menu searched for
# candidates.
searcher = SimilaritySearcher(
    query_embedding_matrix=embedder.left_embedding_matrix,
    menu_embedding_matrix=embedder.right_embedding_matrix,
    query_blocks_dict=embedder.left_blocks_dict,
    menu_blocks_dict=embedder.right_blocks_dict,
)

In [None]:
# Number of candidates requested per query record in the similarity search.
k = 10

In [None]:
%%time
# Run the candidate search with k candidates per query. Later cells use
# cand_set[i] as FERC row positions for EIA row i.
cand_set = searcher.run_candidate_pair_search(k)

In [None]:
# presumably (number of EIA records, k) — TODO confirm
cand_set.shape

In [None]:
# Row position of the EIA record to inspect in the next cell.
i = 1

In [None]:
# Show EIA record i (first row) stacked on top of its FERC candidates.
name_cols = ["utility_name", "plant_name"]
query_record = eia_df.iloc[[i]][name_cols]
candidate_records = ferc_df.iloc[cand_set[i]][name_cols]
pd.concat([query_record, candidate_records])

In [None]:
# u: sorted unique values that appear anywhere in cand_set;
# c: number of times each of those values occurs.
u, c = np.unique(cand_set, return_counts=True)

In [None]:
# EIA rows, FERC rows, candidate-set shape, and number of distinct candidate values.
len(eia_df), len(ferc_df), cand_set.shape, len(u)

For now, we're just going to jankily block by excluding the FERC records that don't appear in any of the blocks. Later, fix the problem by combining blocks whenever a record shows up in multiple blocks.

In [None]:
# Disabled sketch: give every EIA record a block number and write that number
# onto the FERC candidates retrieved for it. Left commented out because a FERC
# record can show up in more than one block.
# eia_df = eia_df.reset_index(names="block_num")
# block_nums = np.repeat(np.arange(len(eia_df)), k)
# ferc_df.loc[cand_set.flatten().astype(int), "block_num"] = block_nums

In [None]:
# Keep only the FERC rows that were returned as a candidate for at least one
# EIA record (`u` holds the unique values of cand_set, used as row positions).
# NOTE(review): this overwrites ferc_df, so the cell is not safe to re-run, and
# the positions stored in cand_set no longer line up with ferc_df afterwards.
ferc_df = ferc_df.iloc[u]

# Match With Splink

TODO: add in blocking with block_num

In [None]:
# Columns compared by the linkage model.
matching_cols = [
    "plant_name",
    "utility_name",
]
# Columns carried along for identification: the PUDL plant ID and the
# unique-ID column that the linker settings point at.
extra_cols = [
    "plant_id_pudl",
    "index",
]

In [None]:
# Expose each frame's row index as an "index" column, which the linker settings
# use as the unique ID column.
# The guard makes the cell idempotent: an unguarded second reset_index() would
# add a stray "level_0" column, and a third would raise
# "cannot insert level_0, already exists".
if "index" not in eia_df.columns:
    eia_df = eia_df.reset_index()
if "index" not in ferc_df.columns:
    ferc_df = ferc_df.reset_index()

In [None]:
# Base Splink settings: link across the two input tables (no de-duplication
# within a table), identify records by the "index" column, and retain the
# PUDL plant ID column in the output.
settings_dict = dict(
    link_type="link_only",
    unique_id_column_name="index",
    additional_columns_to_retain=["plant_id_pudl"],
)

In [None]:
# Hand the linker only the columns it needs from each frame; the aliases label
# which input table a record came from.
linker_cols = matching_cols + extra_cols
linker_inputs = [eia_df[linker_cols], ferc_df[linker_cols]]
linker = DuckDBLinker(
    linker_inputs,
    input_table_aliases=["eia_df", "ferc_df"],
    settings_dict=settings_dict,
)

In [None]:
# Profile the value distributions of the matching columns (10 most and 5 least
# frequent values) before choosing comparison levels.
linker.profile_columns(matching_cols, top_n=10, bottom_n=5)

In [None]:
# TODO: try with Levenshtein thresholds
# Fuzzy name comparisons for both columns; the exact-match level is disabled.
plant_name_comparison = ctl.name_comparison(
    "plant_name",
    include_exact_match_level=False,
    jaro_winkler_thresholds=[0.8, 0.7, 0.6],
)
utility_name_comparison = ctl.name_comparison(
    "utility_name",
    include_exact_match_level=False,
    jaro_winkler_thresholds=[0.9, 0.7],
    damerau_levenshtein_thresholds=[4, 5],
)

In [None]:
# Add the model definition to the base settings.
# NOTE(review) on the prior ("is this correct?"): for a link_only job the prior
# is (expected number of matching pairs) / (len(eia_df) * len(ferc_df)), so
# 1 / len(eia_df) amounts to assuming about one true match per FERC record —
# confirm that this is the intended assumption.
settings_dict.update(
    comparisons=[plant_name_comparison, utility_name_comparison],
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
    probability_two_random_records_match=1 / len(eia_df),
)

In [None]:
# Push the completed settings (now including comparisons and the prior) into the linker.
linker.load_settings(settings_dict)

In [None]:
%%time
# Estimate the u parameters from randomly sampled record pairs (capped at 1e6 pairs).
# NOTE(review): no seed is passed, so results may differ between runs — confirm
# whether the installed Splink version supports a seed argument.
linker.estimate_u_using_random_sampling(max_pairs=1e6)

In [None]:
# First EM training pass, restricted to pairs whose plant names are at least
# 0.6 Jaro-Winkler similar.
# NOTE(review): 0.6 is also the lowest Jaro-Winkler threshold of the plant_name
# comparison, so no training pair falls below that level; the estimate for the
# plant_name "everything else" level may be unreliable — confirm how Splink
# handles this.
training_blocking_rule_1 = "jaro_winkler_similarity(l.plant_name, r.plant_name) >= 0.6"
training_session_1 = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule_1)

In [None]:
# Second EM training pass, restricted to pairs whose utility names are at least
# 0.8 Jaro-Winkler similar.
training_blocking_rule_2 = "jaro_winkler_similarity(l.utility_name, r.utility_name) >= 0.8"
training_session_2 = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule_2)

In [None]:
# Chart the match weight of every comparison level in the trained model.
# add more jaro_winkler thresholds in?
linker.match_weights_chart()

In [None]:
# Chart the estimated m and u parameters for each comparison level.
linker.m_u_parameters_chart()

In [None]:
# Chart the unlinkable records as reported by the linker.
linker.unlinkables_chart()

In [None]:
# Score candidate pairs, keeping those with a match probability of at least 0.1.
df_preds = linker.predict(threshold_match_probability=0.1)

In [None]:
# Pull the predictions into pandas and order them from most to least likely match.
preds_df = df_preds.as_pandas_dataframe()
sorted_preds_df = preds_df.sort_values(by="match_probability", ascending=False)

In [None]:
# Display the predicted pairs, highest match probability first.
sorted_preds_df