# Use Splink to match FERC1 plants to EIA plant parts

This notebook walks through how to use splink to match FERC1 plants to EIA plant parts, as is done in `pudl.analysis.record_linkage.eia_ferc1_record_linkage_splink_model.py`. Splink has several visualizations during the model training process that are helpful for understanding model weights and the input datasets. Those visualizations are not captured in the PUDL module that implements this model, so this companion notebook provides insight into how to use splink for model development.

The [Splink docs](https://moj-analytical-services.github.io/splink/index.html) include helpful tutorials, and the GitHub issues and discussions are also helpful places to look.

In [1]:
%load_ext autoreload
%autoreload 3

In [2]:
import sqlalchemy as sa
from splink.duckdb.linker import DuckDBLinker
from splink.duckdb.blocking_rule_library import block_on
import pandas as pd

import pudl
from pudl.analysis.record_linkage import eia_ferc1_record_linkage_splink_model as eia_ferc1_splink_model
from pudl.analysis.record_linkage.name_cleaner import CompanyNameCleaner
from pudl.analysis.record_linkage.embed_dataframe import _fill_fuel_type_from_name
from pudl.analysis.record_linkage import eia_ferc1_splink_model_config
from pudl.etl import defs

In [3]:
pudl_engine = sa.create_engine(pudl.workspace.setup.PudlPaths().pudl_db)

# Get model inputs and preprocess

In [26]:
out_ferc1__yearly_all_plants = defs.load_asset_value("out_ferc1__yearly_all_plants")
out_ferc1__yearly_steam_plants_fuel_by_plant_sched402 = defs.load_asset_value("out_ferc1__yearly_steam_plants_fuel_by_plant_sched402")
out_eia__yearly_plant_parts = defs.load_asset_value("out_eia__yearly_plant_parts")

pudl_io_manager: experimental support for parquet enabled. (read=True, write=True)
pudl_io_manager: experimental support for parquet enabled. (read=True, write=True)
pudl_io_manager: experimental support for parquet enabled. (read=True, write=True)


In [5]:
inputs = eia_ferc1_splink_model.get_compiled_input_manager(out_ferc1__yearly_all_plants,
                                                           out_ferc1__yearly_steam_plants_fuel_by_plant_sched402,
                                                           out_eia__yearly_plant_parts)

2024-02-13 15:28:54 [    INFO] catalystcoop.pudl.analysis.record_linkage.eia_ferc1_record_linkage:415 Preparing the FERC1 tables.
2024-02-13 15:29:36 [    INFO] catalystcoop.pudl.analysis.record_linkage.eia_ferc1_record_linkage:820 Updating 77 training records with 1:m plant parts.
2024-02-13 15:29:36 [    INFO] catalystcoop.pudl.analysis.record_linkage.eia_ferc1_record_linkage:717 Restricting training data on years: 2001|2002|2003|2004|2005|2006|2007|2008|2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019|2020|2021|2022|2023


In [6]:
eia_df, ferc_df = eia_ferc1_splink_model.get_input_dfs(inputs)
train_df = eia_ferc1_splink_model.get_training_data_df(inputs)

In [7]:
# Cleaner for plant names, restricted to the explicit list of rules below.
# NOTE(review): the rule names (including spellings such as "amperstand" and
# "puctuation") presumably have to match rule keys defined by CompanyNameCleaner,
# so they should not be "corrected" here -- confirm against the name_cleaner module.
plant_name_cleaner = CompanyNameCleaner(
    cleaning_rules_list=[
        "replace_amperstand_between_space_by_AND",
        "replace_hyphen_between_spaces_by_single_space",
        "replace_underscore_by_space",
        "replace_underscore_between_spaces_by_single_space",
        "remove_text_puctuation_except_dot",
        "remove_math_symbols",
        "add_space_before_opening_parentheses",
        "add_space_after_closing_parentheses",
        "remove_parentheses",
        "remove_brackets",
        "remove_curly_brackets",
        "enforce_single_space_between_words",
    ]
)
# Cleaner for utility names, constructed with CompanyNameCleaner's default arguments.
utility_name_cleaner = CompanyNameCleaner()

In [8]:
ferc_df["plant_name"] = plant_name_cleaner.apply_name_cleaning(ferc_df["plant_name"])
ferc_df["utility_name"] = utility_name_cleaner.apply_name_cleaning(ferc_df["utility_name"])
ferc_df["fuel_type_code_pudl"] = _fill_fuel_type_from_name(ferc_df, "fuel_type_code_pudl", "plant_name")

2024-02-13 15:30:24 [    INFO] catalystcoop.pudl.analysis.record_linkage.embed_dataframe:287 Nulls before filling fuel type from name: 30772
2024-02-13 15:30:24 [    INFO] catalystcoop.pudl.analysis.record_linkage.embed_dataframe:294 Nulls after filling fuel type from name: 25371


In [9]:
eia_df["plant_name"] = plant_name_cleaner.apply_name_cleaning(eia_df["plant_name"])
eia_df["utility_name"] = utility_name_cleaner.apply_name_cleaning(eia_df["utility_name"])
eia_df["fuel_type_code_pudl"] = _fill_fuel_type_from_name(eia_df, "fuel_type_code_pudl", "plant_name")

2024-02-13 15:44:14 [    INFO] catalystcoop.pudl.analysis.record_linkage.embed_dataframe:287 Nulls before filling fuel type from name: 25361
2024-02-13 15:44:15 [    INFO] catalystcoop.pudl.analysis.record_linkage.embed_dataframe:294 Nulls after filling fuel type from name: 22806


In [10]:
# Parse the year columns of both input tables into datetimes using the "%Y" format.
for input_df in (ferc_df, eia_df):
    for year_col in ("installation_year", "construction_year"):
        input_df[year_col] = pd.to_datetime(input_df[year_col], format="%Y")

In [11]:
cols = eia_ferc1_splink_model.ID_COL + eia_ferc1_splink_model.MATCHING_COLS + eia_ferc1_splink_model.EXTRA_COLS
eia_df = eia_df[cols]
ferc_df = ferc_df[cols]

# Set settings dictionary and create linker

In [12]:
settings_dict = {"link_type": "link_only",
                 "unique_id_column_name": "record_id",
                 "additional_columns_to_retain": ["plant_id_pudl", "utility_id_pudl"]}

In [13]:
linker = DuckDBLinker([eia_df, ferc_df], input_table_aliases = ["eia_df", "ferc_df"], settings_dict=settings_dict)

In [14]:
train_table = linker.register_table(train_df, "training_labels", overwrite=True)

# Data Exploration

In [15]:
linker.completeness_chart(cols=eia_ferc1_splink_model.MATCHING_COLS)

Columns with higher cardinality are better for matching. Note the skew in `fuel_type_code_pudl`, which means we'll need to use a term frequency adjustment.

In [23]:
linker.profile_columns(eia_ferc1_splink_model.MATCHING_COLS, top_n=10, bottom_n=5)

# Generate blocking rules

Define blocking rules to reduce the search space of potential candidate pairs that the matching model must consider. See `pudl.analysis.record_linkage.eia_ferc1_splink_model_config` for blocking rule definitions.

From the docs:
- "More generally, we can often specify multiple blocking rules such that it becomes highly implausible that a true match would not meet at least one of these blocking critera. This is the recommended approach in Splink. Generally we would recommend between about 3 and 10, though even more is possible."
- "For linkages in DuckDB on a standard laptop, we suggest using blocking rules that create no more than about 20 million comparisons."

In [59]:
linker.cumulative_num_comparisons_from_blocking_rules_chart(eia_ferc1_splink_model_config.BLOCKING_RULES)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

# Define Comparison Levels

In [61]:
print(eia_ferc1_splink_model_config.plant_name_comparison.human_readable_description)

Comparison 'Exact match vs. Plant_Name within jaro_winkler thresholds 0.9, 0.8, 0.7 vs. anything else' of "plant_name".
Similarity is assessed using the following ComparisonLevels:
    - 'Null' with SQL rule: "plant_name_l" IS NULL OR "plant_name_r" IS NULL
    - 'Exact match plant_name' with SQL rule: "plant_name_l" = "plant_name_r"
    - 'Jaro_winkler_similarity >= 0.9' with SQL rule: jaro_winkler_similarity("plant_name_l", "plant_name_r") >= 0.9
    - 'Jaro_winkler_similarity >= 0.8' with SQL rule: jaro_winkler_similarity("plant_name_l", "plant_name_r") >= 0.8
    - 'Jaro_winkler_similarity >= 0.7' with SQL rule: jaro_winkler_similarity("plant_name_l", "plant_name_r") >= 0.7
    - 'All other comparisons' with SQL rule: ELSE



In [16]:
# Extend the base settings with the model definition: the comparisons and blocking
# rules come from the shared PUDL config module so this notebook and the PUDL
# implementation use the same definitions.
settings_dict.update({
    "comparisons": eia_ferc1_splink_model_config.COMPARISONS,
    "blocking_rules_to_generate_predictions": eia_ferc1_splink_model_config.BLOCKING_RULES,
    # The evaluation cells later in this notebook select the `_l`/`_r` versions of the
    # matching columns from the predictions, so those columns need to be kept.
    "retain_matching_columns": True,
    # NOTE(review): presumably kept so the waterfall charts at the end of the notebook
    # can show per-comparison contributions -- confirm against the Splink docs.
    "retain_intermediate_calculation_columns": True,
    # Prior set to 1 / (number of EIA records); see the markdown explanation below.
    "probability_two_random_records_match": 1/len(eia_df) # this parameter can also be estimated if it's unknown
    }
)

Explanation of probability two random records match calculation:

The EIA dataset has n records and the FERC dataset has m records, where n > m. Each FERC record matches to one EIA record, so there are n - m EIA records that don't have a match.

- If I choose a FERC record first then I have a 1/n chance of choosing the matching EIA record
- If I choose an EIA record first then I have an m/n chance of choosing an EIA record that has a FERC match, and then a 1/m chance of choosing the correct matching FERC record. So the probability of choosing two matching records is m/n * 1/m = 1/n

In either case, the probability is 1/n.

In [17]:
linker.load_settings(settings_dict)

# Estimate Model Parameters

Now that we have specified our linkage model, we need to estimate the probability_two_random_records_match (if not specified in settings dictionary), u, and m parameters.

In [18]:
linker.estimate_u_using_random_sampling(max_pairs=1e7)

----- Estimating u probabilities using random sampling -----


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - plant_name (no m values are trained).
    - utility_name (no m values are trained).
    - construction_year (no m values are trained).
    - installation_year (no m values are trained).
    - capacity_mw (no m values are trained).
    - fuel_type_code_pudl (no m values are trained).
    - net_generation_mwh (no m values are trained).


We can estimate m either from training labels or, in an unsupervised way, with Expectation Maximization.

In [19]:
linker.estimate_m_from_pairwise_labels("training_labels")

In [None]:
# if we do it unsupervised, we need to define training blocking rules
# training_blocking_rule_1 = "l.plant_name = r.plant_name"
# training_session_1 = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule_1)
# training_session_2 = linker.estimate_parameters_using_expectation_maximisation(block_on(["utility_name", "net_generation_mwh"]))
# training_session_3 = linker.estimate_parameters_using_expectation_maximisation(block_on(["capacity_mw", "fuel_type_code_pudl"]))

In [20]:
linker.match_weights_chart()

In [21]:
linker.m_u_parameters_chart()

In [30]:
model_name = "splink_ferc_eia_demo"

In [31]:
# save model settings to a chosen directory
settings = linker.save_model_to_json(f"./model_settings_{model_name}.json", overwrite=True)

# Make Predictions

In [22]:
# predict matches above a certain threshold match probability or match weight
# df_preds = linker.predict(threshold_match_probability=.25)
df_preds = linker.predict()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [23]:
sorted_preds_df = df_preds.as_pandas_dataframe().sort_values(by="match_probability", ascending=False)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [24]:
# Keep only the highest-probability candidate pair for each FERC1 record.
# `sorted_preds_df` is sorted by descending match probability, so the first row seen
# for each `record_id_ferc1` is its best match. `drop_duplicates` keeps that whole row.
# `groupby(...).first()` is deliberately avoided: it returns the first *non-null*
# value of each column, which would splice values from lower-ranked candidate pairs
# into the best-match row whenever the best pair has a null field.
best_match_df = (
    sorted_preds_df
    .rename(columns={"record_id_r": "record_id_ferc1", "record_id_l": "record_id_eia"})
    .drop_duplicates(subset="record_id_ferc1", keep="first")
    .set_index("record_id_ferc1")
    .sort_index()  # same index and row order that a groupby on this key produces
)

In [31]:
overwrite_df = eia_ferc1_splink_model.get_best_matches_with_training_data_overwrites(sorted_preds_df, inputs)

2024-02-13 21:54:58 [    INFO] catalystcoop.pudl.analysis.record_linkage.eia_ferc1_record_linkage_splink_model:247 Metrics before overwrites:
   True positives:  8529
   False positives: 756
   False negatives: 2
   Precision:       0.919
   Recall:          1.0
Precision = of the training data FERC records that the model predicted a match for, this percentage was correct.
Recall = of all of the training data FERC records, the model predicted a match for this percentage.
Metrics before overwrites:
   True positives:  8529
   False positives: 756
   False negatives: 2
   Precision:       0.919
   Recall:          1.0
Precision = of the training data FERC records that the model predicted a match for, this percentage was correct.
Recall = of all of the training data FERC records, the model predicted a match for this percentage.
2024-02-13 21:54:58 [    INFO] catalystcoop.pudl.analysis.record_linkage.eia_ferc1_record_linkage:679 Matches stats:
Percent of training data matches correctly pre

# Evaluate Results

In [50]:
train_df = train_df.rename(columns={"record_id_r": "record_id_ferc1", "record_id_l": "record_id_eia"})

In [40]:
# Narrow the best matches down to the columns worth inspecting: the match scores,
# the left/right versions of every matching column (sorted so each `_l`/`_r` pair
# sits side by side), and the retained PUDL plant and utility IDs.
extra_cols = ["plant_id_pudl_l", "plant_id_pudl_r", "utility_id_pudl_l", "utility_id_pudl_r"]
compared_cols = sorted(
    f"{col}{side}"
    for side in ("_l", "_r")
    for col in eia_ferc1_splink_model.MATCHING_COLS
)
cols = ["record_id_eia", "match_weight", "match_probability", *compared_cols, *extra_cols]
best_match_df = best_match_df.loc[:, cols].reset_index()

In [106]:
def get_splink_true_pos(pred_df, train_df):
    """Count labeled pairs that also appear in the predictions.

    Left-merges ``train_df`` onto ``pred_df`` on both ``record_id_ferc1`` and
    ``record_id_eia`` and returns the number of merged rows found in both frames.

    Args:
        pred_df: Predicted matches with ``record_id_ferc1`` and ``record_id_eia`` columns.
        train_df: Labeled matches with the same two ID columns.

    Returns:
        The count of merged rows whose merge indicator is ``"both"``.
    """
    labeled_vs_pred = train_df.merge(
        pred_df,
        how="left",
        on=["record_id_ferc1", "record_id_eia"],
        indicator=True,
    )
    merge_counts = labeled_vs_pred["_merge"].value_counts()
    return merge_counts["both"]

def get_splink_false_pos(pred_df, train_df):
    """Count FERC records where an incorrect EIA record was predicted.

    Inner-merges the labels and predictions on ``record_id_ferc1`` and counts the
    merged rows whose labeled EIA ID differs from the predicted EIA ID.

    Args:
        pred_df: Predicted matches with ``record_id_ferc1`` and ``record_id_eia`` columns.
        train_df: Labeled matches with the same two ID columns.

    Returns:
        Number of merged rows where the true and predicted ``record_id_eia`` differ.
    """
    paired = train_df.merge(
        pred_df,
        how="inner",
        on="record_id_ferc1",
        suffixes=("_true", "_pred"),
    )
    wrong_eia = paired["record_id_eia_true"] != paired["record_id_eia_pred"]
    return int(wrong_eia.sum())

def get_splink_false_neg(pred_df, train_df):
    """Count labeled FERC records for which no prediction was made.

    Left-merges ``train_df`` onto ``pred_df`` on ``record_id_ferc1`` only and returns
    the number of merged rows that exist in the training data alone.

    Args:
        pred_df: Predicted matches with a ``record_id_ferc1`` column.
        train_df: Labeled matches with a ``record_id_ferc1`` column.

    Returns:
        The count of merged rows whose merge indicator is ``"left_only"``.
    """
    labeled_vs_pred = train_df.merge(
        pred_df,
        how="left",
        on=["record_id_ferc1"],
        indicator=True,
    )
    merge_counts = labeled_vs_pred["_merge"].value_counts()
    return merge_counts["left_only"]

def get_duplicated_eia_plant_part_matches(pred_df):
    """Count predictions that reuse an EIA record already matched by an earlier row.

    A row is counted when its ``record_id_eia`` is non-null and the same ID occurred
    in a previous row; the first occurrence of each ID is not counted.

    Args:
        pred_df: Predicted matches with a ``record_id_eia`` column.

    Returns:
        Number of rows with a non-null, repeated ``record_id_eia``.
    """
    eia_ids = pred_df["record_id_eia"]
    is_repeat = eia_ids.duplicated(keep="first")
    has_id = eia_ids.notnull()
    return int((has_id & is_repeat).sum())

def get_splink_match_at_threshold(df, threshold):
    """Return the rows of ``df`` whose ``match_probability`` is at least ``threshold``.

    Args:
        df: Predictions with a ``match_probability`` column.
        threshold: Inclusive lower bound on the match probability.

    Returns:
        The subset of ``df`` meeting the threshold, with the original index labels.
    """
    meets_threshold = df["match_probability"] >= threshold
    return df[meets_threshold]

In [131]:
# Row label -> match-probability threshold for the summary table. Keeping both in one
# mapping prevents the labels and the thresholds from drifting apart.
stats_thresholds = {
    "splink_.95": 0.95,
    "splink_.9": 0.9,
    "splink_.75": 0.75,
    "splink_.5": 0.5,
    "splink_.25": 0.25,
}
ind = list(stats_thresholds)
data = {"true_pos": [], "false_pos": [], "false_neg": []}
for threshold in stats_thresholds.values():
    # Filter the best matches once per threshold and reuse the result for all three counts.
    preds_at_threshold = get_splink_match_at_threshold(best_match_df, threshold=threshold)
    data["true_pos"].append(get_splink_true_pos(preds_at_threshold, train_df))
    data["false_pos"].append(get_splink_false_pos(preds_at_threshold, train_df))
    data["false_neg"].append(get_splink_false_neg(preds_at_threshold, train_df))

stats_df = pd.DataFrame(index=ind, data=data)
# Precision: share of predictions made for labeled FERC records that were correct.
stats_df["precision"] = stats_df["true_pos"] / (stats_df["true_pos"] + stats_df["false_pos"])
# Recall: share of labeled FERC records for which the correct match was predicted.
stats_df["recall"] = stats_df["true_pos"] / (stats_df["true_pos"] + stats_df["false_neg"])

In [132]:
stats_df

Unnamed: 0,true_pos,false_pos,false_neg,precision,recall
splink_.95,7691,446,1150,0.945189,0.869924
splink_.9,7764,457,1066,0.944411,0.879275
splink_.75,7975,513,799,0.939562,0.908935
splink_.5,8084,537,666,0.93771,0.923886
splink_.25,8225,557,505,0.936575,0.942153


In [107]:
# Row label -> match-probability threshold for the duplicate-match table.
dupe_thresholds = {
    "splink_.9": 0.9,
    "splink_.75": 0.75,
    "splink_.5": 0.5,
    "splink_.25": 0.25,
}
ind = list(dupe_thresholds)
# For each threshold, count predictions that reuse an EIA record already matched
# to another FERC record.
data = {
    "duplicate_eia_plant_part_matches": [
        get_duplicated_eia_plant_part_matches(
            get_splink_match_at_threshold(best_match_df, threshold=threshold)
        )
        for threshold in dupe_thresholds.values()
    ]
}
dupe_df = pd.DataFrame(index=ind, data=data)

In [108]:
dupe_df

Unnamed: 0,duplicate_eia_plant_part_matches
splink_.9,416
splink_.75,485
splink_.5,533
splink_.25,588


In [None]:
best_match_with_overwrites = eia_ferc1_splink_model.get_best_matches_with_training_data_overwrites(sorted_preds_df, inputs)
connected_df = eia_ferc1_splink_model.get_full_records(best_match_with_overwrites, inputs)

# Look at matches

In [91]:
labels_df = inputs.get_train_df().reset_index()

In [88]:
# Only reset the index if `record_id_ferc1` is still the index. When it is already a
# regular column, an unconditional `reset_index()` would insert a meaningless `index`
# column (and eventually raise on repeated runs), so the guard keeps this cell idempotent.
if "record_id_ferc1" not in best_match_df.columns:
    best_match_df = best_match_df.reset_index()

In [92]:
compare_df = labels_df.merge(best_match_df, how="left", on="record_id_ferc1", suffixes=("_true", "_pred"), indicator=True)

In [93]:
compare_df._merge.value_counts()

_merge
both          8139
left_only     1148
right_only       0
Name: count, dtype: int64

In [94]:
incorrect_matches = compare_df[compare_df.record_id_eia_true != compare_df.record_id_eia_pred]
incorrect_matches[["record_id_ferc1", "record_id_eia_true", "record_id_eia_pred", "match_probability"]].reset_index(drop=True)

Unnamed: 0,record_id_ferc1,record_id_eia_true,record_id_eia_pred,match_probability
0,f1_steam_2018_12_7_3_3,116_1972_2018_plant_operating_year_total_803,116_1973_2018_plant_operating_year_total_803,0.994537
1,f1_steam_2018_12_51_0_1,1239_st_2018_plant_prime_mover_total_5860_retired,,
2,f1_steam_2018_12_176_0_5,126_gt_2018_plant_prime_mover_total_24211,,
3,f1_steam_2018_12_176_0_1,126_1_2018_plant_unit_total_24211,126_st_2018_plant_prime_mover_total_24211,0.998481
4,f1_steam_2018_12_44_1_2,1743_1_2018_plant_unit_total_5109,1743_st_2018_plant_prime_mover_total_5109,1.000000
...,...,...,...,...
1599,f1_steam_2019_12_454_2_1,1393_5_2019_plant_unit_owned_11241,,
1600,f1_steam_2015_12_87_0_5,1404_ct_2015_plant_prime_mover_total_11241,1404_2015_plant_total_11241,0.999066
1601,f1_steam_2015_12_454_0_4,1404_ca_2015_plant_prime_mover_total_11241,1404_2015_plant_total_11241,0.999066
1602,f1_steam_2016_12_454_3_1,8056_2016_plant_total_11241,8056_st_2016_plant_prime_mover_total_11241,0.999999


In [95]:
incorrect_matches.to_parquet("incorrect_matches.parquet")

In [96]:
i = 0
ferc_id = incorrect_matches.record_id_ferc1.iloc[i]
true_eia_id = incorrect_matches.record_id_eia_true.iloc[i]
pred_eia_id = incorrect_matches.record_id_eia_pred.iloc[i]

In [97]:
rec_true = sorted_preds_df[(sorted_preds_df.record_id_r == ferc_id) & (sorted_preds_df.record_id_l == true_eia_id)]
rec_pred = sorted_preds_df[(sorted_preds_df.record_id_r == ferc_id) & (sorted_preds_df.record_id_l == pred_eia_id)]

In [98]:
# Convert to a list of record dicts inline so `rec_true` stays a DataFrame; rebinding
# it to the list would make this cell fail with an AttributeError when re-run.
linker.waterfall_chart(rec_true.to_dict(orient="records"), filter_nulls=False)

In [127]:
# Convert to a list of record dicts inline so `rec_pred` stays a DataFrame; rebinding
# it to the list would make this cell fail with an AttributeError when re-run.
linker.waterfall_chart(rec_pred.to_dict(orient="records"), filter_nulls=False)