In [1]:
%load_ext autoreload
%autoreload 3

In [2]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from splink.duckdb.linker import DuckDBLinker
from splink.duckdb.blocking_rule_library import block_on
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_level_library as cll
import splink.duckdb.comparison_template_library as ctl
from splink.comparison import Comparison
import sqlalchemy as sa
import pudl

import ferc1_eia_match

# Setup

Read in FERC1 and EIA inputs (the output of the candidate set creation step).

In [3]:
# Candidate-set parameter; it is interpolated into the input parquet filenames read below.
# NOTE(review): presumably the number of candidates kept per record during blocking — confirm.
k = 25

In [4]:
# Load the "12_22" batch of EIA and FERC1 candidate records for the chosen k.
eia_full = pd.read_parquet(f"inputs/eia_candidates_12_22_k_{k}.parquet")
ferc_full = pd.read_parquet(f"inputs/ferc_candidates_12_22_k_{k}.parquet")

In [5]:
# Append the "01_11" batch of candidates to each frame.
# NOTE: this cell is not idempotent — re-running it without first re-running the load cell
# appends the same rows a second time.
eia_full = pd.concat([eia_full, pd.read_parquet(f"inputs/eia_candidates_01_11_k_{k}.parquet")])
ferc_full = pd.concat([ferc_full, pd.read_parquet(f"inputs/ferc_candidates_01_11_k_{k}.parquet")])

In [6]:
# Index each frame by its own record ID column.
# NOTE: re-running this cell on already-indexed frames raises a KeyError because the column is gone.
eia_full = eia_full.set_index("record_id_eia")
ferc_full = ferc_full.set_index("record_id_ferc1")

In [7]:
# Columns present in both inputs. Sorted so the order is reproducible: the iteration order of a
# set of strings can change between Python processes, which made the column order vary per kernel.
shared_cols = sorted(set(eia_full.columns) & set(ferc_full.columns))

In [8]:
# Null count per shared column in the EIA candidates, fewest nulls first.
eia_full[shared_cols].isnull().sum().sort_values()

plant_id_pudl               0
report_year                 0
block_num                   0
utility_name                0
utility_id_pudl             0
plant_name                  0
capacity_mw             14160
installation_year       56749
construction_year       56749
net_generation_mwh     169322
capacity_factor        169713
fuel_type_code_pudl    193016
heat_rate_mmbtu_mwh    801316
total_mmbtu            801973
fuel_cost_per_mmbtu    811564
total_fuel_cost        817116
fuel_cost_per_mwh      823063
dtype: int64

In [9]:
# Null count per shared column in the FERC1 candidates, fewest nulls first.
ferc_full[shared_cols].isnull().sum().sort_values()

plant_id_pudl              0
report_year                0
block_num                  0
utility_name               0
utility_id_pudl            0
plant_name                 0
construction_year       1968
capacity_mw             2276
net_generation_mwh      4320
installation_year      13830
capacity_factor        14609
fuel_type_code_pudl    18609
fuel_cost_per_mwh      21546
total_fuel_cost        22159
total_mmbtu            22647
heat_rate_mmbtu_mwh    22957
fuel_cost_per_mmbtu    23229
dtype: int64

TODO: Try experimenting with using more or different columns.

In [10]:
# Columns kept as candidates for splink comparisons. Not all of them end up in the model:
# e.g. the fuel_cost_per_mmbtu comparison is commented out of the settings further down.
matching_cols = ["plant_name",
                 "utility_name",
                 "fuel_type_code_pudl",
                 "installation_year",
                 "construction_year",
                 "capacity_mw",
                 "net_generation_mwh",
                 "capacity_factor",
                 "total_mmbtu",
                 "total_fuel_cost",
                 "heat_rate_mmbtu_mwh",
                 "fuel_cost_per_mmbtu"
                ]
# retain these columns either for blocking or validation, not going to match with these
extra_cols = ["plant_id_pudl", "utility_id_pudl", "report_year", "block_num"]

In [11]:
# Keep only the matching/extra columns and expose each dataset's record ID under the
# common column name "record_id" expected by the linker settings.
ferc_df = (
    ferc_full[matching_cols + extra_cols]
    .rename_axis("record_id")
    .reset_index()
)
eia_df = (
    eia_full[matching_cols + extra_cols]
    .rename_axis("record_id")
    .reset_index()
)

In [12]:
# Round the generation-derived columns to two decimals in both datasets.
for frame in (eia_df, ferc_df):
    for rounded_col in ("net_generation_mwh", "capacity_factor"):
        frame[rounded_col] = frame[rounded_col].round(2)

In [13]:
def revert_nulls_custom_cols(df: pd.DataFrame, column_names: list[str], null_value = 0) -> pd.DataFrame:
    """Return a copy of ``df`` with a placeholder value turned back into NaN.

    Args:
        df: Input frame. It is left untouched; the replacement happens on a copy.
        column_names: Columns in which ``null_value`` should be treated as missing.
        null_value: Placeholder that stands for "missing" in those columns.

    Returns:
        A new DataFrame, identical to ``df`` except that every occurrence of
        ``null_value`` in ``column_names`` is NaN.
    """
    out = df.copy()
    if len(column_names) == 0:
        return out
    # Whole-column assignment (rather than ``.loc[:, cols] = ...``) lets integer columns
    # change dtype to float when NaN is introduced.
    out[column_names] = out[column_names].replace(null_value, np.nan)
    return out

In [14]:
# TODO (temporary, fix this): treat zero and negative net generation / capacity factor as missing.
eia_df = revert_nulls_custom_cols(
    eia_df, column_names=["net_generation_mwh", "capacity_factor"], null_value=0.0
)
ferc_df = revert_nulls_custom_cols(
    ferc_df, column_names=["net_generation_mwh", "capacity_factor"], null_value=0.0
)
for frame in (eia_df, ferc_df):
    for nonneg_col in ("net_generation_mwh", "capacity_factor"):
        frame.loc[frame[nonneg_col] < 0, nonneg_col] = np.nan

In [15]:
# Convert the year columns to datetimes so the date comparisons can measure year differences.
for frame in (ferc_df, eia_df):
    for year_col in ("installation_year", "construction_year"):
        frame[year_col] = pd.to_datetime(frame[year_col], format="%Y")

### Get training data

In [16]:
# SQLAlchemy engine for the PUDL database; its location comes from PudlPaths().pudl_db.
pudl_engine = sa.create_engine(pudl.workspace.setup.PudlPaths().pudl_db)

In [17]:
# Report-year range covered by the FERC records, as strings for the InputManager call.
start_year, end_year = str(ferc_df["report_year"].min()), str(ferc_df["report_year"].max())

In [18]:
# Training data for the report years covered by ferc_df (pairs of FERC1 and EIA record IDs).
train_full = ferc1_eia_match.inputs.InputManager(
    pudl_engine=pudl_engine,
    start_report_year=start_year,
    end_report_year=end_year,
).get_training_data()

INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> 46b412388816
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> 46b412388816
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> 46b412388816
INFO:alembic.runtime.migration:Context impl SQLiteImpl.

In [19]:
# Reshape the labelled pairs into the pairwise-label layout: EIA on the left (_l),
# FERC on the right (_r), with source_dataset values equal to the linker's table aliases.
# clerical_match_score is just a syntax quirk of the label table; the value carries no meaning.
train_df = (
    train_full[["record_id_ferc1", "record_id_eia"]]
    .rename(columns={"record_id_eia": "record_id_l", "record_id_ferc1": "record_id_r"})
    .assign(
        source_dataset_r="ferc_df",
        source_dataset_l="eia_df",
        clerical_match_score=1,
    )
)

In [20]:
# Hold out 25% of the labelled pairs. Both results are frames of labelled pairs
# (despite the y_ prefix); the fixed random_state keeps the split reproducible.
y_train, y_test = train_test_split(train_df, random_state=16, test_size=0.25)

### Create settings dict and linker

In [21]:
# Base linker settings; comparisons and blocking rules are added to this dict further down.
settings_dict = dict(
    link_type="link_only",
    unique_id_column_name="record_id",
    additional_columns_to_retain=["plant_id_pudl", "utility_id_pudl"],
)

In [55]:
# Linker over the two prepared frames. The aliases show up as the source_dataset values in the
# predictions, and are the same strings written into train_df's source_dataset_l/_r columns.
linker = DuckDBLinker([eia_df, ferc_df], input_table_aliases = ["eia_df", "ferc_df"], settings_dict=settings_dict)

In [56]:
# Register the training split of the labelled pairs under the name "training_labels"
# so later cells can refer to it by that name.
train_table = linker.register_table(y_train, "training_labels", overwrite=True)

In [57]:
# Peek at the first rows of the registered labels table.
train_table.as_pandas_dataframe(limit=5)

Unnamed: 0,record_id_r,record_id_l,source_dataset_r,source_dataset_l,clerical_match_score
0,f1_steam_2011_12_108_0_2,2326_st_2011_plant_prime_mover_total_13407,ferc_df,eia_df,1
1,f1_hydro_2013_12_134_3_1,3036_2013_plant_total_14354,ferc_df,eia_df,1
2,f1_steam_2009_12_56_1_2,621_ic_2009_plant_prime_mover_total_6452,ferc_df,eia_df,1
3,f1_steam_2005_12_56_0_4,619_2005_plant_total_6452,ferc_df,eia_df,1
4,f1_gnrt_plant_2011_12_120_0_5,1912_2011_plant_total_13781_retired,ferc_df,eia_df,1


# Data Exploration

In [25]:
# Chart the completeness of each candidate matching column.
linker.completeness_chart(cols=matching_cols)

In [26]:
# One linker per dataset, used below for the per-dataset missingness charts.
linker_eia = DuckDBLinker(eia_df)
linker_ferc = DuckDBLinker(ferc_df)

In [27]:
# Missingness per column in the FERC records.
linker_ferc.missingness_chart()

In [28]:
# Missingness per column in the EIA records.
linker_eia.missingness_chart()

Columns with higher cardinality are better for matching
- `fuel_type_code_pudl` might not be the best, high skew in that column too

In [29]:
# Profile the candidate matching columns (top_n=10, bottom_n=5 values shown per column)
# to judge cardinality and skew.
linker.profile_columns(matching_cols, top_n=10, bottom_n=5)

# Block On `report_year` and `block_num`

`splink` has tools to evaluate more complex blocking rules as well, but since we did blocking as a separate step/module, we can just block on `report_year` and `block_num` here.

From the docs: "For linkages in DuckDB on a standard laptop, we suggest using blocking rules that create no more than about 20 million comparisons."

In [30]:
# Only compare records from the same report year and the same precomputed candidate block,
# then report how many pairwise comparisons that rule generates.
blocking_rule = "l.report_year = r.report_year and l.block_num = r.block_num"
count = linker.count_num_comparisons_from_blocking_rule(blocking_rule)
print(f"Number of comparisons generated by '{blocking_rule}': {count:,.0f}")

Number of comparisons generated by 'l.report_year = r.report_year and l.block_num = r.block_num': 1,007,000


Number of comparisons is a little high for the DuckDB linker when only blocking on report year.

# Define Comparisons

[Comparison Template library](https://moj-analytical-services.github.io/splink/comparison_template_library.html)

In [31]:
# Name comparisons using Jaro-Winkler levels only (0.9 / 0.8 / 0.7): the empty
# damerau_levenshtein_thresholds list means no Damerau-Levenshtein levels are generated.
# utility_name additionally gets term-frequency adjustments.
plant_name_comparison = ctl.name_comparison("plant_name", damerau_levenshtein_thresholds=[], jaro_winkler_thresholds=[.9, .8, .7])
utility_name_comparison = ctl.name_comparison("utility_name", damerau_levenshtein_thresholds=[],jaro_winkler_thresholds=[.9, .8, .7], term_frequency_adjustments=True)

In [32]:
# Show the comparison levels generated for plant_name.
print(plant_name_comparison.human_readable_description)

Comparison 'Exact match vs. Plant_Name within jaro_winkler thresholds 0.9, 0.8, 0.7 vs. anything else' of "plant_name".
Similarity is assessed using the following ComparisonLevels:
    - 'Null' with SQL rule: "plant_name_l" IS NULL OR "plant_name_r" IS NULL
    - 'Exact match plant_name' with SQL rule: "plant_name_l" = "plant_name_r"
    - 'Jaro_winkler_similarity >= 0.9' with SQL rule: jaro_winkler_similarity("plant_name_l", "plant_name_r") >= 0.9
    - 'Jaro_winkler_similarity >= 0.8' with SQL rule: jaro_winkler_similarity("plant_name_l", "plant_name_r") >= 0.8
    - 'Jaro_winkler_similarity >= 0.7' with SQL rule: jaro_winkler_similarity("plant_name_l", "plant_name_r") >= 0.7
    - 'All other comparisons' with SQL rule: ELSE



In [92]:
def _percentage_difference_comparison(column_name, thresholds, exact_tolerance=1e-4):
    """Build a splink comparison dict that buckets a numeric column by percentage difference.

    Levels, in order: null (either side missing), "effectively equal" (difference below
    ``exact_tolerance``), one level per entry in ``thresholds``, then the catch-all else level.

    Args:
        column_name: Column compared on both sides; also used as the output column name
            and for the null level, so the two can never drift apart.
        thresholds: Percentage-difference cut-offs as fractions (0.05 means 5%), smallest first.
        exact_tolerance: Cut-off for the "0% different" level. The generated SQL rule is a
            strict ``<``, so a cut-off of exactly 0 would never match; a dedicated
            exact-match level could be added instead.

    Returns:
        A dict with ``output_column_name``, ``comparison_levels`` and ``comparison_description``.
    """
    levels = [cll.null_level(column_name)]
    levels.extend(
        cll.percentage_difference_level(column_name, cutoff)
        for cutoff in [exact_tolerance, *thresholds]
    )
    levels.append(cll.else_level())
    description = " vs. ".join(
        ["0% different", *[f"{cutoff:.0%} different" for cutoff in thresholds], "anything else"]
    )
    return {
        "output_column_name": column_name,
        "comparison_levels": levels,
        "comparison_description": description,
    }


capacity_comparison = _percentage_difference_comparison("capacity_mw", [0.05, 0.1, 0.2])
net_gen_comparison = _percentage_difference_comparison("net_generation_mwh", [0.01, 0.1, 0.2])
capacity_factor_comparison = _percentage_difference_comparison("capacity_factor", [0.05, 0.1])
total_mmbtu_comparison = _percentage_difference_comparison("total_mmbtu", [0.01, 0.1, 0.2])
total_fuel_cost_comparison = _percentage_difference_comparison("total_fuel_cost", [0.01, 0.1, 0.2])
heat_rate_comparison = _percentage_difference_comparison("heat_rate_mmbtu_mwh", [0.01, 0.1])
# Previously the null level of this comparison was built on "total_fuel_cost", so missing
# fuel_cost_per_mmbtu values were not routed to the null level.
fuel_cost_comparison = _percentage_difference_comparison("fuel_cost_per_mmbtu", [0.01, 0.1, 0.2])

In [93]:
# Show the comparison levels (and their SQL rules) generated for net_generation_mwh.
print(Comparison(net_gen_comparison).human_readable_description)

Comparison '0% different vs. 1% different vs. 10% different vs. 20% different vs. anything else' of "net_generation_mwh".
Similarity is assessed using the following ComparisonLevels:
    - 'Null' with SQL rule: "net_generation_mwh_l" IS NULL OR "net_generation_mwh_r" IS NULL
    - '< 0.01% diff' with SQL rule: (abs("net_generation_mwh_l" - "net_generation_mwh_r")/
            (case
                when "net_generation_mwh_r" > "net_generation_mwh_l"
                then "net_generation_mwh_r"
                else "net_generation_mwh_l"
            end))
            < 0.0001
    - '< 1.00% diff' with SQL rule: (abs("net_generation_mwh_l" - "net_generation_mwh_r")/
            (case
                when "net_generation_mwh_r" > "net_generation_mwh_l"
                then "net_generation_mwh_r"
                else "net_generation_mwh_l"
            end))
            < 0.01
    - '< 10.00% diff' with SQL rule: (abs("net_generation_mwh_l" - "net_generation_mwh_r")/
            (case
      

In [94]:
def get_date_comparison(column_name):
    """Return a date comparison for ``column_name`` with exact, within-1-year and within-2-year levels.

    No Damerau-Levenshtein levels are requested (empty threshold list). The columns are
    already datetimes, so no ``date_format`` is passed.
    """
    year_thresholds = [1, 2]
    return ctl.date_comparison(
        column_name,
        damerau_levenshtein_thresholds=[],
        datediff_thresholds=year_thresholds,
        datediff_metrics=["year"] * len(year_thresholds),
    )


construction_year_comparison = get_date_comparison("construction_year")
installation_year_comparison = get_date_comparison("installation_year")

In [95]:
# Show the comparison levels generated for installation_year.
print(installation_year_comparison.human_readable_description)

Comparison 'Exact match vs. Dates within the following thresholds Year(s): 1, Year(s): 2 vs. anything else' of "installation_year".
Similarity is assessed using the following ComparisonLevels:
    - 'Null' with SQL rule: "installation_year_l" IS NULL OR "installation_year_r" IS NULL
    - 'Exact match' with SQL rule: "installation_year_l" = "installation_year_r"
    - 'Within 1 year' with SQL rule: 
            abs(date_diff('year', "installation_year_l",
              "installation_year_r")) <= 1
        
    - 'Within 2 years' with SQL rule: 
            abs(date_diff('year', "installation_year_l",
              "installation_year_r")) <= 2
        
    - 'All other comparisons' with SQL rule: ELSE



In [96]:
# Complete the model specification in place: comparisons, prediction blocking rule, and prior.
settings_dict.update(
    comparisons=[
        plant_name_comparison,
        utility_name_comparison,
        construction_year_comparison,
        installation_year_comparison,
        capacity_comparison,
        cl.exact_match("fuel_type_code_pudl", term_frequency_adjustments=True),
        net_gen_comparison,
        capacity_factor_comparison,
        total_mmbtu_comparison,
        total_fuel_cost_comparison,
        heat_rate_comparison,
        # fuel_cost_comparison is currently not included
    ],
    blocking_rules_to_generate_predictions=[blocking_rule],
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
    # Prior of 1/n, where n is the number of EIA records (derivation in the markdown below).
    probability_two_random_records_match=1 / len(eia_df),
)

Explanation of probability two random records match calculation:

The EIA dataset has n records and the FERC dataset has m records, where n > m. Each FERC record matches to one EIA record, so there are n - m EIA records that don't have a match.

- If I choose a FERC record first then I have a 1/n chance of choosing the matching EIA record
- If I choose an EIA record first then I have an m/n chance of choosing an EIA record that has a FERC match, and then a 1/m chance of choosing the correct matching FERC record. So the probability of choosing two matching records is m/n * 1/m = 1/n

In either case, the probability is 1/n.

In [97]:
# Load the completed settings (comparisons, blocking rule, prior) into the existing linker.
linker.load_settings(settings_dict)

# Estimate Model Parameters

Now that we have specified our linkage model, we need to estimate the probability_two_random_records_match (if not specified in settings dictionary), u, and m parameters.

In [98]:
# Experiment: estimate probability_two_random_records_match from a deterministic rule
# (both names with Jaro-Winkler similarity >= 0.9) instead of using the 1/n prior.
# This gave a much higher match probability that seems wrong, so the estimation call is
# left commented out and deterministic_rules is unused in the cells shown here.
deterministic_rules = [
    "jaro_winkler_similarity(l.plant_name, r.plant_name) >= 0.9 and jaro_winkler_similarity(l.utility_name, r.utility_name) >= 0.9"
]

# linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)


In [99]:
%%time
# Estimate the u probabilities from randomly sampled record pairs (max_pairs=1e7).
# NOTE(review): no seed is passed, so the sampled pairs — and the estimated u values —
# may differ between runs; confirm whether reproducibility matters here.
linker.estimate_u_using_random_sampling(max_pairs=1e7)

INFO:splink.estimate_u:----- Estimating u probabilities using random sampling -----


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.estimate_u:
Estimated u probabilities using random sampling
INFO:splink.settings:
Your model is not yet fully trained. Missing estimates for:
    - plant_name (no m values are trained).
    - utility_name (no m values are trained).
    - construction_year (no m values are trained).
    - installation_year (no m values are trained).
    - capacity_mw (no m values are trained).
    - fuel_type_code_pudl (no m values are trained).
    - net_generation_mwh (no m values are trained).
    - capacity_factor (no m values are trained).
    - total_mmbtu (no m values are trained).
    - total_fuel_cost (no m values are trained).
    - heat_rate_mmbtu_mwh (no m values are trained).


CPU times: user 35.8 s, sys: 1.52 s, total: 37.3 s
Wall time: 29.7 s


We can estimate m either from training labels (supervised) or without labels, using Expectation Maximisation.

In [54]:
# Supervised option: estimate m probabilities from the registered "training_labels" pairs.
# NOTE(review): this cell's execution count (54) is older than the cells around it, and its
# saved log mentions a `kwh_to_mwh_conversion` level that none of the comparisons defined in
# this notebook contain, so the output looks stale — confirm whether this step is still used.
linker.estimate_m_from_pairwise_labels("training_labels")

INFO:splink.m_u_records_to_parameters:m probability not trained for net_generation_mwh - kwh_to_mwh_conversion (comparison vector value: 1). This usually means the comparison level was never observed in the training data.


In [100]:
# EM session 1: block on exact plant_name. Parameters are estimated for every comparison
# except plant_name, which cannot be estimated while it is used in the blocking rule.
training_blocking_rule_1 = "l.plant_name = r.plant_name"
training_session_1 = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule_1)

INFO:splink.em_training_session:
----- Starting EM training session -----

INFO:splink.em_training_session:Estimating the m probabilities of the model by blocking on:
l.plant_name = r.plant_name

Parameter estimates will be made for the following comparison(s):
    - utility_name
    - construction_year
    - installation_year
    - capacity_mw
    - fuel_type_code_pudl
    - net_generation_mwh
    - capacity_factor
    - total_mmbtu
    - total_fuel_cost
    - heat_rate_mmbtu_mwh

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - plant_name


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 1: Largest change in params was -0.949 in the m_probability of total_fuel_cost, level `< 0.01% diff`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 2: Largest change in params was 0.592 in the m_probability of total_mmbtu, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 3: Largest change in params was 0.245 in probability_two_random_records_match


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 4: Largest change in params was 0.0975 in probability_two_random_records_match


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 5: Largest change in params was 0.0399 in probability_two_random_records_match


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 6: Largest change in params was 0.018 in probability_two_random_records_match


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 7: Largest change in params was 0.00866 in probability_two_random_records_match


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 8: Largest change in params was 0.00432 in probability_two_random_records_match


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 9: Largest change in params was 0.0022 in probability_two_random_records_match


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 10: Largest change in params was 0.00112 in probability_two_random_records_match


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 11: Largest change in params was 0.000579 in probability_two_random_records_match


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 12: Largest change in params was 0.000299 in probability_two_random_records_match


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 13: Largest change in params was 0.000155 in probability_two_random_records_match


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 14: Largest change in params was 8e-05 in probability_two_random_records_match
INFO:splink.expectation_maximisation:
EM converged after 14 iterations
INFO:splink.settings:
Your model is not yet fully trained. Missing estimates for:
    - plant_name (no m values are trained).


In [101]:
# EM session 2: block on exact utility_name and net_generation_mwh, which lets plant_name be
# estimated; the two blocking columns themselves are excluded from this session.
training_session_2 = linker.estimate_parameters_using_expectation_maximisation(block_on(["utility_name", "net_generation_mwh"]))

INFO:splink.em_training_session:
----- Starting EM training session -----

INFO:splink.em_training_session:Estimating the m probabilities of the model by blocking on:
(l."utility_name" = r."utility_name") AND (l."net_generation_mwh" = r."net_generation_mwh")

Parameter estimates will be made for the following comparison(s):
    - plant_name
    - construction_year
    - installation_year
    - capacity_mw
    - fuel_type_code_pudl
    - capacity_factor
    - total_mmbtu
    - total_fuel_cost
    - heat_rate_mmbtu_mwh

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - utility_name
    - net_generation_mwh
INFO:splink.expectation_maximisation:
Level < 0.01% diff on comparison total_fuel_cost not observed in dataset, unable to train m value

INFO:splink.expectation_maximisation:Iteration 1: Largest change in params was 0.95 in the m_probability of total_fuel_cost, level `< 0.01% diff`
INFO:splink.expectation_maximisation:I

In [102]:
# EM session 3: block on exact capacity_mw and fuel_type_code_pudl so that utility_name and
# net_generation_mwh (excluded from session 2) are estimated too.
# block_on is already imported in the notebook's import cell, so it is not re-imported here.
training_session_3 = linker.estimate_parameters_using_expectation_maximisation(block_on(["capacity_mw", "fuel_type_code_pudl"]))

INFO:splink.em_training_session:
----- Starting EM training session -----

INFO:splink.em_training_session:Estimating the m probabilities of the model by blocking on:
(l."capacity_mw" = r."capacity_mw") AND (l."fuel_type_code_pudl" = r."fuel_type_code_pudl")

Parameter estimates will be made for the following comparison(s):
    - plant_name
    - utility_name
    - construction_year
    - installation_year
    - net_generation_mwh
    - capacity_factor
    - total_mmbtu
    - total_fuel_cost
    - heat_rate_mmbtu_mwh

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - capacity_mw
    - fuel_type_code_pudl


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 1: Largest change in params was -0.328 in the m_probability of net_generation_mwh, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 2: Largest change in params was 0.184 in the m_probability of plant_name, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 3: Largest change in params was 0.166 in the m_probability of plant_name, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 4: Largest change in params was 0.0916 in the m_probability of plant_name, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 5: Largest change in params was 0.0522 in the m_probability of utility_name, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 6: Largest change in params was 0.0282 in the m_probability of utility_name, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 7: Largest change in params was 0.0157 in the m_probability of utility_name, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 8: Largest change in params was 0.00858 in the m_probability of utility_name, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 9: Largest change in params was 0.00449 in the m_probability of utility_name, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 10: Largest change in params was 0.0023 in the m_probability of utility_name, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 11: Largest change in params was 0.00118 in the m_probability of utility_name, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 12: Largest change in params was -0.000617 in the m_probability of construction_year, level `Exact match`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 13: Largest change in params was 0.000354 in the m_probability of construction_year, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 14: Largest change in params was 0.000208 in the m_probability of construction_year, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 15: Largest change in params was 0.000121 in the m_probability of construction_year, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.expectation_maximisation:Iteration 16: Largest change in params was 6.95e-05 in the m_probability of construction_year, level `All other comparisons`
INFO:splink.expectation_maximisation:
EM converged after 16 iterations
INFO:splink.settings:
Your model is fully trained. All comparisons have at least one estimate for their m and u values


In [103]:
# Chart the match weight of every comparison level in the trained model.
linker.match_weights_chart()

In [104]:
# Chart the estimated m and u probabilities per comparison level.
linker.m_u_parameters_chart()

In [None]:
# How to read the chart, e.g.: "a match threshold of 70% will include 94% of records".
# NOTE(review): this cell has no execution count, so it was not run in the saved notebook.
linker.unlinkables_chart()

In [105]:
# Label that is interpolated into the saved model settings filename in the next cell.
n = "unsupervised"

In [106]:
# Save the trained model settings to JSON, overwriting any existing file with the same name.
settings = linker.save_model_to_json(f"./splink_model_settings/model_settings_{n}.json", overwrite=True)

# Make Predictions

In [144]:
# Score the blocked pairs, keeping only those that clear a 0.5 match-probability threshold.
# The commented-out lines are alternatives: a match-weight threshold, or no threshold at all.
df_preds = linker.predict(threshold_match_probability=0.5)
# df_preds = linker.predict(threshold_match_weight=0)
# df_preds = linker.predict()

In [145]:
# Number of predicted pairs that cleared the threshold.
len(df_preds.as_pandas_dataframe())

87490

In [146]:
# Materialise the predictions as a pandas frame, highest match probability first.
sorted_preds_df = df_preds.as_pandas_dataframe().sort_values(by="match_probability", ascending=False)

In [147]:
# Inspect the three most confident predicted pairs.
sorted_preds_df.head(3)

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,plant_name_l,plant_name_r,gamma_plant_name,bf_plant_name,utility_name_l,utility_name_r,gamma_utility_name,tf_utility_name_l,tf_utility_name_r,bf_utility_name,bf_tf_adj_utility_name,construction_year_l,construction_year_r,gamma_construction_year,bf_construction_year,installation_year_l,installation_year_r,gamma_installation_year,bf_installation_year,capacity_mw_l,capacity_mw_r,gamma_capacity_mw,bf_capacity_mw,fuel_type_code_pudl_l,fuel_type_code_pudl_r,gamma_fuel_type_code_pudl,tf_fuel_type_code_pudl_l,tf_fuel_type_code_pudl_r,bf_fuel_type_code_pudl,bf_tf_adj_fuel_type_code_pudl,net_generation_mwh_l,net_generation_mwh_r,gamma_net_generation_mwh,bf_net_generation_mwh,capacity_factor_l,capacity_factor_r,gamma_capacity_factor,bf_capacity_factor,total_mmbtu_l,total_mmbtu_r,gamma_total_mmbtu,bf_total_mmbtu,total_fuel_cost_l,total_fuel_cost_r,gamma_total_fuel_cost,bf_total_fuel_cost,heat_rate_mmbtu_mwh_l,heat_rate_mmbtu_mwh_r,gamma_heat_rate_mmbtu_mwh,bf_heat_rate_mmbtu_mwh,plant_id_pudl_l,plant_id_pudl_r,utility_id_pudl_l,utility_id_pudl_r,block_num_l,block_num_r,report_year_l,report_year_r
78339,41.967055,1.0,eia_df,ferc_df,2876_2017_plant_total_14015,f1_steam_2017_12_128_0_1,kyger creek,kyger creek,4,916.78608,ohio valley electric corporation,ohio valley electric corporation,4,0.000307,0.000307,37.981468,38.776098,1955-01-01,1955-01-01,3,48.288857,1955-01-01,1955-01-01,3,33.943948,1086.5,1086.3,3,18.640605,coal,coal,1,0.229308,0.229308,3.931635,1.01253,5899969.01,5899936.0,4,441.172839,0.62,0.62,3,8.182294,61975315.0,61979600.0,4,189.392708,115169500.0,115456200.0,3,6.44967,10.518016,10.505132,2,5.977678,306,306,236,236,4024,4024,2017,2017
81084,41.128491,1.0,eia_df,ferc_df,6139_2013_plant_total_17698,f1_steam_2013_12_164_1_1,welsh,welsh,4,916.78608,southwestern electric power company,southwestern electric power company,4,0.004596,0.004596,37.981468,2.586147,1977-01-01,1977-01-01,3,48.288857,1982-01-01,1982-01-01,3,33.943948,1674.0,1674.0,4,177.081689,coal,coal,1,0.229308,0.229308,3.931635,1.01253,8752974.01,8753593.0,4,441.172839,0.6,0.6,3,8.182294,96273274.0,96268330.0,4,189.392708,213754400.0,209244200.0,2,5.692482,10.999068,10.997579,2,5.977678,620,620,301,301,3483,3483,2013,2013
81043,41.128491,1.0,eia_df,ferc_df,6139_2014_plant_total_17698,f1_steam_2014_12_164_1_1,welsh,welsh,4,916.78608,southwestern electric power company,southwestern electric power company,4,0.004596,0.004596,37.981468,2.586147,1977-01-01,1977-01-01,3,48.288857,1982-01-01,1982-01-01,3,33.943948,1674.0,1674.0,4,177.081689,coal,coal,1,0.229308,0.229308,3.931635,1.01253,7982240.0,7982240.0,4,441.172839,0.54,0.54,3,8.182294,88442388.0,88449800.0,4,189.392708,196015800.0,199051100.0,2,5.692482,11.085574,11.080824,2,5.977678,620,620,301,301,3484,3484,2014,2014


In [148]:
# Keep only the single best candidate for each FERC record (`record_id_r`).
# `sorted_preds_df` is ordered best-first, so the first row seen per record wins.
#
# `drop_duplicates(keep="first")` keeps that winning row intact.
# `groupby(...).first()` would instead take the first NON-NULL value of each
# column independently, so a null in the best pair (e.g. a missing capacity)
# would be silently filled from a lower-ranked pair.
#
# The result is indexed by `record_id_r` and sorted on it, matching the layout
# the groupby version produced.
one_to_one_preds = (
    sorted_preds_df
    .drop_duplicates(subset="record_id_r", keep="first")
    .set_index("record_id_r")
    .sort_index()
)

In [149]:
# Fields the splink model compares between the two record sets.
# `fuel_cost_per_mmbtu` is deliberately left out of this list.
matching_cols_used = [
    # descriptive attributes
    "plant_name",
    "utility_name",
    "fuel_type_code_pudl",
    # dates
    "installation_year",
    "construction_year",
    # numeric attributes
    "capacity_mw",
    "net_generation_mwh",
    "capacity_factor",
    "total_mmbtu",
    "total_fuel_cost",
    "heat_rate_mmbtu_mwh",
]

In [150]:
# Columns kept for reviewing the one-to-one predictions: both sides (_l / _r)
# of every compared field, sorted so each field's pair sits side by side,
# framed by the match scores in front and the PUDL ids at the end.
extra_cols = ["plant_id_pudl_l", "plant_id_pudl_r", "utility_id_pudl_l", "utility_id_pudl_r"]
cols = sorted(f"{col}{side}" for col in matching_cols_used for side in ("_l", "_r"))
cols = ["record_id_l", "match_weight", "match_probability"] + cols + extra_cols

# Make the cell safe to re-run. `record_id_r` starts out as the index; only
# move it into the columns if it is not there yet, then select it explicitly.
# The previous `one_to_one_preds[cols].reset_index()` dropped `record_id_r` on
# a second run and broke the later merge on that key.
if "record_id_r" not in one_to_one_preds.columns:
    one_to_one_preds = one_to_one_preds.reset_index()
one_to_one_preds = one_to_one_preds[["record_id_r"] + cols]

In [151]:
# Labelled pairs used to score the predictions.
# NOTE(review): despite the `test_` name this is a copy of `train_df`; the
# alternative left here during development was `y_test.copy()`.
test_labels_df = train_df.copy()

In [152]:
# Total number of labelled records; used as the denominator of the accuracy figure.
n_test_records = len(test_labels_df)

In [153]:
# Left-join the one-to-one predictions onto the labels by FERC record id.
# Labelled records with no prediction above the match threshold come out as
# "left_only" in the `_merge` indicator column. Column names present on both
# sides get a `_true` (label) or `_pred` (prediction) suffix.
predicted_test_matches = pd.merge(
    test_labels_df,
    one_to_one_preds,
    on=["record_id_r"],
    how="left",
    suffixes=("_true", "_pred"),
    indicator=True,
)

In [154]:
# "both" = labelled FERC records that received a prediction above the match
# threshold; "left_only" = labelled records with no such prediction.
predicted_test_matches["_merge"].value_counts()

_merge
both          8245
left_only     1042
right_only       0
Name: count, dtype: int64

In [155]:
# Share of labelled FERC records whose predicted EIA record id equals the
# labelled one. Records without any prediction count as misses.
correct_filter = predicted_test_matches["record_id_l_true"].eq(
    predicted_test_matches["record_id_l_pred"]
)
correct_matches = predicted_test_matches.loc[correct_filter]
len(correct_matches) / n_test_records

0.829977387746312

In [156]:
# Save every scored pair (not only the one-to-one picks) for later inspection.
# NOTE(review): the filename hard-codes the run label and the 0.5 threshold —
# keep it in sync with `n` and the `linker.predict` call if either changes.
sorted_preds_df.to_parquet("unsupervised_preds_match_threshold_.5.parquet")

## Look at incorrect matches

In [527]:
# Labelled records whose prediction is missing or differs from the label,
# trimmed to the ids and scores needed to review them.
incorrect_matches = predicted_test_matches.loc[
    ~correct_filter,
    [
        "record_id_r",
        "record_id_l_true",
        "record_id_l_pred",
        "match_weight",
        "match_probability",
    ],
]
incorrect_matches

Unnamed: 0,record_id_r,record_id_l_true,record_id_l_pred,match_weight,match_probability
14,f1_steam_2017_12_17_1_2,2713_2017_plant_total_3046,58212_11_2017_plant_gen_total_5416_retired,-21.215227,4.107524e-07
21,f1_steam_2019_12_454_3_2,6462_2019_plant_total_11241,6595_1_2019_plant_gen_total_16524,-30.328579,7.416312e-10
37,f1_steam_2010_12_17_4_1,2706_gt_2010_plant_prime_mover_total_3046,2706_2010_plant_total_3046,-14.187214,5.360430e-05
40,f1_steam_2020_12_56_0_1,6045_2020_plant_owned_6452,6045_2020_plant_total_6452,22.586408,9.999998e-01
48,f1_steam_2006_12_57_6_4,6124_ct1_2006_plant_gen_total_7140,56150_10st_2006_plant_gen_total_7140,-10.003852,9.730111e-04
...,...,...,...,...,...
2276,f1_steam_2015_12_56_2_1,6045_2015_plant_owned_6452,6045_2015_plant_total_6452,19.418282,9.999986e-01
2280,f1_steam_2014_12_159_4_5,3298_2_2014_plant_gen_total_17554,3298_1_2014_plant_gen_total_17554,13.435007,9.999097e-01
2297,f1_gnrt_plant_2008_12_144_0_3,1010_ic_2008_plant_prime_mover_total_15470,1010_2008_plant_owned_40211,-20.073615,9.062321e-07
2311,f1_steam_2019_12_182_1_3,2098_1_2019_plant_unit_total_56211,2098_gt_2019_plant_prime_mover_total_56211,-7.331470,6.170487e-03


In [528]:
# Positional index (into `incorrect_matches`) of the mismatch inspected in the cells below.
i = 3

In [529]:
# (FERC record id, labelled EIA record id, predicted EIA record id) for the inspected mismatch
tuple(
    incorrect_matches.iloc[i][
        ["record_id_r", "record_id_l_true", "record_id_l_pred"]
    ]
)

('f1_steam_2020_12_56_0_1',
 '6045_2020_plant_owned_6452',
 '6045_2020_plant_total_6452')

In [530]:
# Waterfall chart for the LABELLED pairing of the inspected FERC record:
# pick the scored pair whose ids match the label, as a list of record dicts.
rec_true = sorted_preds_df.loc[
    sorted_preds_df["record_id_r"].eq(incorrect_matches.iloc[i]["record_id_r"])
    & sorted_preds_df["record_id_l"].eq(incorrect_matches.iloc[i]["record_id_l_true"])
].to_dict(orient="records")
linker.waterfall_chart(rec_true, filter_nulls=False)

In [531]:
# Waterfall chart for the PREDICTED pairing of the inspected FERC record:
# pick the scored pair whose ids match the prediction, as a list of record dicts.
rec_pred = sorted_preds_df.loc[
    sorted_preds_df["record_id_r"].eq(incorrect_matches.iloc[i]["record_id_r"])
    & sorted_preds_df["record_id_l"].eq(incorrect_matches.iloc[i]["record_id_l_pred"])
].to_dict(orient="records")
linker.waterfall_chart(rec_pred, filter_nulls=False)

In [532]:
# Labeller's note (if any) on the first training row for the inspected FERC record
train_full.loc[
    train_full["record_id_ferc1"] == incorrect_matches.iloc[i]["record_id_r"],
    "notes",
].iloc[0]

'capacity matches total, the generation is closer to owned'

In [47]:
# Ten most frequent labeller notes. `value_counts` already skips missing
# values, so no explicit not-null filter is needed.
train_full["notes"].value_counts().head(10)

notes
ferc mwh vs kwh issue with net gen                                                                                                                                                                                                                            365
ferc units off                                                                                                                                                                                                                                                 89
In 2020 FERC This record swaps from South Carolina Electric & Gas Company (pudl id 292) to South Carolina Generating Company, Inc. (pudl id 293) I think to reflect Dominion buying it in 2019? EIA always reports it as 293 so it looks wrong until 2020.     61
wrong utility, right plant                                                                                                                                                                                                  

Do `utility_id_pudl` and `plant_id_pudl` generally match up?

In [382]:
# Keep only the predictions where both sides carry a utility id and a plant id,
# so the agreement checks below compare real values rather than nulls.
consistent_id_df = one_to_one_preds.dropna(
    subset=[
        f"{id_col}_{side}"
        for id_col in ("utility_id_pudl", "plant_id_pudl")
        for side in ("l", "r")
    ]
)

In [383]:
# How often the two sides of a predicted pair agree on the PUDL plant id
consistent_id_df["plant_id_pudl_l"].eq(consistent_id_df["plant_id_pudl_r"]).value_counts()

True     32131
False     8149
Name: count, dtype: int64

In [384]:
# How often the two sides of a predicted pair agree on the PUDL utility id
consistent_id_df["utility_id_pudl_l"].eq(consistent_id_df["utility_id_pudl_r"]).value_counts()

True     34658
False     5622
Name: count, dtype: int64