In [1]:
%load_ext autoreload
%autoreload 3

In [2]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from splink.duckdb.linker import DuckDBLinker
from splink.duckdb.blocking_rule_library import block_on
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_level_library as cll
import splink.duckdb.comparison_template_library as ctl
from splink.comparison import Comparison
import sqlalchemy as sa
import pudl
from pudl.etl import defs
from pudl.analysis.record_linkage import embed_dataframe

import ferc1_eia_match

In [7]:
# SQLAlchemy engine pointing at the database location reported by PudlPaths().pudl_db.
# It is handed to ferc1_eia_match.inputs.InputManager further down.
pudl_engine = sa.create_engine(pudl.workspace.setup.PudlPaths().pudl_db)

# Setup from PUDL

In [3]:
# Load the three named assets through `defs.load_asset_value`; each variable is
# named after the asset it holds. They are the inputs to the InputManager below.
out_ferc1__yearly_all_plants = defs.load_asset_value("out_ferc1__yearly_all_plants")
out_ferc1__yearly_steam_plants_fuel_by_plant_sched402 = defs.load_asset_value("out_ferc1__yearly_steam_plants_fuel_by_plant_sched402")
out_eia__yearly_plant_parts = defs.load_asset_value("out_eia__yearly_plant_parts")

INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> 46b412388816
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> 46b412388816
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> 46b412388816
INFO:alembic.runtime.migration:Context impl SQLiteImpl.

In [4]:
# Build the PUDL record-linkage input manager from the three loaded assets,
# then run its preparation step.
inputs = pudl.analysis.record_linkage.eia_ferc1_record_linkage.InputManager(
    out_ferc1__yearly_all_plants,
    out_ferc1__yearly_steam_plants_fuel_by_plant_sched402,
    out_eia__yearly_plant_parts,
)
inputs.execute()

2024-01-25 17:59:52 [    INFO] catalystcoop.pudl.analysis.record_linkage.eia_ferc1_record_linkage:410 Preparing the FERC1 tables.
INFO:catalystcoop.pudl.analysis.record_linkage.eia_ferc1_record_linkage:Preparing the FERC1 tables.
2024-01-25 18:01:12 [    INFO] catalystcoop.pudl.analysis.record_linkage.eia_ferc1_record_linkage:818 Updating 77 training records with 1:m plant parts.
INFO:catalystcoop.pudl.analysis.record_linkage.eia_ferc1_record_linkage:Updating 77 training records with 1:m plant parts.
2024-01-25 18:01:12 [    INFO] catalystcoop.pudl.analysis.record_linkage.eia_ferc1_record_linkage:715 Restricting training data on years: 2001|2002|2003|2004|2005|2006|2007|2008|2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019|2020|2021|2022|2023
INFO:catalystcoop.pudl.analysis.record_linkage.eia_ferc1_record_linkage:Restricting training data on years: 2001|2002|2003|2004|2005|2006|2007|2008|2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019|2020|2021|2022|2023


In [5]:
# Pull the two prepared tables out of the input manager.
# NOTE(review): judging by the method names these are the FERC1 plants table and
# the "true granularity" EIA plant parts — confirm against InputManager.
ferc_full = inputs.get_plants_ferc1()
eia_full = inputs.get_plant_parts_eia_true()

In [8]:
# all very temp
# Run the utility-name and plant-name cleaners from the ferc1_eia_match package
# over both frames.
# NOTE(review): from the arguments, get_clean_df appears to read the second
# argument (source column) and write the third (cleaned column) — confirm.
cleaners = ferc1_eia_match.inputs.InputManager(pudl_engine)
ferc_full = cleaners.utility_name_cleaner.get_clean_df(ferc_full, "utility_name_ferc1", "utility_name")
ferc_full = cleaners.plant_name_cleaner.get_clean_df(ferc_full, "plant_name_ferc1", "plant_name")
eia_full = cleaners.utility_name_cleaner.get_clean_df(eia_full, "utility_name_eia", "utility_name")
eia_full = cleaners.plant_name_cleaner.get_clean_df(eia_full, "plant_name_eia", "plant_name")



In [9]:
# Apply the same fuel-type fill step to both frames.
# NOTE(review): presumably fills missing fuel_type_code_pudl values using the
# plant name — inferred from the method name only; confirm.
eia_full = cleaners.fill_fuel_type_from_name(eia_full)
ferc_full = cleaners.fill_fuel_type_from_name(ferc_full)

# Setup

Read in the FERC1 and EIA inputs (the output of the candidate set creation step).

In [3]:
# Value interpolated into the candidate-set parquet file names read below
# (e.g. "inputs/eia_candidates_12_22_k_{k}.parquet").
k = 25

In [4]:
# Read the "12_22" EIA and FERC candidate files for the chosen k.
# NOTE: this rebinds eia_full / ferc_full, replacing whatever the PUDL setup
# section above produced.
eia_full = pd.read_parquet(f"inputs/eia_candidates_12_22_k_{k}.parquet")
ferc_full = pd.read_parquet(f"inputs/ferc_candidates_12_22_k_{k}.parquet")

In [5]:
# Append the "01_11" candidate files to the frames loaded above.
# Not idempotent: re-running this cell appends the same rows a second time.
eia_full = pd.concat([eia_full, pd.read_parquet(f"inputs/eia_candidates_01_11_k_{k}.parquet")])
ferc_full = pd.concat([ferc_full, pd.read_parquet(f"inputs/ferc_candidates_01_11_k_{k}.parquet")])

In [6]:
# Index each frame by its record ID column. Not idempotent: a second run raises
# KeyError because the column has already been moved into the index.
eia_full = eia_full.set_index("record_id_eia")
ferc_full = ferc_full.set_index("record_id_ferc1")

In [7]:
shared_cols = list(set(eia_full.columns) & set(ferc_full.columns))

In [8]:
eia_full[shared_cols].isnull().sum().sort_values()

utility_name                0
plant_name                  0
report_year                 0
utility_id_pudl             0
plant_id_pudl               0
block_num                   0
capacity_mw             14160
construction_year       56749
installation_year       56749
net_generation_mwh     169322
capacity_factor        169713
fuel_type_code_pudl    193016
heat_rate_mmbtu_mwh    801316
total_mmbtu            801973
fuel_cost_per_mmbtu    811564
total_fuel_cost        817116
fuel_cost_per_mwh      823063
dtype: int64

In [9]:
ferc_full[shared_cols].isnull().sum().sort_values()

utility_name               0
plant_name                 0
report_year                0
utility_id_pudl            0
plant_id_pudl              0
block_num                  0
construction_year       1968
capacity_mw             2276
net_generation_mwh      4320
installation_year      13830
capacity_factor        14609
fuel_type_code_pudl    18609
fuel_cost_per_mwh      21546
total_fuel_cost        22159
total_mmbtu            22647
heat_rate_mmbtu_mwh    22957
fuel_cost_per_mmbtu    23229
dtype: int64

In [13]:
def revert_nulls_custom_cols(df: pd.DataFrame, column_names: list[str], null_value = 0):
    """Turn a sentinel value back into NaN in the selected columns.

    Args:
        df: Frame to update. It is modified in place.
        column_names: Columns in which ``null_value`` is replaced.
        null_value: Sentinel that stands for "missing" in those columns.

    Returns:
        The same ``df`` object that was passed in, after modification.
    """
    restored = df.loc[:, column_names].replace(null_value, np.nan)
    df.loc[:, column_names] = restored
    return df

In [14]:
# temp, fix this
# Treat 0.0 as "missing" for net generation and capacity factor, then also blank
# out negative values in those two columns.
# revert_nulls_custom_cols modifies and returns the frame it is given, so eia_df /
# ferc_df are the same objects as eia_full / ferc_full here: the assignments below
# change the *_full frames too.
eia_df = revert_nulls_custom_cols(eia_full, column_names=["net_generation_mwh", "capacity_factor"], null_value=0.0)
ferc_df = revert_nulls_custom_cols(ferc_full, column_names=["net_generation_mwh", "capacity_factor"], null_value=0.0)
eia_df.loc[eia_df.net_generation_mwh < 0, "net_generation_mwh"] = np.nan
ferc_df.loc[ferc_df.net_generation_mwh < 0, "net_generation_mwh"] = np.nan
eia_df.loc[eia_df.capacity_factor < 0, "capacity_factor"] = np.nan
ferc_df.loc[ferc_df.capacity_factor < 0, "capacity_factor"] = np.nan

# Splink cleaning and standardizing

In [10]:
# not all used
# Candidate columns for Splink comparisons; the comparisons actually enabled are
# listed in the settings update further down.
# NOTE(review): the heat rate column is "unit_heat_rate_mmbtu_per_mwh" here, but
# heat_rate_comparison and the parquet null-count outputs use
# "heat_rate_mmbtu_mwh" — confirm which name the current inputs carry.
matching_cols = ["plant_name",
                 "utility_name",
                 "fuel_type_code_pudl",
                 "installation_year",
                 "construction_year",
                 "capacity_mw",
                 "net_generation_mwh",
                 "capacity_factor",
                 "total_mmbtu",
                 "total_fuel_cost",
                 "unit_heat_rate_mmbtu_per_mwh",
                 "fuel_cost_per_mmbtu"
                ]
# retain these columns either for blocking or validation, not going to match with these
extra_cols = ["plant_id_pudl", "utility_id_pudl", "report_year"]
# extra_cols += ["block_num"]

In [11]:
# Keep only the matching/extra columns, move the record ID out of the index, and
# give both frames the same ID column name ("record_id"), which settings_dict
# uses as "unique_id_column_name".
ferc_df = ferc_full[matching_cols + extra_cols].reset_index().rename(columns={"record_id_ferc1": "record_id"})
eia_df = eia_full[matching_cols + extra_cols].reset_index().rename(columns={"record_id_eia": "record_id"})

In [9]:
# I think skip for now
# Round net generation and capacity factor to two decimal places in both frames.
eia_df["net_generation_mwh"] = eia_df["net_generation_mwh"].round(2)
ferc_df["net_generation_mwh"] = ferc_df["net_generation_mwh"].round(2)
eia_df["capacity_factor"] = eia_df["capacity_factor"].round(2)
ferc_df["capacity_factor"] = ferc_df["capacity_factor"].round(2)

In [12]:
# Parse the year columns as datetimes (format "%Y"); the date comparisons defined
# later apply date_diff(...) to these columns.
ferc_df["installation_year"] = pd.to_datetime(ferc_df["installation_year"], format="%Y")
ferc_df["construction_year"] = pd.to_datetime(ferc_df["construction_year"], format="%Y")
eia_df["installation_year"] = pd.to_datetime(eia_df["installation_year"], format="%Y")
eia_df["construction_year"] = pd.to_datetime(eia_df["construction_year"], format="%Y")

### Get training data

In [13]:
# First and last report year present in the FERC frame, as strings; passed to the
# training-data InputManager as start_report_year / end_report_year.
start_year = str(ferc_df.report_year.min())
end_year = str(ferc_df.report_year.max())

In [14]:
# Fetch the labelled FERC1-EIA training pairs for the report-year range above.
train_full = ferc1_eia_match.inputs.InputManager(pudl_engine=pudl_engine, start_report_year=start_year, end_report_year=end_year).get_training_data()

INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> 46b412388816
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> 46b412388816
INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.
INFO:alembic.runtime.migration:Running stamp_revision  -> 46b412388816
INFO:alembic.runtime.migration:Context impl SQLiteImpl.

In [15]:
# Reshape the labelled pairs into the pairwise-label layout registered with the
# linker below: EIA is the left ("_l") side and FERC the right ("_r") side, and the
# source_dataset_* values are the input table aliases given to the linker.
train_df = (
    train_full[["record_id_ferc1", "record_id_eia"]]
    .rename(columns={"record_id_ferc1": "record_id_r", "record_id_eia": "record_id_l"})
    .assign(
        source_dataset_r="ferc_df",
        source_dataset_l="eia_df",
        clerical_match_score=1,  # this column shows that all these labels are positive labels
    )
)

In [17]:
# create train test split
# 80/20 split of the labelled pairs with a fixed seed.
# NOTE(review): y_train / y_test are not referenced again in the visible part of
# this notebook — the full train_df is registered as training labels and is also
# what the evaluation cells compare against. Confirm whether the model should be
# trained on y_train and scored on y_test instead.
y_train, y_test = train_test_split(
        train_df, test_size=0.2, random_state=16
    )

### Create settings dict and linker

In [16]:
# Base Splink settings; comparisons and blocking rules are added later via
# settings_dict.update(...). "record_id" is the ID column created when the frames
# were renamed above.
settings_dict = {"link_type": "link_only",
                 "unique_id_column_name": "record_id",
                 "additional_columns_to_retain": ["plant_id_pudl", "utility_id_pudl"]}

In [17]:
# EIA is passed first and FERC second; the aliases are the values that appear in
# the source_dataset_l / source_dataset_r columns of the training labels.
linker = DuckDBLinker([eia_df, ferc_df], input_table_aliases = ["eia_df", "ferc_df"], settings_dict=settings_dict)

In [18]:
# Register the labelled pairs under the name "training_labels", the table name
# later passed to linker.estimate_m_from_pairwise_labels.
train_table = linker.register_table(train_df, "training_labels", overwrite=True)

In [21]:
train_table.as_pandas_dataframe(limit=5)

Unnamed: 0,record_id_r,record_id_l,source_dataset_r,source_dataset_l,clerical_match_score
0,f1_hydro_2018_12_177_0_2,1109_2018_plant_total_19436,ferc_df,eia_df,1
1,f1_steam_2018_12_7_0_1,113_1_2018_plant_unit_total_803,ferc_df,eia_df,1
2,f1_steam_2018_12_7_0_2,113_3_2018_plant_unit_total_803,ferc_df,eia_df,1
3,f1_steam_2018_12_7_2_4,114_2018_plant_total_803,ferc_df,eia_df,1
4,f1_steam_2018_12_7_0_5,116_1_2018_plant_unit_total_803,ferc_df,eia_df,1


# Data Exploration

In [49]:
linker.completeness_chart(cols=matching_cols)

In [51]:
linker_eia = DuckDBLinker(eia_df)
linker_ferc = DuckDBLinker(ferc_df)

In [52]:
linker_ferc.missingness_chart()

In [53]:
linker_eia.missingness_chart()

Columns with higher cardinality are better for matching
- `fuel_type_code_pudl` might not be the best, high skew in that column too

In [29]:
linker.profile_columns(matching_cols, top_n=10, bottom_n=5)

# Generate blocking rules

"More generally, we can often specify multiple blocking rules such that it becomes highly implausible that a true match would not meet at least one of these blocking criteria. This is the recommended approach in Splink. Generally we would recommend between about 3 and 10, though even more is possible."

In [56]:
# Baseline: block on exact plant_name equality and count the candidate pairs.
# blocking_rule_1 is rebound to a different rule in the next cell.
blocking_rule_1 = block_on("plant_name")
count = linker.count_num_comparisons_from_blocking_rule(blocking_rule_1)
print(f"Number of comparisons generated by '{blocking_rule_1.blocking_rule_sql}': {count:,.0f}")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Number of comparisons generated by 'l."plant_name" = r."plant_name"': 5,130,640


In [107]:
# Candidate blocking rules. All of them require equal report_year and add either
# fuzzy name similarity or exact equality on other attributes.
blocking_rule_1 = "l.report_year = r.report_year and jaro_winkler_similarity(l.plant_name, r.plant_name) >= .7"
blocking_rule_2 = "l.report_year = r.report_year and jaro_winkler_similarity(l.plant_name, r.plant_name) >= .5 and l.utility_name = r.utility_name"
blocking_rule_3 = "l.report_year = r.report_year and jaro_winkler_similarity(l.utility_name, r.utility_name) >= .7 and l.installation_year = r.installation_year"
blocking_rule_4 = block_on(["report_year", "fuel_type_code_pudl", "capacity_mw"])
blocking_rule_5 = "l.report_year = r.report_year and jaro_winkler_similarity(l.plant_name, r.plant_name) >= .5 and l.net_generation_mwh = r.net_generation_mwh"
blocking_rule_6 = "l.report_year = r.report_year and jaro_winkler_similarity(l.utility_name, r.utility_name) >= .7 and l.construction_year = r.construction_year"
blocking_rule_7 = "l.report_year = r.report_year and jaro_winkler_similarity(l.plant_name, r.plant_name) >= .5 and l.capacity_mw = r.capacity_mw"

# Count and report the comparisons each rule generates, in rule order. The label
# is the text inserted after "generated by"; rule 4 is described by its own SQL.
for _rule, _label in [
    (blocking_rule_1, "report year and plant name sim"),
    (blocking_rule_2, "plant name sim and utility match"),
    (blocking_rule_3, "utility name sim and installation year match"),
    (blocking_rule_4, blocking_rule_4.blocking_rule_sql),
    (blocking_rule_5, "plant name sim and matching net gen"),
    (blocking_rule_6, "utility name sim and construction year match"),
    (blocking_rule_7, "plant name sim and matching capacity"),
]:
    count = linker.count_num_comparisons_from_blocking_rule(_rule)
    print(f"Number of comparisons generated by {_label}: {count:,.0f}")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Number of comparisons generated by report year and plant name sim: 6,610,083


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Number of comparisons generated by plant name sim and utility match: 909,830


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Number of comparisons generated by utility name sim and installation year match: 905,877
Number of comparisons generated by (l."report_year" = r."report_year") AND (l."fuel_type_code_pudl" = r."fuel_type_code_pudl") AND (l."capacity_mw" = r."capacity_mw"): 267,812
Number of comparisons generated by plant name sim and matching net gen: 38,375


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Number of comparisons generated by utility name sim and construction year match: 801,308


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Number of comparisons generated by plant name sim and matching capacity: 1,706,404


In [108]:
# Chart the cumulative number of comparisons as each rule is added, in this order.
blocking_rules = [blocking_rule_1, blocking_rule_2, blocking_rule_3, blocking_rule_4, blocking_rule_5, blocking_rule_6, blocking_rule_7]
linker.cumulative_num_comparisons_from_blocking_rules_chart(blocking_rules)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

# Block On `report_year` and `block_num`

`splink` has tools to evaluate more complex blocking rules as well, but since we did blocking as a separate step/module, we can just block on `report_year` and `block_num` here.

From the docs: "For linkages in DuckDB on a standard laptop, we suggest using blocking rules that create no more than about 20 million comparisons."

In [30]:
# Single rule that reuses the block assignment from the separate candidate-set step.
# NOTE(review): this needs a `block_num` column in eia_df / ferc_df, but the line
# that adds "block_num" to extra_cols is commented out — confirm it is restored
# before running this cell.
blocking_rule = "l.report_year = r.report_year and l.block_num = r.block_num"
count = linker.count_num_comparisons_from_blocking_rule(blocking_rule)
print(f"Number of comparisons generated by '{blocking_rule}': {count:,.0f}")

Number of comparisons generated by 'l.report_year = r.report_year and l.block_num = r.block_num': 1,007,000


Number of comparisons is a little high for the DuckDB linker when only blocking on report year.

# Define Comparisons

[Comparison Template library](https://moj-analytical-services.github.io/splink/comparison_template_library.html)

In [35]:
# try without damerau levenshtein
# Name comparisons using only Jaro-Winkler thresholds (0.9 / 0.8 / 0.7); the
# utility-name comparison additionally enables term-frequency adjustments.
plant_name_comparison = ctl.name_comparison("plant_name", damerau_levenshtein_thresholds=[], jaro_winkler_thresholds=[.9, .8, .7])
utility_name_comparison = ctl.name_comparison("utility_name", damerau_levenshtein_thresholds=[],jaro_winkler_thresholds=[.9, .8, .7], term_frequency_adjustments=True)

In [36]:
print(plant_name_comparison.human_readable_description)

Comparison 'Exact match vs. Plant_Name within jaro_winkler thresholds 0.9, 0.8, 0.7 vs. anything else' of "plant_name".
Similarity is assessed using the following ComparisonLevels:
    - 'Null' with SQL rule: "plant_name_l" IS NULL OR "plant_name_r" IS NULL
    - 'Exact match plant_name' with SQL rule: "plant_name_l" = "plant_name_r"
    - 'Jaro_winkler_similarity >= 0.9' with SQL rule: jaro_winkler_similarity("plant_name_l", "plant_name_r") >= 0.9
    - 'Jaro_winkler_similarity >= 0.8' with SQL rule: jaro_winkler_similarity("plant_name_l", "plant_name_r") >= 0.8
    - 'Jaro_winkler_similarity >= 0.7' with SQL rule: jaro_winkler_similarity("plant_name_l", "plant_name_r") >= 0.7
    - 'All other comparisons' with SQL rule: ELSE



In [37]:
def _pct_diff_comparison(column_name, thresholds, description):
    """Build a Splink comparison dict for a numeric column.

    The levels are, in order: a null level, one percentage-difference level per
    entry of ``thresholds`` (tightest first), and a catch-all else level.
    """
    return {
        "output_column_name": column_name,
        "comparison_levels": [
            cll.null_level(column_name),
            *(cll.percentage_difference_level(column_name, pct) for pct in thresholds),
            cll.else_level(),
        ],
        "comparison_description": description,
    }


# The first threshold (1e-4) stands in for "0% different"; an explicit exact-match
# level could be added instead.
capacity_comparison = _pct_diff_comparison(
    "capacity_mw",
    [1e-4, 0.05, 0.1, 0.2],
    "0% different vs. 5% different vs. 10% different vs. 20% different vs. anything else",
)

net_gen_comparison = _pct_diff_comparison(
    "net_generation_mwh",
    [1e-4, 0.01, 0.1, 0.2],
    "0% different vs. 1% different vs. 10% different vs. 20% different vs. anything else",
)

capacity_factor_comparison = _pct_diff_comparison(
    "capacity_factor",
    [1e-4, 0.05, 0.1],
    "0% different vs. 5% different vs. 10% different vs. anything else",
)

total_mmbtu_comparison = _pct_diff_comparison(
    "total_mmbtu",
    [1e-4, 0.01, 0.1, 0.2],
    "0% different vs. 1% different vs. 10% different vs. 20% different vs. anything else",
)

total_fuel_cost_comparison = _pct_diff_comparison(
    "total_fuel_cost",
    [1e-4, 0.01, 0.1, 0.2],
    "0% different vs. 1% different vs. 10% different vs. 20% different vs. anything else",
)

# NOTE(review): matching_cols lists this column as "unit_heat_rate_mmbtu_per_mwh";
# confirm which name the input frames actually carry before enabling this comparison.
heat_rate_comparison = _pct_diff_comparison(
    "heat_rate_mmbtu_mwh",
    [1e-4, 0.01, 0.1],
    "0% different vs. 1% different vs. 10% different vs. anything else",
)
# Comparison for fuel cost per MMBtu: null level, four percentage-difference
# levels (1e-4 stands in for "0% different"), then a catch-all.
fuel_cost_comparison = {
    "output_column_name": "fuel_cost_per_mmbtu",
    "comparison_levels": [
        # The null check must test the column being compared. It previously tested
        # "total_fuel_cost", so pairs with a missing fuel_cost_per_mmbtu fell
        # through to the percentage-difference levels and landed in the else
        # level as if they disagreed.
        cll.null_level("fuel_cost_per_mmbtu"),
        cll.percentage_difference_level("fuel_cost_per_mmbtu", 0.0 + 1e-4),
        cll.percentage_difference_level("fuel_cost_per_mmbtu", 0.01),
        cll.percentage_difference_level("fuel_cost_per_mmbtu", 0.1),
        cll.percentage_difference_level("fuel_cost_per_mmbtu", 0.2),
        cll.else_level(),
    ],
    "comparison_description": "0% different vs. 1% different vs. 10% different vs. 20% different vs. anything else",
}

In [38]:
print(Comparison(net_gen_comparison).human_readable_description)

Comparison '0% different vs. 1% different vs. 10% different vs. 20% different vs. anything else' of "net_generation_mwh".
Similarity is assessed using the following ComparisonLevels:
    - 'Null' with SQL rule: "net_generation_mwh_l" IS NULL OR "net_generation_mwh_r" IS NULL
    - '< 0.01% diff' with SQL rule: (abs("net_generation_mwh_l" - "net_generation_mwh_r")/
            (case
                when "net_generation_mwh_r" > "net_generation_mwh_l"
                then "net_generation_mwh_r"
                else "net_generation_mwh_l"
            end))
            < 0.0001
    - '< 1.00% diff' with SQL rule: (abs("net_generation_mwh_l" - "net_generation_mwh_r")/
            (case
                when "net_generation_mwh_r" > "net_generation_mwh_l"
                then "net_generation_mwh_r"
                else "net_generation_mwh_l"
            end))
            < 0.01
    - '< 10.00% diff' with SQL rule: (abs("net_generation_mwh_l" - "net_generation_mwh_r")/
            (case
      

In [39]:
def get_date_comparison(column_name):
    """Return a Splink date comparison for ``column_name``.

    Damerau-Levenshtein levels are disabled (empty threshold list); the date
    difference levels are "within 1 year" and "within 2 years".
    """
    # A date_format="%Y" option was tried here and is left disabled.
    date_options = {
        "damerau_levenshtein_thresholds": [],
        "datediff_thresholds": [1, 2],
        "datediff_metrics": ["year", "year"],
    }
    return ctl.date_comparison(column_name, **date_options)

# One date comparison per year column; both are added to the settings below.
installation_year_comparison = get_date_comparison("installation_year")
construction_year_comparison = get_date_comparison("construction_year")

In [40]:
print(installation_year_comparison.human_readable_description)

Comparison 'Exact match vs. Dates within the following thresholds Year(s): 1, Year(s): 2 vs. anything else' of "installation_year".
Similarity is assessed using the following ComparisonLevels:
    - 'Null' with SQL rule: "installation_year_l" IS NULL OR "installation_year_r" IS NULL
    - 'Exact match' with SQL rule: "installation_year_l" = "installation_year_r"
    - 'Within 1 year' with SQL rule: 
            abs(date_diff('year', "installation_year_l",
              "installation_year_r")) <= 1
        
    - 'Within 2 years' with SQL rule: 
            abs(date_diff('year', "installation_year_l",
              "installation_year_r")) <= 2
        
    - 'All other comparisons' with SQL rule: ELSE



In [109]:
# Add the model definition to the base settings: the enabled comparisons, the
# blocking rules used at prediction time, and the prior match probability.
# settings_dict is updated in place, so re-running earlier cells that rebind the
# comparison or blocking-rule variables does not change it until this cell re-runs.
settings_dict.update({
    "comparisons": [
        plant_name_comparison,
        utility_name_comparison,
        construction_year_comparison,
        installation_year_comparison,
        capacity_comparison,
        cl.exact_match("fuel_type_code_pudl", term_frequency_adjustments=True),
        net_gen_comparison,
        # capacity_factor_comparison,
        # total_mmbtu_comparison,
        # total_fuel_cost_comparison,
        # heat_rate_comparison,
        # fuel_cost_comparison
    ],
    "blocking_rules_to_generate_predictions": [
        blocking_rule_1, blocking_rule_2, blocking_rule_3, blocking_rule_4, blocking_rule_5, blocking_rule_6, blocking_rule_7
    ],
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
    # Prior: 1 / (number of EIA records); the reasoning is in the markdown cell below.
    "probability_two_random_records_match": 1/len(eia_df)
    }
)

Explanation of probability two random records match calculation:

The EIA dataset has n records and the FERC dataset has m records, where n > m. Each FERC record matches one EIA record, so there are n - m EIA records that don't have a match.

- If I choose a FERC record first then I have a 1/n chance of choosing the matching EIA record
- If I choose an EIA record first then I have an m/n chance of choosing an EIA record that has a FERC match, and then a 1/m chance of choosing the correct matching FERC record. So the probability of choosing two matching records is m/n * 1/m = 1/n

In either case, the probability is 1/n.

In [110]:
linker.load_settings(settings_dict)

# Estimate Model Parameters

Now that we have specified our linkage model, we need to estimate the probability_two_random_records_match (if not specified in settings dictionary), u, and m parameters.

In [326]:
# try with a much higher probability of two records matching - this seems wrong
# Deterministic rule for the alternative, currently disabled, way of estimating
# probability_two_random_records_match; the value set in settings_dict is used instead.
deterministic_rules = [
    "jaro_winkler_similarity(l.plant_name, r.plant_name) >= 0.9 and jaro_winkler_similarity(l.utility_name, r.utility_name) >= 0.9"
]

# linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)


In [111]:
%%time
# Estimate the u probabilities from up to 1e7 randomly sampled record pairs.
linker.estimate_u_using_random_sampling(max_pairs=1e7)

INFO:splink.estimate_u:----- Estimating u probabilities using random sampling -----


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.estimate_u:
Estimated u probabilities using random sampling
INFO:splink.settings:
Your model is not yet fully trained. Missing estimates for:
    - plant_name (no m values are trained).
    - utility_name (no m values are trained).
    - construction_year (no m values are trained).
    - installation_year (no m values are trained).
    - capacity_mw (no m values are trained).
    - fuel_type_code_pudl (no m values are trained).
    - net_generation_mwh (no m values are trained).


CPU times: user 24 s, sys: 942 ms, total: 24.9 s
Wall time: 11.4 s


We can estimate m with either training labels or unsupervised, with Expectation Maximization.

In [112]:
# Supervised estimate of the m probabilities from the "training_labels" table
# registered earlier (all of train_df).
linker.estimate_m_from_pairwise_labels("training_labels")

In [None]:
# Unsupervised alternative (this cell has no execution count): run Expectation
# Maximisation on pairs that share an exact plant_name.
training_blocking_rule_1 = "l.plant_name = r.plant_name"
training_session_1 = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule_1)

In [None]:
training_session_2 = linker.estimate_parameters_using_expectation_maximisation(block_on(["utility_name", "net_generation_mwh"]))

In [None]:
training_session_3 = linker.estimate_parameters_using_expectation_maximisation(block_on(["capacity_mw", "fuel_type_code_pudl"]))

In [113]:
linker.match_weights_chart()

In [114]:
linker.m_u_parameters_chart()

In [49]:
n = "splink_blocking_rules"

In [50]:
# Save the trained model settings as JSON, overwriting any existing file with the
# same name; `n` is the label that goes into the file name.
settings = linker.save_model_to_json(f"./splink_model_settings/model_settings_{n}.json", overwrite=True)

# Make Predictions

In [115]:
# Score every pair produced by the prediction blocking rules. No probability or
# weight threshold is applied here (the thresholded variants are left commented
# out); thresholds are applied afterwards in the evaluation cells.
# df_preds = linker.predict(threshold_match_probability=0.5)
# df_preds = linker.predict(threshold_match_weight=0)
df_preds = linker.predict()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [116]:
# Materialise the predictions as a pandas frame, best match probability first.
# The per-FERC-record selection below relies on this ordering.
sorted_preds_df = df_preds.as_pandas_dataframe().sort_values(by="match_probability", ascending=False)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [117]:
sorted_preds_df.head(3)

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,plant_name_l,plant_name_r,gamma_plant_name,bf_plant_name,utility_name_l,utility_name_r,gamma_utility_name,tf_utility_name_l,tf_utility_name_r,bf_utility_name,bf_tf_adj_utility_name,construction_year_l,construction_year_r,gamma_construction_year,bf_construction_year,installation_year_l,installation_year_r,gamma_installation_year,bf_installation_year,capacity_mw_l,capacity_mw_r,gamma_capacity_mw,bf_capacity_mw,fuel_type_code_pudl_l,fuel_type_code_pudl_r,gamma_fuel_type_code_pudl,tf_fuel_type_code_pudl_l,tf_fuel_type_code_pudl_r,bf_fuel_type_code_pudl,bf_tf_adj_fuel_type_code_pudl,net_generation_mwh_l,net_generation_mwh_r,gamma_net_generation_mwh,bf_net_generation_mwh,plant_id_pudl_l,plant_id_pudl_r,utility_id_pudl_l,utility_id_pudl_r,report_year_l,report_year_r,match_key
4901177,37.035793,1.0,eia_df,ferc_df,6462_2014_plant_total_55936,f1_steam_2014_12_63_0_1,riverbend,river bend,3,1821.085157,entergy gulf states louisiana limited liabilit...,entergy gulf states louisiana limited liabilit...,4,0.000485,0.000485,258.849196,3.650185,1986-01-01,1986-01-01,3,86.335895,1986-01-01,1986-01-01,3,79.135378,1035.9,1036.0,4,204.838294,nuclear,nuclear,1,0.011633,0.011633,4.631838,16.798323,8154994.0,8154973.794,4,664.938701,492,492.0,4208.0,107.0,2014,2014,0
4901077,37.035793,1.0,eia_df,ferc_df,6462_2011_plant_total_55936,f1_steam_2011_12_63_0_1,riverbend,river bend,3,1821.085157,entergy gulf states louisiana limited liabilit...,entergy gulf states louisiana limited liabilit...,4,0.000485,0.000485,258.849196,3.650185,1986-01-01,1986-01-01,3,86.335895,1986-01-01,1986-01-01,3,79.135378,1035.9,1036.0,4,204.838294,nuclear,nuclear,1,0.011633,0.011633,4.631838,16.798323,7686445.0,7686445.0,4,664.938701,492,492.0,4208.0,107.0,2011,2011,0
4900946,37.035793,1.0,eia_df,ferc_df,6462_2010_plant_total_55936,f1_steam_2010_12_63_0_1,riverbend,river bend,3,1821.085157,entergy gulf states louisiana limited liabilit...,entergy gulf states louisiana limited liabilit...,4,0.000485,0.000485,258.849196,3.650185,1986-01-01,1986-01-01,3,86.335895,1986-01-01,1986-01-01,3,79.135378,1035.9,1036.0,4,204.838294,nuclear,nuclear,1,0.011633,0.011633,4.631838,16.798323,8363163.0,8363163.0,4,664.938701,492,492.0,4208.0,107.0,2010,2010,0


In [118]:
# Keep the single best-scoring EIA candidate for every FERC record.
# sorted_preds_df is ordered by match_probability descending, so the first row per
# record_id_r is the top prediction. drop_duplicates keeps that row intact, whereas
# groupby(...).first() returns the first non-null value of each column
# independently and can splice values from lower-ranked candidates into the top
# row wherever it has nulls (common in the capacity / net generation / year
# columns). set_index + sort_index reproduce the record_id_r index, in sorted key
# order, that the groupby produced.
one_to_one_preds = (
    sorted_preds_df
    .drop_duplicates(subset="record_id_r", keep="first")
    .set_index("record_id_r")
    .sort_index()
)

In [119]:
# Columns whose _l / _r values are kept in the one-to-one predictions frame below.
# The active entries mirror the comparisons enabled in the settings; keep the two
# lists in sync.
matching_cols_used = ["plant_name",
                     "utility_name",
                     "fuel_type_code_pudl",
                     "installation_year",
                     "construction_year",
                     "capacity_mw",
                     "net_generation_mwh",
                     # "capacity_factor",
                     # "total_mmbtu",
                     # "total_fuel_cost",
                     # "heat_rate_mmbtu_mwh",
                     # "fuel_cost_per_mmbtu"
                    ]

In [120]:
# Column layout for the one-to-one predictions: ID and score columns first, then
# the _l / _r versions of each matching column in alphabetical order (so the two
# sides of a column sit next to each other), then the PUDL ID columns.
# NOTE: this rebinds `extra_cols`, which earlier in the notebook holds the
# un-suffixed input column names.
extra_cols = ["plant_id_pudl_l", "plant_id_pudl_r", "utility_id_pudl_l", "utility_id_pudl_r"]
cols = [
    "record_id_l",
    "match_weight",
    "match_probability",
    *sorted(f"{name}{side}" for side in ("_l", "_r") for name in matching_cols_used),
    *extra_cols,
]
one_to_one_preds = one_to_one_preds[cols].reset_index()

In [121]:
labels_df = train_df.copy()

In [122]:
n_labels = len(labels_df)

In [123]:
# Left-join the labelled pairs to the top prediction for each FERC record
# (joined on record_id_r only). No probability threshold is applied at this point;
# the `_merge` indicator shows which labelled FERC records received a prediction.
predicted_matches = labels_df.merge(
    one_to_one_preds,
    how="left",
    on=["record_id_r"],
    indicator=True,
    suffixes=("_true", "_pred"))

In [124]:
# how many FERC train records had matches above the match threshold
predicted_matches._merge.value_counts()

_merge
both          9285
left_only        2
right_only       0
Name: count, dtype: int64

In [125]:
def get_splink_true_pos(pred_df, train_df):
    """Count the labeled pairs that appear verbatim among the predictions.

    ``train_df`` is left-joined to ``pred_df`` on both ``record_id_r`` and
    ``record_id_l``; the result is the number of joined rows present in both
    frames.
    """
    labels_vs_preds = train_df.merge(
        pred_df,
        on=["record_id_r", "record_id_l"],
        how="left",
        indicator=True,
    )
    return (labels_vs_preds["_merge"] == "both").sum()

def get_splink_false_pos(pred_df, train_df):
    """Count labeled FERC records that were matched to the wrong EIA record.

    Labels and predictions are inner-joined on ``record_id_r`` (so only FERC
    records present in both are considered); a row counts when the labeled
    ``record_id_l`` differs from the predicted one.
    """
    shared = train_df.merge(
        pred_df,
        on="record_id_r",
        how="inner",
        suffixes=("_true", "_pred"),
    )
    wrong_eia_record = shared["record_id_l_true"] != shared["record_id_l_pred"]
    return int(wrong_eia_record.sum())

def get_splink_false_neg(pred_df, train_df):
    """Count the labeled pairs that are missing from the predictions.

    ``train_df`` is left-joined to ``pred_df`` on both ``record_id_r`` and
    ``record_id_l``; the result is the number of labeled pairs with no
    identical predicted pair. This covers FERC records with no prediction at
    all as well as those predicted to match a different EIA record.
    """
    labels_vs_preds = train_df.merge(
        pred_df,
        on=["record_id_r", "record_id_l"],
        how="left",
        indicator=True,
    )
    return (labels_vs_preds["_merge"] == "left_only").sum()

def get_duplicated_eia_plant_part_matches(pred_df):
    """Count predictions whose non-null ``record_id_l`` repeats an earlier row.

    The first occurrence of each ``record_id_l`` is not counted; rows with a
    null ``record_id_l`` are never counted.
    """
    eia_ids = pred_df["record_id_l"]
    repeated = eia_ids.notnull() & eia_ids.duplicated(keep="first")
    return int(repeated.sum())

def get_splink_match_at_threshold(df, threshold):
    """Return the rows of ``df`` whose ``match_probability`` is >= ``threshold``."""
    meets_threshold = df["match_probability"].ge(threshold)
    return df.loc[meets_threshold]

In [126]:
# Score the one-to-one predictions against the training labels: once with no
# probability cutoff ("splink") and once for each match-probability threshold.
# Row labels and thresholds live in a single mapping so they cannot drift apart.
#
# Note on the counts: a labeled FERC record that was matched to the wrong EIA
# record is counted by both get_splink_false_pos and get_splink_false_neg.
stats_thresholds = {
    "splink": None,
    "splink_.95": 0.95,
    "splink_.9": 0.9,
    "splink_.75": 0.75,
    "splink_.5": 0.5,
    "splink_.25": 0.25,
}
# Filter the predictions once per threshold and reuse the result for each stat.
stats_preds = {
    label: (
        one_to_one_preds
        if cutoff is None
        else get_splink_match_at_threshold(one_to_one_preds, threshold=cutoff)
    )
    for label, cutoff in stats_thresholds.items()
}

ind = list(stats_preds)
data = {
    stat_name: [stat_func(preds, train_df) for preds in stats_preds.values()]
    for stat_name, stat_func in (
        ("true_pos", get_splink_true_pos),
        ("false_pos", get_splink_false_pos),
        ("false_neg", get_splink_false_neg),
    )
}

stats_df = pd.DataFrame(index=ind, data=data)
stats_df["precision"] = stats_df["true_pos"] / (stats_df["true_pos"] + stats_df["false_pos"])
stats_df["recall"] = stats_df["true_pos"] / (stats_df["true_pos"] + stats_df["false_neg"])

In [127]:
stats_df

Unnamed: 0,true_pos,false_pos,false_neg,precision,recall
splink,8470,815,817,0.912224,0.912028
splink_.95,7736,459,1551,0.94399,0.832992
splink_.9,7829,470,1458,0.943367,0.843006
splink_.75,8006,508,1281,0.940334,0.862065
splink_.5,8126,529,1161,0.938879,0.874987
splink_.25,8235,549,1052,0.9375,0.886723


In [70]:
# Save the true/false positive/negative counts with precision and recall to a
# CSV file (path is relative to the current working directory).
stats_df.to_csv("splink_blocking_rules_stats.csv")

In [128]:
# Count how many predictions reuse an EIA record (`record_id_l`) that an earlier
# prediction already matched: once with no probability cutoff ("splink") and
# once for each match-probability threshold.
# NOTE(review): unlike the precision/recall table, there is no 0.95 row here --
# confirm whether that omission is intentional.
dupe_thresholds = {
    "splink": None,
    "splink_.9": 0.9,
    "splink_.75": 0.75,
    "splink_.5": 0.5,
    "splink_.25": 0.25,
}
ind = list(dupe_thresholds)
data = {
    "duplicate_eia_plant_part_matches": [
        get_duplicated_eia_plant_part_matches(
            one_to_one_preds
            if cutoff is None
            else get_splink_match_at_threshold(one_to_one_preds, threshold=cutoff)
        )
        for cutoff in dupe_thresholds.values()
    ]
}
dupe_df = pd.DataFrame(index=ind, data=data)

In [129]:
dupe_df

Unnamed: 0,duplicate_eia_plant_part_matches
splink,1706
splink_.9,408
splink_.75,466
splink_.5,524
splink_.25,565


In [73]:
# Write `sorted_preds_df` to a Parquet file (path is relative to the current
# working directory).
sorted_preds_df.to_parquet("splink_blocking_rules_preds.parquet")

In [74]:
linker.precision_recall_chart_from_labels_table("training_labels")

In [75]:
# Ask splink for the prediction errors relative to the "training_labels" table
# at a 0.9 threshold. In the output below every returned row is a false
# negative (truth_status == "FN").
errs_df = linker.prediction_errors_from_labels_table("training_labels", threshold=.9)

In [76]:
# Convert the splink result to a pandas DataFrame. This rebinds `errs_df`, so
# the cell cannot be re-run on its own without re-running the previous cell.
errs_df = errs_df.as_pandas_dataframe()

In [77]:
errs_df.truth_status.value_counts()

truth_status
FN    1109
Name: count, dtype: int64

In [78]:
errs_df.found_by_blocking_rules.value_counts()

found_by_blocking_rules
True     957
False    152
Name: count, dtype: int64

In [79]:
# Keep one row per (FERC record, EIA record) pair, retaining the first occurrence.
errs_df_dedupe = errs_df.loc[
    ~errs_df.duplicated(subset=["record_id_r", "record_id_l"], keep="first")
]

In [80]:
errs_df_dedupe.truth_status.value_counts()

truth_status
FN    1109
Name: count, dtype: int64

In [81]:
errs_df_dedupe.found_by_blocking_rules.value_counts()

found_by_blocking_rules
True     957
False    152
Name: count, dtype: int64

In [83]:
# Error rows whose `found_by_blocking_rules` flag is False.
errs_df.loc[errs_df["found_by_blocking_rules"].eq(False)]

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,plant_name_l,plant_name_r,gamma_plant_name,bf_plant_name,utility_name_l,utility_name_r,gamma_utility_name,tf_utility_name_l,tf_utility_name_r,bf_utility_name,bf_tf_adj_utility_name,construction_year_l,construction_year_r,gamma_construction_year,bf_construction_year,installation_year_l,installation_year_r,gamma_installation_year,bf_installation_year,capacity_mw_l,capacity_mw_r,gamma_capacity_mw,bf_capacity_mw,fuel_type_code_pudl_l,fuel_type_code_pudl_r,gamma_fuel_type_code_pudl,tf_fuel_type_code_pudl_l,tf_fuel_type_code_pudl_r,bf_fuel_type_code_pudl,bf_tf_adj_fuel_type_code_pudl,net_generation_mwh_l,net_generation_mwh_r,gamma_net_generation_mwh,bf_net_generation_mwh,plant_id_pudl_l,plant_id_pudl_r,utility_id_pudl_l,utility_id_pudl_r,report_year_l,report_year_r,match_key,clerical_match_score,found_by_blocking_rules,truth_status
54,2.368463,8.377652e-01,eia_df,ferc_df,6073_2020_plant_owned_7801,f1_steam_2020_12_62_0_5,victor j daniel jr,daniel,0,0.089651,gulf power company,gulf power company,4,0.001094,0.001094,261.194939,1.604551,1977-01-01,1977-01-01,3,83.784271,2001-01-01,1981-01-01,0,0.05200,548.3,548.00,3,40.644514,,coal,-1,,0.09181,1.0,1.0,1.138218e+06,1138214.500,4,686.308200,146,146.0,130,130.0,2020,2020,from_labels,1,False,FN
55,-3.819733,6.613479e-02,eia_df,ferc_df,6073_2019_plant_owned_7801,f1_steam_2019_12_62_1_1,victor j daniel jr,daniel,0,0.089651,gulf power company,gulf power company,4,0.001094,0.001094,261.194939,1.604551,1977-01-01,1977-01-01,3,83.784271,2001-01-01,1981-01-01,0,0.05200,548.3,548.25,4,214.745768,,coal,-1,,0.09181,1.0,1.0,1.031401e+06,1175813.500,1,1.781411,146,146.0,130,130.0,2019,2019,from_labels,1,False,FN
56,2.302299,8.314350e-01,eia_df,ferc_df,6073_2016_plant_owned_7801,f1_steam_2016_12_62_0_3,victor j daniel jr,daniel,0,0.089651,gulf power company,gulf power company,4,0.001094,0.001094,261.194939,1.604551,1977-01-01,1977-01-01,3,83.784271,2001-01-01,1981-01-01,0,0.05200,548.3,548.25,4,214.745768,,coal,-1,,0.09181,1.0,1.0,1.033935e+06,1035241.000,3,124.073573,146,146.0,130,130.0,2016,2016,from_labels,1,False,FN
57,2.302299,8.314350e-01,eia_df,ferc_df,6073_2015_plant_owned_7801,f1_steam_2015_12_62_0_4,victor j daniel jr,daniel,0,0.089651,gulf power company,gulf power company,4,0.001094,0.001094,261.194939,1.604551,1977-01-01,1977-01-01,3,83.784271,2001-01-01,1981-01-01,0,0.05200,548.3,548.25,4,214.745768,,coal,-1,,0.09181,1.0,1.0,1.219944e+06,1216942.000,3,124.073573,146,146.0,130,130.0,2015,2015,from_labels,1,False,FN
58,0.819228,6.382658e-01,eia_df,ferc_df,641_2005_plant_total_7801,f1_steam_2005_12_62_0_1,gulf clean energy center,crist,0,0.089651,gulf power company,gulf power company,4,0.001094,0.001094,261.194939,1.604551,1949-01-01,1945-01-01,0,0.110701,1973-01-01,1973-01-01,3,74.38541,1200.7,1200.88,3,40.644514,,coal,-1,,0.09181,1.0,1.0,5.008182e+06,5007387.000,3,124.073573,138,138.0,130,130.0,2005,2005,from_labels,1,False,FN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,-33.579156,7.792320e-11,eia_df,ferc_df,6113_2018_plant_total_40211,f1_gnrt_plant_2018_12_294_0_4,gibson,highland,0,0.089651,wabash valley power assn incorporated,,-1,0.001915,,1.000000,1.000000,1975-01-01,1995-01-01,0,0.110701,1982-01-01,NaT,-1,1.00000,3339.5,191.00,0,0.067885,coal,,-1,0.091810,,1.0,1.0,1.763180e+07,86302.900,0,0.102275,222,,352,,2018,2018,from_labels,1,False,FN
1046,-10.933614,5.110132e-04,eia_df,ferc_df,3289_2018_plant_total_17539,f1_gnrt_plant_2018_12_159_0_3,neal shoals,project #2315,0,0.089651,south carolina electric&gas company,,-1,0.002684,,1.000000,1.000000,NaT,1905-01-01,-1,1.000000,NaT,NaT,-1,1.00000,4.4,4.41,3,40.644514,hydro,,-1,0.166394,,1.0,1.0,2.615800e+04,26023.000,3,124.073573,2504,,292,,2018,2018,from_labels,1,False,FN
1047,-30.289675,7.619024e-10,eia_df,ferc_df,62879_2018_plant_total_62715,f1_gnrt_plant_2018_12_45_0_3,columbia bryson,bryson project 2601,0,0.089651,westbound solar limited liability company,,-1,0.000122,,1.000000,1.000000,2018-01-01,1925-01-01,0,0.110701,2018-01-01,NaT,-1,1.00000,1.4,1.00,0,0.067885,solar,,-1,0.070600,,1.0,1.0,,4632.000,-1,1.000000,13001,,6634,,2018,2018,from_labels,1,False,FN
1057,-11.500059,3.451337e-04,eia_df,ferc_df,60680_2018_plant_total_195,f1_gnrt_plant_2018_12_2_0_2,anad solar array,anniston army dept,0,0.089651,alabama power company,,-1,0.004647,,1.000000,1.000000,2017-01-01,2017-01-01,3,83.784271,2017-01-01,NaT,-1,1.00000,7.4,7.38,3,40.644514,solar,,-1,0.070600,,1.0,1.0,1.716800e+04,,-1,1.000000,10436,,18,,2018,2018,from_labels,1,False,FN


In [126]:
linker.roc_chart_from_labels_table("training_labels")

## Look at incorrect matches

In [527]:
# Rows of `predicted_test_matches` not selected by `correct_filter`, reduced to
# the FERC id, the labeled and predicted EIA ids, and the match scores.
# NOTE(review): `predicted_test_matches` and `correct_filter` are defined outside
# this section; `correct_filter` is presumably a boolean mask -- confirm.
incorrect_matches = predicted_test_matches.loc[
    ~correct_filter,
    ["record_id_r", "record_id_l_true", "record_id_l_pred", "match_weight", "match_probability"],
]
incorrect_matches

Unnamed: 0,record_id_r,record_id_l_true,record_id_l_pred,match_weight,match_probability
14,f1_steam_2017_12_17_1_2,2713_2017_plant_total_3046,58212_11_2017_plant_gen_total_5416_retired,-21.215227,4.107524e-07
21,f1_steam_2019_12_454_3_2,6462_2019_plant_total_11241,6595_1_2019_plant_gen_total_16524,-30.328579,7.416312e-10
37,f1_steam_2010_12_17_4_1,2706_gt_2010_plant_prime_mover_total_3046,2706_2010_plant_total_3046,-14.187214,5.360430e-05
40,f1_steam_2020_12_56_0_1,6045_2020_plant_owned_6452,6045_2020_plant_total_6452,22.586408,9.999998e-01
48,f1_steam_2006_12_57_6_4,6124_ct1_2006_plant_gen_total_7140,56150_10st_2006_plant_gen_total_7140,-10.003852,9.730111e-04
...,...,...,...,...,...
2276,f1_steam_2015_12_56_2_1,6045_2015_plant_owned_6452,6045_2015_plant_total_6452,19.418282,9.999986e-01
2280,f1_steam_2014_12_159_4_5,3298_2_2014_plant_gen_total_17554,3298_1_2014_plant_gen_total_17554,13.435007,9.999097e-01
2297,f1_gnrt_plant_2008_12_144_0_3,1010_ic_2008_plant_prime_mover_total_15470,1010_2008_plant_owned_40211,-20.073615,9.062321e-07
2311,f1_steam_2019_12_182_1_3,2098_1_2019_plant_unit_total_56211,2098_gt_2019_plant_prime_mover_total_56211,-7.331470,6.170487e-03


In [528]:
# Positional index (0-based) into `incorrect_matches` of the mismatch to
# inspect in the following cells.
i = 3

In [529]:
# The FERC record being inspected, its labeled EIA record, and the predicted EIA record.
tuple(incorrect_matches.iloc[i][["record_id_r", "record_id_l_true", "record_id_l_pred"]])

('f1_steam_2020_12_56_0_1',
 '6045_2020_plant_owned_6452',
 '6045_2020_plant_total_6452')

In [530]:
# Waterfall chart for the candidate pair made of the inspected FERC record and
# its *labeled* EIA record.
rec_true = sorted_preds_df.loc[
    (sorted_preds_df["record_id_r"] == incorrect_matches.iloc[i]["record_id_r"])
    & (sorted_preds_df["record_id_l"] == incorrect_matches.iloc[i]["record_id_l_true"])
].to_dict(orient="records")
linker.waterfall_chart(rec_true, filter_nulls=False)

In [531]:
# Waterfall chart for the candidate pair made of the inspected FERC record and
# its *predicted* EIA record.
rec_pred = sorted_preds_df.loc[
    (sorted_preds_df["record_id_r"] == incorrect_matches.iloc[i]["record_id_r"])
    & (sorted_preds_df["record_id_l"] == incorrect_matches.iloc[i]["record_id_l_pred"])
].to_dict(orient="records")
linker.waterfall_chart(rec_pred, filter_nulls=False)

In [532]:
# Notes on the first `train_full` row for the inspected FERC record.
train_full.loc[
    train_full["record_id_ferc1"] == incorrect_matches.iloc[i]["record_id_r"],
    "notes",
].iloc[0]

'capacity matches total, the generation is closer to owned'

In [47]:
# Ten most frequent non-null notes in the training data
# (value_counts() already excludes nulls via dropna=True).
train_full["notes"].value_counts(dropna=True).head(10)

notes
ferc mwh vs kwh issue with net gen                                                                                                                                                                                                                            365
ferc units off                                                                                                                                                                                                                                                 89
In 2020 FERC This record swaps from South Carolina Electric & Gas Company (pudl id 292) to South Carolina Generating Company, Inc. (pudl id 293) I think to reflect Dominion buying it in 2019? EIA always reports it as 293 so it looks wrong until 2020.     61
wrong utility, right plant                                                                                                                                                                                                  

Do `utility_id_pudl` and `plant_id_pudl` generally match up?

In [382]:
# Predictions where both the EIA (`_l`) and FERC (`_r`) sides have a PUDL plant
# id and a PUDL utility id; rows missing any of the four are dropped. Whether
# the ids actually agree is checked in the following cells.
consistent_id_df = one_to_one_preds.dropna(
    subset=["utility_id_pudl_l", "utility_id_pudl_r", "plant_id_pudl_l", "plant_id_pudl_r"],
    how="any",
)

In [383]:
# How often the two sides of a predicted match share the same PUDL plant id.
consistent_id_df["plant_id_pudl_l"].eq(consistent_id_df["plant_id_pudl_r"]).value_counts()

True     32131
False     8149
Name: count, dtype: int64

In [384]:
# How often the two sides of a predicted match share the same PUDL utility id.
consistent_id_df["utility_id_pudl_l"].eq(consistent_id_df["utility_id_pudl_r"]).value_counts()

True     34658
False     5622
Name: count, dtype: int64