In [1]:
%load_ext autoreload
%autoreload 3

In [2]:
import json
import os
from pathlib import Path

import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix
from splink import block_on, DuckDBAPI, Linker, SettingsCreator
from splink.blocking_analysis import count_comparisons_from_blocking_rule, cumulative_comparisons_to_be_scored_from_blocking_rules_chart, n_largest_blocks
import splink.comparison_library as cl
import splink.comparison_level_library as cll
from splink.exploratory import completeness_chart, profile_columns
from upath import UPath

from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import (
    BLOCKING_RULES,
    MATCH_COLS,
    SHARED_COLS,
    address_comparison,
    city_comparison,
    company_name_comparison,
    deterministic_blocking_rules,
    state_comparison
)

# Inputs

### EIA

In [3]:
# Load the EIA parents-and-subsidiaries table from a Parquet file stored in GCS.
# NOTE(review): reading a gs:// URL presumably needs gcsfs and GCP credentials — confirm.
eia_df = pd.read_parquet("gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet")

In [4]:
# Preview the first three EIA rows to check the columns that were loaded.
eia_df.head(3)

Unnamed: 0,record_id,company_name,street_address,utility_id_eia,utility_id_pudl,company_name_raw,report_date,city,state,zip_code,plants_reported_owner,plants_reported_operator,plants_reported_asset_manager,plants_reported_other_relationship,entity_type,attention_line,street_address_2,zip_code_4,contact_firstname,contact_lastname,contact_title,phone_number,phone_extension,contact_firstname_2,contact_lastname_2,contact_title_2,phone_number_2,phone_extension_2,data_maturity,report_year,company_name_no_legal,company_name_mphone
0,0,0ham wham8 solar limited liability company,100 california st suite 400,64380,8321.0,"0ham wham8 solar, llc",2023-01-01,san francisco,ca,94118,True,,,,Q,,,,,,,,,,,,,,final,2023,0ham wham8 solar,HM HM SLR
1,1,10 briggs solar ng limited liability company,267 water st 2nd floor,62685,8502.0,"10 briggs solar ng, llc",2020-01-01,warren,ri,2885,True,True,,,Q,,,,,,,,,,,,,,final,2020,10 briggs solar ng,BRKS SLR NK
2,2,1001 ebenezer church solar limited liability c...,176 ebenezer church rd,63186,8567.0,"1001 ebenezer church solar, llc",2020-01-01,state road,nc,28676,True,,,,Q,,,,,,,,,,,,,,final,2020,1001 ebenezer church solar,EBNSR XRX SLR


In [5]:
# Number of EIA records loaded.
len(eia_df)

20821

### SEC 10K Basic Info

In [100]:
# Load the SEC 10-K parents-and-subsidiaries table from local Dagster storage.
# The storage root is taken from the DAGSTER_HOME environment variable when it is
# set, so the notebook is not tied to one machine; otherwise the original local
# path is used unchanged.
# NOTE(review): read_pickle can execute arbitrary code while loading — only point
# this at trusted, locally produced files.
dagster_home = Path(
    os.environ.get("DAGSTER_HOME") or "/Users/katielamb/CatalystCoop/dagster_home"
)
sec_df = pd.read_pickle(dagster_home / "storage" / "core_sec_10k__parents_and_subsidiaries")

In [101]:
# Preview the first three SEC rows to check the columns that were loaded.
sec_df.head(3)

Unnamed: 0,record_id,company_name,street_address,filename,phone_number,central_index_key,city,company_name_raw,date_of_name_change,film_number,fiscal_year_end,form_type,former_conformed_name,irs_number,organization_name,sec_act,sec_file_number,standard_industrial_classification,state,state_of_incorporation,street_address_2,zip_code,report_date,report_year,location_of_inc,company_name_no_legal,company_name_mphone,files_10k,sec_company_id
0,0,024 pharma incorporated,224 datura st,edgar/data/1307969/0001683168-17-000653.txt,(732) 696-9333,1307969,west palm beach,"024 pharma, inc.",20091202.0,17711535,1231,10-k,"b green innovations, inc.",201862731,,1934 act,333-120490,"plastics products, nec [3089]",fl,nj,,33401,2017-03-24,2017,new jersey,024 pharma,FRM,True,1307969
1,1,1 800 contacts incorporated,13751 s wadsworth park dr suite d140,edgar/data/1050122/0001104659-06-017311.txt,8015728225,1050122,draper,1 800 contacts inc,,6691791,1231,10-k,,870571643,,1934 act,000-23633,retail-catalog & mail-order houses [5961],ut,de,,84020,2006-03-16,2006,delaware,1 800 contacts,KNTKTS,True,1050122
2,2,1 800 contacts incorporated,66 e wadsworth park dr,edgar/data/1050122/0001104659-07-019474.txt,801-316-5000,1050122,draper,1 800 contacts inc,,7696033,1231,10-k,,870571643,,1934 act,000-23633,retail-catalog & mail-order houses [5961],ut,de,,84020,2007-03-15,2007,delaware,1 800 contacts,KNTKTS,True,1050122


In [102]:
# Number of SEC records loaded.
len(sec_df)

61026

# Preprocess SEC and EIA

**Open question:** would it make more sense to do a direct match on company name after
the SEC basic info to EIA match is done, and then manually review any conflicting SEC matches (one from basic info and one from Ex. 21)?

In [103]:
# Keep only the SEC rows flagged by `files_10k`, restricted to the columns in SHARED_COLS.
files_10k_mask = sec_df.files_10k
sec_match_df = sec_df.loc[files_10k_mask, SHARED_COLS]

In [104]:
# Restrict the EIA frame to the columns in SHARED_COLS (all rows are kept).
eia_match_df = eia_df.loc[:, SHARED_COLS]

In [105]:
# record_id is passed to Splink as the unique ID column below, so check it is unique in the SEC frame.
sec_match_df.record_id.is_unique

True

In [106]:
# record_id is passed to Splink as the unique ID column below, so check it is unique in the EIA frame.
eia_match_df.record_id.is_unique

True

In [107]:
# Note: sec_company_id is not unique here because each distinct company name and
# address pair is kept as its own record. Later, the predictions are flattened on
# sec_company_id and utility_id_eia.
sec_df.sec_company_id.is_unique

False

There can be duplicate records because sometimes a company changes utility ID or central index key over time. Keep the most recent version of that record.

In [108]:
# Count EIA rows that repeat an earlier row on MATCH_COLS (True = duplicate).
eia_match_df.duplicated(subset=MATCH_COLS).value_counts()

False    20821
Name: count, dtype: int64

In [109]:
# Count SEC rows that repeat an earlier row on MATCH_COLS (True = duplicate).
sec_match_df.duplicated(subset=MATCH_COLS).value_counts()

False    61026
Name: count, dtype: int64

In [253]:
# Keep only the most recent record for each distinct combination of MATCH_COLS:
# sort newest report_year first, then keep the first row of every duplicate group.
# A stable sort is requested so that rows tied on report_year keep their existing
# relative order; the default quicksort does not guarantee tie order, which would
# make the surviving row vary between runs.
sec_match_df = (
    sec_match_df
    .sort_values(by="report_year", ascending=False, kind="stable")
    .drop_duplicates(subset=MATCH_COLS, keep="first")
)
eia_match_df = (
    eia_match_df
    .sort_values(by="report_year", ascending=False, kind="stable")
    .drop_duplicates(subset=MATCH_COLS, keep="first")
)

# Link SEC and EIA

## Exploratory Analysis

In [112]:
# DuckDB backend handle reused by the completeness charts and blocking-analysis calls below.
db_api = DuckDBAPI()

In [113]:
# Chart column completeness for the SEC matching frame.
completeness_chart(sec_match_df, db_api=db_api)

In [114]:
# Chart column completeness for the EIA matching frame.
completeness_chart(eia_match_df, db_api=db_api)

In [115]:
# Profile the SEC matching columns, showing the 10 most and 5 least frequent values.
# NOTE(review): this creates a fresh DuckDBAPI() instead of reusing `db_api` — confirm that is intentional.
profile_columns(sec_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)

In [116]:
# Profile the EIA matching columns, showing the 10 most and 5 least frequent values.
# NOTE(review): this creates a fresh DuckDBAPI() instead of reusing `db_api` — confirm that is intentional.
profile_columns(eia_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)

## Blocking

In [117]:
# useful for experimenting with a new blocking rule:
# count how many SEC-to-EIA comparisons a single candidate rule generates
candidate_rule = BLOCKING_RULES[0]
counts = count_comparisons_from_blocking_rule(
    db_api=db_api,
    unique_id_column_name="record_id",
    link_type="link_only",
    blocking_rule=candidate_rule,
    table_or_tables=[sec_match_df, eia_match_df],
)

counts

{'number_of_comparisons_generated_pre_filter_conditions': 487944,
 'number_of_comparisons_to_be_scored_post_filter_conditions': 487944,
 'filter_conditions_identified': '',
 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',
 'link_type_join_condition': 'where l."source_dataset" || \'-__-\' || l."record_id" < r."source_dataset" || \'-__-\' || r."record_id" and l."source_dataset" != r."source_dataset"'}

In [118]:
# List the three blocking-key values that generate the most comparisons under the first rule.
result = n_largest_blocks(
    db_api=db_api,
    link_type="link_only",
    blocking_rule=BLOCKING_RULES[0],
    table_or_tables=[sec_match_df, eia_match_df],
    n_largest=3,
)

result.as_pandas_dataframe()

Unnamed: 0,key_0,count_l,count_r,block_count
0,INTR,445,76,33820
1,AMRK,851,38,32338
2,FRST,816,36,29376


In [121]:
# Chart how many comparisons each rule in BLOCKING_RULES adds, cumulatively, when linking SEC to EIA.
cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
    db_api=db_api,
    link_type="link_only",
    unique_id_column_name="record_id",
    blocking_rules=BLOCKING_RULES,
    table_or_tables=[sec_match_df, eia_match_df],
)

## Create Model

In [334]:
# NOT USED — experimental alternative to the imported `company_name_comparison`.
# It is bound to its own name so that running this cell (e.g. via Restart & Run All)
# does not shadow the comparison imported from sec_eia_splink_config, which is the
# one passed to SettingsCreator below.
# NOTE(review): the last level reads `company_name_mphone_list`; confirm that column
# exists in the input frames before switching the model to this comparison.
experimental_company_name_comparison = cl.CustomComparison(
    comparison_levels=[
        cll.NullLevel("company_name"),
        cll.ExactMatchLevel("company_name"),
        # cll.ExactMatchLevel("company_name_no_legal"),
        # cll.LevenshteinLevel("company_name", distance_threshold=1),
        cll.JaroWinklerLevel("company_name_no_legal", distance_threshold=.95),
        # cll.ArraySubsetLevel("company_name_mphone_list"),
        cll.ArrayIntersectLevel("company_name_mphone_list", min_intersection=3),
    ],
    output_column_name="company_name",
    comparison_description=None,
)

In [422]:
# NOT USED — experimental alternative to the imported `address_comparison`.
# It is bound to its own name so that running this cell (e.g. via Restart & Run All)
# does not shadow the comparison imported from sec_eia_splink_config, which is the
# one passed to SettingsCreator below.
# NOTE(review): the last level reads `street_address_list`; confirm that column
# exists in the input frames before switching the model to this comparison.
experimental_address_comparison = cl.CustomComparison(
    comparison_levels=[
        cll.NullLevel("street_address"),
        cll.ExactMatchLevel("street_address"),
        cll.LevenshteinLevel("street_address", distance_threshold=1),
        cll.ArraySubsetLevel("street_address_list"),
    ],
    output_column_name="street_address",
    comparison_description=None,
)

In [122]:
# Print the comparison levels and SQL rules of the company name comparison for the DuckDB dialect.
print(company_name_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'NameComparison' of "company_name_no_legal".
Similarity is assessed using the following ComparisonLevels:
    - 'company_name_no_legal is NULL' with SQL rule: "company_name_no_legal_l" IS NULL OR "company_name_no_legal_r" IS NULL
    - 'Exact match on company_name_no_legal' with SQL rule: "company_name_no_legal_l" = "company_name_no_legal_r"
    - 'Jaro-Winkler distance of company_name_no_legal >= 0.95' with SQL rule: jaro_winkler_similarity("company_name_no_legal_l", "company_name_no_legal_r") >= 0.95
    - 'All other comparisons' with SQL rule: ELSE



In [123]:
# Print the comparison levels and SQL rules of the street address comparison for the DuckDB dialect.
print(address_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'LevenshteinAtThresholds' of "street_address".
Similarity is assessed using the following ComparisonLevels:
    - 'street_address is NULL' with SQL rule: "street_address_l" IS NULL OR "street_address_r" IS NULL
    - 'Exact match on street_address' with SQL rule: "street_address_l" = "street_address_r"
    - 'Levenshtein distance of street_address <= 1' with SQL rule: levenshtein("street_address_l", "street_address_r") <= 1
    - 'All other comparisons' with SQL rule: ELSE



In [124]:
# Print the comparison levels and SQL rules of the state comparison for the DuckDB dialect.
print(state_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'ExactMatch' of "state".
Similarity is assessed using the following ComparisonLevels:
    - 'state is NULL' with SQL rule: "state_l" IS NULL OR "state_r" IS NULL
    - 'Exact match on state' with SQL rule: "state_l" = "state_r"
    - 'All other comparisons' with SQL rule: ELSE



In [125]:
# Print the comparison levels and SQL rules of the city comparison for the DuckDB dialect.
print(city_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'NameComparison' of "city".
Similarity is assessed using the following ComparisonLevels:
    - 'city is NULL' with SQL rule: "city_l" IS NULL OR "city_r" IS NULL
    - 'Exact match on city' with SQL rule: "city_l" = "city_r"
    - 'Jaro-Winkler distance of city >= 0.9' with SQL rule: jaro_winkler_similarity("city_l", "city_r") >= 0.9
    - 'All other comparisons' with SQL rule: ELSE



In [126]:
# Comparisons scored for every candidate pair, in the order given to Splink.
model_comparisons = [
    company_name_comparison,
    address_comparison,
    state_comparison,
    city_comparison,
]

# Link-only model: records are compared across the two inputs, identified by record_id.
settings = SettingsCreator(
    unique_id_column_name="record_id",
    link_type="link_only",
    blocking_rules_to_generate_predictions=BLOCKING_RULES,
    comparisons=model_comparisons,
    retain_intermediate_calculation_columns=True,
)

# SEC is passed first and EIA second; the linker gets its own DuckDB backend.
linker = Linker([sec_match_df, eia_match_df], settings, db_api=DuckDBAPI())

In [127]:
# Estimate the prior probability that two random records match, using the deterministic rules.
# NOTE(review): recall=0.95 is an assumed recall for those rules, not a value measured in this notebook.
linker.training.estimate_probability_two_random_records_match(deterministic_blocking_rules, recall=0.95)

Probability two random records match is estimated to be  2.37e-06.
This means that amongst all possible pairwise record comparisons, one in 421,176.28 are expected to match.  With 1,270,622,346 total possible comparisons, we expect a total of around 3,016.84 matching pairs


In [128]:
# Estimate the u probabilities from randomly sampled record pairs.
# A fixed seed makes the sample, and therefore the estimates, repeatable across runs.
# NOTE(review): Splink documents `seed` as supported on the DuckDB backend — confirm for the installed version.
linker.training.estimate_u_using_random_sampling(max_pairs=1e8, seed=42)

----- Estimating u probabilities using random sampling -----


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - company_name_no_legal (no m values are trained).
    - street_address (no m values are trained).
    - state (no m values are trained).
    - city (no m values are trained).


In [129]:
# EM training session 1: estimate m probabilities on pairs that share an exact company_name.
# The column is passed to block_on once; repeating it only produced a redundant
# `A AND A` join condition.
# NOTE(review): the name comparison is on company_name_no_legal, so it is still
# trained in this session even though the pairs are blocked on company_name; the
# training log reports its non-exact levels as "not observed" — check whether the
# m values this session produces for that comparison should be trusted.
# NOTE(review): the `fname_sname` variable name looks inherited from an example; it
# holds this company_name training session.
training_blocking_rule = block_on("company_name")
training_session_fname_sname = (
    linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."company_name" = r."company_name") AND (l."company_name" = r."company_name")

Parameter estimates will be made for the following comparison(s):
    - company_name_no_legal
    - street_address
    - state
    - city

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Level Jaro-Winkler distance of company_name_no_legal >= 0.95 on comparison company_name_no_legal not observed in dataset, unable to train m value

Level All other comparisons on comparison company_name_no_legal not observed in dataset, unable to train m value

Iteration 1: Largest change in params was 0.702 in the m_probability of street_address, level `All other comparisons`
Iteration 2: Largest change in params was 0.283 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.282 in probability_two_random_records_match
Iteration 

In [130]:
# EM training session 2: estimate m probabilities on pairs that share an exact street_address.
# Blocking on street_address means its own comparison cannot be trained here, so
# this session covers the name, state and city comparisons.
# The column is passed to block_on once; repeating it only produced a redundant
# `A AND A` join condition.
# NOTE(review): this rebinds `training_session_fname_sname`, discarding the handle
# to the previous training session.
training_blocking_rule = block_on("street_address")
training_session_fname_sname = (
    linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."street_address" = r."street_address") AND (l."street_address" = r."street_address")

Parameter estimates will be made for the following comparison(s):
    - company_name_no_legal
    - state
    - city

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - street_address

Iteration 1: Largest change in params was -0.967 in the m_probability of company_name_no_legal, level `Exact match on company_name_no_legal`
Iteration 2: Largest change in params was 0.477 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.0395 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.0443 in the m_probability of city, level `All other comparisons`
Iteration 5: Largest change in params was 0.0195 in probability_two_random_records_match
Iteration 6: Largest change in params was

In [131]:
# Chart the match weights of the trained model's comparison levels.
linker.visualisations.match_weights_chart()

In [132]:
# Chart the trained m and u parameters of the model's comparison levels.
linker.visualisations.m_u_parameters_chart()

In [420]:
# Write the trained model to JSON, replacing any existing file of the same name.
# The return value is stored under its own name so that the `settings` object used
# to build the linker is not overwritten with a different kind of value.
saved_model_settings = linker.misc.save_model_to_json(
    "model_unsupervised_city_state_0.json", overwrite=True
)

## Make Predictions

In [133]:
# it's helpful to keep threshold at .5 just to see what makes it into blocking
# df_predictions = linker.inference.predict(threshold_match_probability=0.5)
# Here no threshold is passed; low-probability pairs are filtered later in pandas.
df_predictions = linker.inference.predict()

Blocking time: 0.16 seconds
Predict time: 0.31 seconds


In [134]:
# Pull the Splink predictions into a pandas DataFrame for inspection and validation.
preds_df = df_predictions.as_pandas_dataframe()

In [135]:
# Display the predictions ordered from lowest to highest match probability.
preds_df.sort_values(by="match_probability")

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,company_name_no_legal_l,company_name_no_legal_r,gamma_company_name_no_legal,tf_company_name_no_legal_l,tf_company_name_no_legal_r,bf_company_name_no_legal,bf_tf_adj_company_name_no_legal,street_address_l,street_address_r,gamma_street_address,tf_street_address_l,tf_street_address_r,bf_street_address,bf_tf_adj_street_address,state_l,state_r,gamma_state,tf_state_l,tf_state_r,bf_state,bf_tf_adj_state,city_l,city_r,gamma_city,tf_city_l,tf_city_r,bf_city,bf_tf_adj_city,company_name_mphone_l,company_name_mphone_r,match_key
295287,-22.970759,1.216501e-07,__splink__input_table_0,__splink__input_table_1,9829,3043,capitol bancorp,capital power,0,0.000024,0.000012,0.986045,1.000000,capitol bancorp ctr,120010423 101 st nw,0,0.000012,0.000110,0.881657,1.000000,mi,ab,0,0.015147,0.000197,0.198711,1.000000,lansing,edmonton,0,0.000293,0.000428,0.296590,1.000000,KPTL BNKRP,KPTL PWR,0
383898,-22.970759,1.216501e-07,__splink__input_table_0,__splink__input_table_1,51783,17550,state bancorp,state street bank and trust,0,0.000024,0.000024,0.986045,1.000000,2 jericho plz,100 summer st,0,0.000012,0.000024,0.881657,1.000000,ny,ma,0,0.120228,0.041765,0.198711,1.000000,jericho,boston,0,0.000306,0.014319,0.296590,1.000000,STT BNKRP,STT STRT BNK ANT TRST,0
383897,-22.970759,1.216501e-07,__splink__input_table_0,__splink__input_table_1,51782,17550,state auto financial,state street bank and trust,0,0.000024,0.000024,0.986045,1.000000,518 east broad st,100 summer st,0,0.000012,0.000024,0.881657,1.000000,oh,ma,0,0.016991,0.041765,0.198711,1.000000,columbus,boston,0,0.002788,0.014319,0.296590,1.000000,STT AT FNNXL,STT STRT BNK ANT TRST,0
383896,-22.970759,1.216501e-07,__splink__input_table_0,__splink__input_table_1,51781,17550,state auto financial,state street bank and trust,0,0.000024,0.000024,0.986045,1.000000,518 e broad st,100 summer st,0,0.000012,0.000024,0.881657,1.000000,oh,ma,0,0.016991,0.041765,0.198711,1.000000,columbus,boston,0,0.002788,0.014319,0.296590,1.000000,STT AT FNNXL,STT STRT BNK ANT TRST,0
383895,-22.970759,1.216501e-07,__splink__input_table_0,__splink__input_table_1,51780,3805,starz,citrus world,0,0.000024,0.000049,0.986045,1.000000,8900 liberty cir,20205 hwy 2720205 hwy 27,0,0.000024,0.000012,0.881657,1.000000,co,fl,0,0.023802,0.048477,0.198711,1.000000,englewood,lake wales,0,0.002947,0.000049,0.296590,1.000000,STRS,STRS WRLT,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186872,27.519625,1.000000e+00,__splink__input_table_0,__splink__input_table_1,39816,13109,northwestern public service,northwestern public service,2,0.000073,0.000073,652179.111493,0.010580,33 third st se,33 third st se,2,0.000037,0.000037,9450.378101,0.317122,sd,sd,1,0.001930,0.001930,15.873789,26.483035,huron,huron,2,0.000073,0.000073,108.031428,86.293486,NR0WSTRN PBLK SRFS,NR0WSTRN PBLK SRFS,0
580681,27.526533,1.000000e+00,__splink__input_table_0,__splink__input_table_1,24650,8047,green mountain power,green mountain power,2,0.000037,0.000037,652179.111493,0.021160,163 acorn ln,163 acorn ln,2,0.000037,0.000037,9450.378101,0.317122,vt,vt,1,0.001537,0.001537,15.873789,33.262692,colchester,colchester,2,0.000183,0.000183,108.031428,34.517394,KRN MNTN PWR,KRN MNTN PWR,0
438193,27.757357,1.000000e+00,__splink__input_table_0,__splink__input_table_1,58842,19906,wausau paper mills,wausau paper mills,2,0.000024,0.000024,652179.111493,0.031739,one clarks is,one clarks is,2,0.000024,0.000024,9450.378101,0.475683,wi,wi,1,0.008840,0.008840,15.873789,5.782805,wausau,wausau,2,0.000061,0.000061,108.031428,103.552183,WS PPR MLS,WS PPR MLS,0
385934,27.884385,1.000000e+00,__splink__input_table_0,__splink__input_table_1,51567,17450,st joseph light and power,st joseph light and power,2,0.000024,0.000024,652179.111493,0.031739,520 francis st,520 francis st,2,0.000024,0.000024,9450.378101,0.475683,mo,mo,1,0.010118,0.010118,15.873789,5.052049,st joseph,st joseph,2,0.000049,0.000049,108.031428,129.440229,ST JSF LT ANT PWR,ST JSF LT ANT PWR,0


In [136]:
# join on utility_id_eia and CIK
# Step 1: attach the SEC identifiers to each prediction via the left-hand record ID.
sec_id_cols = ["record_id", "sec_company_id", "central_index_key", "company_name_raw"]
preds_validation_df = pd.merge(
    preds_df,
    sec_df[sec_id_cols],
    left_on="record_id_l",
    right_on="record_id",
    how="left",
)

In [137]:
# Step 2: attach the EIA utility ID to each prediction via the right-hand record ID.
eia_ids = eia_df[["record_id", "utility_id_eia"]]
preds_validation_df = pd.merge(
    preds_validation_df,
    eia_ids,
    left_on="record_id_r",
    right_on="record_id",
    how="left",
)

In [138]:
# Flatten to one row per (SEC company, EIA utility) pair, keeping the row with the
# highest match probability for each pair.
pair_cols = ["sec_company_id", "utility_id_eia"]
preds_validation_df = (
    preds_validation_df
    .sort_values(by=[*pair_cols, "match_probability"], ascending=False)
    .drop_duplicates(subset=pair_cols, keep="first")
)

In [139]:
# Inspect the flattened pairs whose match probability is above 0.9.
preds_validation_df[preds_validation_df.match_probability > .9]

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,company_name_no_legal_l,company_name_no_legal_r,gamma_company_name_no_legal,tf_company_name_no_legal_l,tf_company_name_no_legal_r,bf_company_name_no_legal,bf_tf_adj_company_name_no_legal,street_address_l,street_address_r,gamma_street_address,tf_street_address_l,tf_street_address_r,bf_street_address,bf_tf_adj_street_address,state_l,state_r,gamma_state,tf_state_l,tf_state_r,bf_state,bf_tf_adj_state,city_l,city_r,gamma_city,tf_city_l,tf_city_r,bf_city,bf_tf_adj_city,company_name_mphone_l,company_name_mphone_r,match_key,record_id_x,sec_company_id,central_index_key,company_name_raw,record_id_y,utility_id_eia
466134,3.824596,0.934073,__splink__input_table_0,__splink__input_table_1,14692,6293,crane,entergy nuclear power marketing,0,0.000012,0.000012,0.986045,1.000000,100 first stamford pl,100 first stamford pl,2,0.000122,0.000122,9450.378101,0.095137,ct,ct,1,0.020876,0.020876,15.873789,2.448667,stamford,stamford,2,0.003950,0.003950,108.031428,1.602975,KRN,ENTRJ NKLR PWR MRKTNK,1,14692,0001944013,0001944013,crane co,6293,55243
466594,4.620005,0.960922,__splink__input_table_0,__splink__input_table_1,17752,5535,dte electric securitization funding i,dte sustainable generation,0,0.000012,0.000012,0.986045,1.000000,one energy plz,one energy plz,2,0.000330,0.000330,9450.378101,0.035236,mi,mi,1,0.015147,0.015147,15.873789,3.374867,detroit,detroit,2,0.001162,0.001162,108.031428,5.450115,TT ELKTRK SKRTSXN FNTNK I,TT SSTNBL JNRXN,1,17752,0001876068,0001876068,dte electric securitization funding i llc,5535,64331
480747,4.620005,0.960922,__splink__input_table_0,__splink__input_table_1,17752,5522,dte electric securitization funding i,dte electric,0,0.000012,0.000037,0.986045,1.000000,one energy plz,one energy plz,2,0.000330,0.000330,9450.378101,0.035236,mi,mi,1,0.015147,0.015147,15.873789,3.374867,detroit,detroit,2,0.001162,0.001162,108.031428,5.450115,TT ELKTRK SKRTSXN FNTNK I,TT ELKTRK,0,17752,0001876068,0001876068,dte electric securitization funding i llc,5522,5109
464506,6.019599,0.984820,__splink__input_table_0,__splink__input_table_1,14051,10935,constellation energy,luminace solar rhode island,0,0.000024,0.000024,0.986045,1.000000,1310 pt st,1310 pt st,2,0.000024,0.000024,9450.378101,0.475683,md,md,1,0.025130,0.025130,15.873789,2.034167,baltimore,baltimore,2,0.003583,0.003583,108.031428,1.767102,KNSTLXN ENRJ,LMNS SLR RHT ISLNT,1,14051,0001868275,0001868275,constellation energy corp,10935,62679
340973,6.201744,0.986596,__splink__input_table_0,__splink__input_table_1,14051,4420,constellation energy,constellation newenergy,1,0.000024,0.000024,5704.210475,1.000000,1310 pt st,100 constellation way,0,0.000024,0.000183,0.881657,1.000000,md,md,1,0.025130,0.025130,15.873789,2.034167,baltimore,baltimore,2,0.003583,0.003583,108.031428,1.767102,KNSTLXN ENRJ,KNSTLXN NWNRJ,0,14051,0001868275,0001868275,constellation energy corp,4420,58491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464642,5.308053,0.975380,__splink__input_table_0,__splink__input_table_1,1585,6561,air products and chemicals /de/,exelon gen extexlaporte,0,0.000024,0.000012,0.986045,1.000000,7201 hamilton blvd,7201 hamilton blvd,2,0.000122,0.000122,9450.378101,0.095137,pa,pa,1,0.029409,0.029409,15.873789,1.738226,allentown,allentown,2,0.001003,0.001003,108.031428,6.314158,AR PRTKTS ANT XMKLS T,EKSLN JN EKSTKSLPRT,1,1585,0000002969,0000002969,air products & chemicals inc /de/,6561,6081
227094,20.402617,0.999999,__splink__input_table_0,__splink__input_table_1,1586,430,air products and chemicals,air products and chemicals,2,0.000037,0.000037,652179.111493,0.021160,1940 air products blvd,1940 air products blvd,2,0.000049,0.000049,9450.378101,0.237842,pa,pa,1,0.029409,0.029409,15.873789,1.738226,allentown,allentown,2,0.001003,0.001003,108.031428,6.314158,AR PRTKTS ANT XMKLS,AR PRTKTS ANT XMKLS,0,1586,0000002969,0000002969,"air products & chemicals, inc.",430,991
224504,5.308053,0.975380,__splink__input_table_0,__splink__input_table_1,1585,435,air products and chemicals /de/,air products,0,0.000024,0.000037,0.986045,1.000000,7201 hamilton blvd,7201 hamilton blvd,2,0.000122,0.000122,9450.378101,0.095137,pa,pa,1,0.029409,0.029409,15.873789,1.738226,allentown,allentown,2,0.001003,0.001003,108.031428,6.314158,AR PRTKTS ANT XMKLS T,AR PRTKTS,0,1585,0000002969,0000002969,air products & chemicals inc /de/,435,980
225982,5.308053,0.975380,__splink__input_table_0,__splink__input_table_1,1585,432,air products and chemicals /de/,air products energy enterprises,0,0.000024,0.000012,0.986045,1.000000,7201 hamilton blvd,7201 hamilton blvd,2,0.000122,0.000122,9450.378101,0.095137,pa,pa,1,0.029409,0.029409,15.873789,1.738226,allentown,allentown,2,0.001003,0.001003,108.031428,6.314158,AR PRTKTS ANT XMKLS T,AR PRTKTS ENRJ ENTRPRSS,0,1585,0000002969,0000002969,air products & chemicals inc /de/,432,353


In [140]:
# Load the labelled validation pairs from a CSV in the working directory.
# central_index_key is read as a string so it is not parsed as a number.
validation_df = pd.read_csv("sec_eia_validation_set.csv", dtype={"central_index_key": str})

In [141]:
# Left-pad the CIK with zeros to 10 characters so it matches the zero-padded
# central_index_key values carried on the predictions.
validation_df["central_index_key"] = validation_df["central_index_key"].str.zfill(10)

In [142]:
# Attach the model's prediction (if any) to every labelled validation pair.
# `_merge` records whether a prediction row was found for the pair.
pred_cols = [
    "record_id_l",
    "record_id_r",
    "central_index_key",
    "utility_id_eia",
    "match_probability",
    "gamma_company_name_no_legal",
]
unique_preds = preds_validation_df[pred_cols].drop_duplicates(keep="first")
merged_df = pd.merge(
    validation_df,
    unique_preds,
    on=["central_index_key", "utility_id_eia"],
    how="left",
    indicator=True,
)

In [143]:
# Start from the merge indicator: 1 when a prediction row exists for the labelled pair, 0 when none does.
merged_df["predicted_match"] = merged_df["_merge"].map({"both": 1, "left_only": 0})

In [144]:
# Only count a pair as a predicted match when its probability clears the cutoff;
# everything else (including pairs with no prediction, whose probability is NaN)
# becomes 0. The comparison is inclusive (>=) so that the evaluation uses the same
# `>= .95` rule that is applied when building one_to_one_preds.
merged_df["predicted_match"] = merged_df["predicted_match"].where(
    (merged_df.match_probability >= .95),
    0
)

In [145]:
# Inspect the labelled pairs alongside their match probability and predicted label.
merged_df.head(50)

Unnamed: 0,central_index_key,utility_id_eia,sec_company_name,eia_company_name,match,record_id_l,record_id_r,match_probability,gamma_company_name_no_legal,_merge,predicted_match
0,3153,195,alabama power co,,1,1701.0,478.0,1.0,2.0,both,1.0
1,1868941,58702,"fluence energy, inc.",Fluence,0,21792.0,6889.0,0.016529,0.0,both,0.0
2,41091,7140,georgia power co,,1,23416.0,7653.0,0.999997,2.0,both,1.0
3,22198,4062,columbus southern power co /oh/,Columbus Southern Power Co,1,13310.0,4281.0,0.999981,1.0,both,1.0
4,1326160,5416,duke energy corp,,1,17793.0,5564.0,0.927294,2.0,both,0.0
5,30371,54905,"duke energy carolinas, llc",Duke Energy Carolinas LLC,1,17790.0,5558.0,0.999987,2.0,both,1.0
6,869446,57140,berkshire realty co inc /de,Berkshire Wind Power Cooperative Corp,0,7449.0,1712.0,0.001912,0.0,both,0.0
7,92122,18195,southern co,southern co services inc,0,50962.0,17068.0,0.007216,0.0,both,0.0
8,92122,17650,southern co,Southern Power Co,0,50963.0,17089.0,0.034232,0.0,both,0.0
9,75488,14328,pacific gas & electric co,,1,41598.0,13933.0,0.999948,2.0,both,1.0


In [146]:
# Score the thresholded predictions against the validation labels.
y_true = merged_df["match"]
y_pred = merged_df["predicted_match"]

precision, recall, accuracy = (
    metric(y_true, y_pred)
    for metric in (precision_score, recall_score, accuracy_score)
)
# roc_auc = roc_auc_score(merged_df['match'], merged_df['match_probability'])

# Confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)

In [147]:
# Display precision, recall and accuracy on the validation set.
precision, recall, accuracy

(np.float64(0.8666666666666667), np.float64(0.8125), 0.7916666666666666)

In [148]:
# Label the confusion matrix for display: rows are the validation labels,
# columns are the predicted labels.
row_labels = ["Negative", "Positive"]
col_labels = ["Predicted Negative", "Predicted Positive"]
pd.DataFrame(data=conf_matrix, index=row_labels, columns=col_labels)

Unnamed: 0,Predicted Negative,Predicted Positive
Negative,6,2
Positive,3,13


In [149]:
# Validation pairs where the predicted label disagrees with the labelled `match` value.
incorrect_df = merged_df[merged_df.match != merged_df.predicted_match]

In [150]:
# Display the misclassified validation pairs.
incorrect_df

Unnamed: 0,central_index_key,utility_id_eia,sec_company_name,eia_company_name,match,record_id_l,record_id_r,match_probability,gamma_company_name_no_legal,_merge,predicted_match
4,1326160,5416,duke energy corp,,1,17793.0,5564.0,0.927294,2.0,both,0.0
10,1031296,6526,firstenergy corp,FirstEnergy,0,21579.0,6776.0,0.999998,2.0,both,1.0
11,1031296,54776,firstenergy corp,FirstEnergy Nuclear Generation Corp,0,21579.0,6780.0,0.986543,0.0,both,1.0
13,1031296,32208,firstenergy corp,First Energy Corp,1,,,,,left_only,0.0
21,1032208,61296,sempra energy,Sempra Generation,1,49303.0,16270.0,0.559074,0.0,both,0.0


In [151]:
# Collect the full prediction row (one dict per pair) for every misclassified
# validation pair, for use in the waterfall chart.
recs_to_view = []
for rec in incorrect_df.itertuples(index=False):
    pair_mask = (
        (preds_validation_df.record_id_l == rec.record_id_l)
        & (preds_validation_df.record_id_r == rec.record_id_r)
    )
    matching_preds = preds_validation_df[pair_mask]
    # Validation pairs with no prediction have NaN record IDs and match nothing.
    if matching_preds.empty:
        continue
    # Take the first matching row explicitly: squeezing a multi-row selection
    # would yield a DataFrame, whose to_dict() is a nested dict rather than a record.
    recs_to_view.append(matching_preds.iloc[0].to_dict())

In [152]:
# Waterfall chart for the misclassified pairs collected in recs_to_view.
linker.visualisations.waterfall_chart(recs_to_view, filter_nulls=True)

# Save good predictions
Make the predictions one-to-one. First, keep the highest-probability EIA utility ID for each SEC company. Then, keep the highest-probability SEC company for each EIA utility.

In [153]:
# Build one-to-one matches from the confident predictions (probability >= .95).
confident_preds = preds_validation_df[preds_validation_df.match_probability >= .95]
ranked_preds = confident_preds.sort_values(by="match_probability", ascending=False)
# Best EIA utility for each SEC company ...
best_per_sec_company = ranked_preds.drop_duplicates(subset="sec_company_id", keep="first")
# ... then best remaining SEC company for each EIA utility.
one_to_one_preds = best_per_sec_company.drop_duplicates(subset="utility_id_eia", keep="first")

In [154]:
# Number of one-to-one SEC–EIA matches kept.
len(one_to_one_preds)

525