In [1]:
%load_ext autoreload
%autoreload 3

In [2]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix
from splink import block_on, DuckDBAPI, Linker, SettingsCreator
from splink.blocking_analysis import count_comparisons_from_blocking_rule, cumulative_comparisons_to_be_scored_from_blocking_rules_chart, n_largest_blocks
import splink.comparison_library as cl
import splink.comparison_level_library as cll
from splink.exploratory import completeness_chart, profile_columns

from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import (
    BLOCKING_RULES,
    MATCH_COLS,
    SHARED_COLS,
    address_comparison,
    city_comparison,
    company_name_comparison,
    deterministic_blocking_rules,
    state_comparison
)

# Inputs

### EIA

In [3]:
# Load the EIA parents-and-subsidiaries table from the v2 folder of the
# sec10k-outputs GCS bucket.
eia_df = pd.read_parquet("gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet")

In [4]:
# Number of EIA records loaded.
len(eia_df)

20821

### SEC 10K Basic Info

In [5]:
# Load the SEC 10-K parents-and-subsidiaries table from a pickle under a local
# Dagster storage directory. The storage root is read from the DAGSTER_HOME
# environment variable so the notebook is not tied to one machine; the path
# originally hard-coded here is kept as the fallback.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles
# produced by a trusted pipeline run.
import os
from pathlib import Path

DAGSTER_HOME = Path(
    os.environ.get("DAGSTER_HOME", "/Users/katielamb/CatalystCoop/dagster_home")
)
sec_df = pd.read_pickle(DAGSTER_HOME / "storage" / "core_sec_10k__parents_and_subsidiaries")

In [6]:
# Number of SEC records loaded.
len(sec_df)

61026

# Preprocess SEC and EIA

**Open question:** Would it make more sense to do a direct match on company name after
the SEC basic-info-to-EIA match is done, and then manually review any conflicting SEC matches (one from basic info and one from Ex. 21)?

In [7]:
# Keep only the rows flagged by `files_10k`, restricted to the columns that are
# shared between the SEC and EIA tables.
files_10k_mask = sec_df.files_10k
sec_match_df = sec_df.loc[files_10k_mask, SHARED_COLS]

In [8]:
# Restrict the EIA table to the same shared columns used for the SEC table.
eia_match_df = eia_df[SHARED_COLS]

In [9]:
# Sanity check: record_id is passed to Splink as the unique ID column, so it must be unique.
sec_match_df.record_id.is_unique

True

In [10]:
# Same uniqueness check for the EIA side.
eia_match_df.record_id.is_unique

True

In [11]:
# Note that sec_company_id isn't unique here because we are keeping each unique
# company name and address pair, so one company can appear on several rows.
# (This checks the full sec_df, not the filtered sec_match_df.)
sec_df.sec_company_id.is_unique

False

There should probably be no duplicate records, but if there are, keep the most recent version of each record.

In [12]:
# Count EIA rows that repeat an earlier row on every matching column
# (True = duplicate, False = first occurrence).
eia_match_df.duplicated(subset=MATCH_COLS).value_counts()

False    20821
Name: count, dtype: int64

In [13]:
# Same duplicate count for the SEC rows.
sec_match_df.duplicated(subset=MATCH_COLS).value_counts()

False    61026
Name: count, dtype: int64

In [14]:
# De-duplicate on the matching columns. Rows are ordered newest report_year
# first, so keep="first" retains the most recent version of each record.
sec_newest_first = sec_match_df.sort_values(by="report_year", ascending=False)
sec_match_df = sec_newest_first.drop_duplicates(subset=MATCH_COLS, keep="first")

eia_newest_first = eia_match_df.sort_values(by="report_year", ascending=False)
eia_match_df = eia_newest_first.drop_duplicates(subset=MATCH_COLS, keep="first")

# Link SEC and EIA

## Exploratory Analysis

In [15]:
# DuckDB backend handle reused by the exploratory and blocking-analysis calls below.
db_api = DuckDBAPI()

In [16]:
# Chart how completely each column is populated in the SEC match table.
completeness_chart(sec_match_df, db_api=db_api)

In [17]:
# Same completeness chart for the EIA match table.
completeness_chart(eia_match_df, db_api=db_api)

In [18]:
# Profile the value distribution of each SEC matching column, showing the 10
# most and 5 least frequent values.
# NOTE(review): this opens a fresh DuckDBAPI() rather than reusing `db_api` -- confirm that is intended.
profile_columns(sec_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)

In [19]:
# Profile the value distribution of each EIA matching column, showing the 10
# most and 5 least frequent values.
# NOTE(review): this opens a fresh DuckDBAPI() rather than reusing `db_api` -- confirm that is intended.
profile_columns(eia_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)

## Blocking

In [20]:
# Useful for experimenting with a new blocking rule: count how many SEC-EIA
# record pairs the first configured blocking rule would generate for scoring.
counts = count_comparisons_from_blocking_rule(
    table_or_tables=[sec_match_df, eia_match_df],
    blocking_rule=BLOCKING_RULES[0],
    link_type="link_only",
    unique_id_column_name='record_id',
    db_api=db_api,
)

counts

{'number_of_comparisons_generated_pre_filter_conditions': 487944,
 'number_of_comparisons_to_be_scored_post_filter_conditions': 487944,
 'filter_conditions_identified': '',
 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',
 'link_type_join_condition': 'where l."source_dataset" || \'-__-\' || l."record_id" < r."source_dataset" || \'-__-\' || r."record_id" and l."source_dataset" != r."source_dataset"'}

In [21]:
# Show the three blocking-key values that generate the most pairwise
# comparisons under the first blocking rule; very large blocks point to a key
# that is too coarse.
result = n_largest_blocks(
    table_or_tables=[sec_match_df, eia_match_df],
    blocking_rule=BLOCKING_RULES[0],
    link_type="link_only",
    db_api=db_api,
    n_largest=3
)

result.as_pandas_dataframe()

Unnamed: 0,key_0,count_l,count_r,block_count
0,INTR,445,76,33820
1,AMRK,851,38,32338
2,FRST,816,36,29376


In [22]:
# Chart the cumulative number of comparisons to be scored as each rule in
# BLOCKING_RULES is added in turn.
cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
    table_or_tables=[sec_match_df, eia_match_df],
    blocking_rules=BLOCKING_RULES,
    db_api=db_api,
    unique_id_column_name='record_id',
    link_type="link_only",
)

## Create Model

In [23]:
# Print the comparison levels (as DuckDB SQL rules) used for the company name.
print(company_name_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'NameComparison' of "company_name_no_legal".
Similarity is assessed using the following ComparisonLevels:
    - 'company_name_no_legal is NULL' with SQL rule: "company_name_no_legal_l" IS NULL OR "company_name_no_legal_r" IS NULL
    - 'Exact match on company_name_no_legal' with SQL rule: "company_name_no_legal_l" = "company_name_no_legal_r"
    - 'Jaro-Winkler distance of company_name_no_legal >= 0.95' with SQL rule: jaro_winkler_similarity("company_name_no_legal_l", "company_name_no_legal_r") >= 0.95
    - 'All other comparisons' with SQL rule: ELSE



In [24]:
# Print the comparison levels (as DuckDB SQL rules) used for the street address.
print(address_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'LevenshteinAtThresholds' of "street_address".
Similarity is assessed using the following ComparisonLevels:
    - 'street_address is NULL' with SQL rule: "street_address_l" IS NULL OR "street_address_r" IS NULL
    - 'Exact match on street_address' with SQL rule: "street_address_l" = "street_address_r"
    - 'Levenshtein distance of street_address <= 1' with SQL rule: levenshtein("street_address_l", "street_address_r") <= 1
    - 'All other comparisons' with SQL rule: ELSE



In [25]:
# Print the comparison levels (as DuckDB SQL rules) used for the state.
print(state_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'ExactMatch' of "state".
Similarity is assessed using the following ComparisonLevels:
    - 'state is NULL' with SQL rule: "state_l" IS NULL OR "state_r" IS NULL
    - 'Exact match on state' with SQL rule: "state_l" = "state_r"
    - 'All other comparisons' with SQL rule: ELSE



In [26]:
# Print the comparison levels (as DuckDB SQL rules) used for the city.
print(city_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'NameComparison' of "city".
Similarity is assessed using the following ComparisonLevels:
    - 'city is NULL' with SQL rule: "city_l" IS NULL OR "city_r" IS NULL
    - 'Exact match on city' with SQL rule: "city_l" = "city_r"
    - 'Jaro-Winkler distance of city >= 0.9' with SQL rule: jaro_winkler_similarity("city_l", "city_r") >= 0.9
    - 'All other comparisons' with SQL rule: ELSE



In [27]:
# Model definition: link-only (SEC <-> EIA pairs, no within-dataset pairs),
# keyed on record_id, comparing company name, street address, state and city.
settings = SettingsCreator(
    link_type="link_only",
    unique_id_column_name="record_id",
    comparisons=[
        company_name_comparison,
        address_comparison,
        state_comparison,
        city_comparison
    ],
    blocking_rules_to_generate_predictions=BLOCKING_RULES,
    # keep the per-comparison gamma / term-frequency / Bayes-factor columns in the predictions
    retain_intermediate_calculation_columns=True,
)

# Input order matters downstream: SEC is the first (left, `_l`) table and EIA the second (right, `_r`).
linker = Linker([sec_match_df, eia_match_df], settings, db_api=DuckDBAPI())

In [28]:
# Estimate the prior probability that two random records match, using the
# deterministic rules and assuming they recover 95% of the true matches.
linker.training.estimate_probability_two_random_records_match(deterministic_blocking_rules, recall=0.95)

Probability two random records match is estimated to be  2.37e-06.
This means that amongst all possible pairwise record comparisons, one in 421,176.28 are expected to match.  With 1,270,622,346 total possible comparisons, we expect a total of around 3,016.84 matching pairs


In [29]:
# Estimate the u probabilities from a random sample of up to 1e8 record pairs.
# A fixed seed makes the sample, and therefore the estimated u values,
# reproducible across kernel restarts.
linker.training.estimate_u_using_random_sampling(max_pairs=1e8, seed=42)

----- Estimating u probabilities using random sampling -----


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - company_name_no_legal (no m values are trained).
    - street_address (no m values are trained).
    - state (no m values are trained).
    - city (no m values are trained).


In [30]:
# EM training session 1: estimate m probabilities using only the pairs that
# agree exactly on company_name. The column is passed once; passing it twice
# only produced a duplicated, redundant SQL predicate.
# NOTE(review): the blocking column is `company_name` but the comparison uses
# `company_name_no_legal`, so Splink still tries to train that comparison in
# this session and reports its non-exact levels as "not observed". Confirm
# whether blocking on `company_name_no_legal` was intended.
# (The `fname_sname` variable name appears to be a tutorial leftover; kept for compatibility.)
training_blocking_rule = block_on("company_name")
training_session_fname_sname = (
    linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."company_name" = r."company_name") AND (l."company_name" = r."company_name")

Parameter estimates will be made for the following comparison(s):
    - company_name_no_legal
    - street_address
    - state
    - city

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Level Jaro-Winkler distance of company_name_no_legal >= 0.95 on comparison company_name_no_legal not observed in dataset, unable to train m value

Level All other comparisons on comparison company_name_no_legal not observed in dataset, unable to train m value

Iteration 1: Largest change in params was 0.702 in the m_probability of street_address, level `All other comparisons`
Iteration 2: Largest change in params was 0.283 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.282 in probability_two_random_records_match
Iteration 

In [31]:
# EM training session 2: estimate m probabilities using only the pairs that
# agree exactly on street_address, which lets the company-name, state and city
# comparisons be trained. The column is passed once; passing it twice only
# produced a duplicated, redundant SQL predicate.
# (The `fname_sname` variable name appears to be a tutorial leftover; kept for compatibility.)
training_blocking_rule = block_on("street_address")
training_session_fname_sname = (
    linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."street_address" = r."street_address") AND (l."street_address" = r."street_address")

Parameter estimates will be made for the following comparison(s):
    - company_name_no_legal
    - state
    - city

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - street_address

Iteration 1: Largest change in params was -0.967 in the m_probability of company_name_no_legal, level `Exact match on company_name_no_legal`
Iteration 2: Largest change in params was 0.472 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.0399 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.044 in the m_probability of city, level `All other comparisons`
Iteration 5: Largest change in params was 0.0192 in probability_two_random_records_match
Iteration 6: Largest change in params was 

In [32]:
# Visualise the trained match weight of every comparison level.
linker.visualisations.match_weights_chart()

In [33]:
# Visualise the trained m and u probabilities of every comparison level.
linker.visualisations.m_u_parameters_chart()

In [420]:
# You could save the model weights like this. The value returned by
# save_model_to_json is bound to its own name so that `settings` keeps
# referring to the SettingsCreator defined when the linker was built.
trained_model_settings = linker.misc.save_model_to_json(
    "model_unsupervised_0.json", overwrite=True
)

## Make Predictions

In [34]:
# Score every candidate pair produced by the prediction blocking rules.
df_predictions = linker.inference.predict()

Blocking time: 0.14 seconds
Predict time: 0.26 seconds


In [35]:
# Pull the scored pairs out of the backend into a pandas DataFrame.
preds_df = df_predictions.as_pandas_dataframe()

In [36]:
# Eyeball the least and most confident predictions (ascending match probability).
preds_df.sort_values(by="match_probability")

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,company_name_no_legal_l,company_name_no_legal_r,gamma_company_name_no_legal,tf_company_name_no_legal_l,tf_company_name_no_legal_r,bf_company_name_no_legal,bf_tf_adj_company_name_no_legal,street_address_l,street_address_r,gamma_street_address,tf_street_address_l,tf_street_address_r,bf_street_address,bf_tf_adj_street_address,state_l,state_r,gamma_state,tf_state_l,tf_state_r,bf_state,bf_tf_adj_state,city_l,city_r,gamma_city,tf_city_l,tf_city_r,bf_city,bf_tf_adj_city,company_name_mphone_l,company_name_mphone_r,match_key
295287,-22.970354,1.216843e-07,__splink__input_table_0,__splink__input_table_1,56230,19078,union pacific,union electric,0,0.000049,0.000098,0.986046,1.000000,1416 dodge st,mc 1400,0,0.000049,0.000049,0.881656,1.000000,ne,mo,0,0.006455,0.010118,0.198718,1.000000,omaha,st louis,0,0.003448,0.002764,0.296663,1.000000,UNN PSFK,UNN ELKTRK,0
307206,-22.970354,1.216843e-07,__splink__input_table_0,__splink__input_table_1,29764,9337,international lease finance,international paper riegel,0,0.000037,0.000012,0.986046,1.000000,1999 ave of the stars,6400 poplar ave,0,0.000110,0.000061,0.881656,1.000000,ca,tn,0,0.157960,0.010622,0.198718,1.000000,los angeles,memphis,0,0.008107,0.001357,0.296663,1.000000,INTRNXNL LS FNNS,INTRNXNL PPR RJL,0
307205,-22.970354,1.216843e-07,__splink__input_table_0,__splink__input_table_1,29818,9337,international speedway,international paper riegel,0,0.000037,0.000012,0.986046,1.000000,1801 w international speedway blvd,6400 poplar ave,0,0.000012,0.000061,0.881656,1.000000,fl,tn,0,0.048477,0.010622,0.198718,1.000000,daytona beach,memphis,0,0.000245,0.001357,0.296663,1.000000,INTRNXNL SPTW,INTRNXNL PPR RJL,0
307204,-22.970354,1.216843e-07,__splink__input_table_0,__splink__input_table_1,59433,20092,west penn funding,west line solar,0,0.000024,0.000012,0.986046,1.000000,2325b2 renaissance dr,2180 south 1300 east,0,0.000012,0.000110,0.881656,1.000000,nv,ut,0,0.020458,0.010549,0.198718,1.000000,las vegas,salt lake city,0,0.010724,0.005772,0.296663,1.000000,WST PN FNTNK,WST LN SLR,0
307203,-22.970354,1.216843e-07,__splink__input_table_0,__splink__input_table_1,39648,12908,north country financial,north american energy services,0,0.000024,0.000110,0.986046,1.000000,3530 north country dr,1070 erie ave,0,0.000024,0.000037,0.881656,1.000000,mi,ny,0,0.015147,0.120228,0.198718,1.000000,traverse city,north tonawanda,0,0.000269,0.000049,0.296663,1.000000,NR0 KNTR FNNXL,NR0 AMRKN ENRJ SRFSS,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163815,27.519613,1.000000e+00,__splink__input_table_0,__splink__input_table_1,39816,13109,northwestern public service,northwestern public service,2,0.000073,0.000073,477874.511191,0.014439,33 third st se,33 third st se,2,0.000037,0.000037,9888.266177,0.303079,sd,sd,1,0.001930,0.001930,15.866015,26.495963,huron,huron,2,0.000073,0.000073,103.554689,90.023441,NR0WSTRN PBLK SRFS,NR0WSTRN PBLK SRFS,0
241593,27.526521,1.000000e+00,__splink__input_table_0,__splink__input_table_1,24650,8047,green mountain power,green mountain power,2,0.000037,0.000037,477874.511191,0.028877,163 acorn ln,163 acorn ln,2,0.000037,0.000037,9888.266177,0.303079,vt,vt,1,0.001537,0.001537,15.866015,33.278930,colchester,colchester,2,0.000183,0.000183,103.554689,36.009376,KRN MNTN PWR,KRN MNTN PWR,0
165487,27.757345,1.000000e+00,__splink__input_table_0,__splink__input_table_1,58842,19906,wausau paper mills,wausau paper mills,2,0.000024,0.000024,477874.511191,0.043316,one clarks is,one clarks is,2,0.000024,0.000024,9888.266177,0.454618,wi,wi,1,0.008840,0.008840,15.866015,5.785628,wausau,wausau,2,0.000061,0.000061,103.554689,108.028129,WS PPR MLS,WS PPR MLS,0
340414,27.884373,1.000000e+00,__splink__input_table_0,__splink__input_table_1,51567,17450,st joseph light and power,st joseph light and power,2,0.000024,0.000024,477874.511191,0.043316,520 francis st,520 francis st,2,0.000024,0.000024,9888.266177,0.454618,mo,mo,1,0.010118,0.010118,15.866015,5.054515,st joseph,st joseph,2,0.000049,0.000049,103.554689,135.035162,ST JSF LT ANT PWR,ST JSF LT ANT PWR,0


In [37]:
# Attach the SEC identifiers (sec_company_id, CIK, raw company name) to each
# prediction by joining the left-hand (SEC) record_id onto sec_df.
preds_validation_df = preds_df.merge(sec_df[["record_id", "sec_company_id", "central_index_key", "company_name_raw"]],
                                     how="left",
                                     left_on="record_id_l",
                                     right_on="record_id")

In [38]:
# Attach utility_id_eia by joining the right-hand (EIA) record_id onto eia_df.
# Both this frame and eia_df carry a `record_id` column at this point, which is
# why the result shows record_id_x / record_id_y.
preds_validation_df = preds_validation_df.merge(eia_df[["record_id", "utility_id_eia"]],
                                                how="left",
                                                left_on="record_id_r",
                                                right_on="record_id")

In [39]:
# Keep one row per (sec_company_id, utility_id_eia) pair. Everything is sorted
# descending, so within a pair the highest match_probability comes first and is
# the row that keep="first" retains.
preds_validation_df = preds_validation_df.sort_values(
    by=["sec_company_id", "utility_id_eia", "match_probability"], ascending=False
).drop_duplicates(subset=["sec_company_id", "utility_id_eia"], keep="first")

In [40]:
# Preview a few predictions with match probability above 0.9.
preds_validation_df[preds_validation_df.match_probability > .9].head(3)

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,company_name_no_legal_l,company_name_no_legal_r,gamma_company_name_no_legal,tf_company_name_no_legal_l,tf_company_name_no_legal_r,bf_company_name_no_legal,bf_tf_adj_company_name_no_legal,street_address_l,street_address_r,gamma_street_address,tf_street_address_l,tf_street_address_r,bf_street_address,bf_tf_adj_street_address,state_l,state_r,gamma_state,tf_state_l,tf_state_r,bf_state,bf_tf_adj_state,city_l,city_r,gamma_city,tf_city_l,tf_city_r,bf_city,bf_tf_adj_city,company_name_mphone_l,company_name_mphone_r,match_key,record_id_x,sec_company_id,central_index_key,company_name_raw,record_id_y,utility_id_eia
218797,3.824584,0.934073,__splink__input_table_0,__splink__input_table_1,14692,6293,crane,entergy nuclear power marketing,0,1.2e-05,1.2e-05,0.986046,1.0,100 first stamford pl,100 first stamford pl,2,0.000122,0.000122,9888.266177,0.090924,ct,ct,1,0.020876,0.020876,15.866015,2.449862,stamford,stamford,2,0.00395,0.00395,103.554689,1.672262,KRN,ENTRJ NKLR PWR MRKTNK,1,14692,1944013,1944013,crane co,6293,55243
220036,4.619994,0.960922,__splink__input_table_0,__splink__input_table_1,17752,5535,dte electric securitization funding i,dte sustainable generation,0,1.2e-05,1.2e-05,0.986046,1.0,one energy plz,one energy plz,2,0.00033,0.00033,9888.266177,0.033675,mi,mi,1,0.015147,0.015147,15.866015,3.376515,detroit,detroit,2,0.001162,0.001162,103.554689,5.685691,TT ELKTRK SKRTSXN FNTNK I,TT SSTNBL JNRXN,1,17752,1876068,1876068,dte electric securitization funding i llc,5535,64331
481032,4.619994,0.960922,__splink__input_table_0,__splink__input_table_1,17752,5522,dte electric securitization funding i,dte electric,0,1.2e-05,3.7e-05,0.986046,1.0,one energy plz,one energy plz,2,0.00033,0.00033,9888.266177,0.033675,mi,mi,1,0.015147,0.015147,15.866015,3.376515,detroit,detroit,2,0.001162,0.001162,103.554689,5.685691,TT ELKTRK SKRTSXN FNTNK I,TT ELKTRK,0,17752,1876068,1876068,dte electric securitization funding i llc,5522,5109


**TODO:** Figure out what to do about this validation CSV; maybe it should be part of the package data? It's not a very big sample and it's imperfect, so the metrics derived from it should be taken with a grain of salt.

In [41]:
# Load the hand-labelled validation pairs. central_index_key is read as a
# string so it is not parsed as an integer.
validation_df = pd.read_csv("sec_eia_validation_set.csv", dtype={"central_index_key": str})

In [42]:
# Left-pad the CIK with zeros to 10 characters -- presumably to match the
# zero-padded central_index_key format in the SEC table; verify against sec_df.
validation_df["central_index_key"] = validation_df["central_index_key"].str.zfill(10)

In [43]:
# Left-join the model's predictions onto the labelled pairs by (CIK, utility ID).
# indicator=True adds a `_merge` column: "both" when the model scored the pair,
# "left_only" when the pair never appeared in the predictions.
merged_df = validation_df.merge(
    preds_validation_df[["record_id_l", "record_id_r", "central_index_key", "utility_id_eia", "match_probability", "gamma_company_name_no_legal"]].drop_duplicates(keep="first"),
    how="left",
    on=["central_index_key", "utility_id_eia"],
    indicator=True
)

In [44]:
# Provisional label: 1 if the pair was scored at all, 0 if it never appeared in
# the predictions. The probability threshold is applied in the next step.
merged_df["predicted_match"] = merged_df["_merge"].map({"both": 1, "left_only": 0})

In [45]:
# Demote to 0 every pair whose match probability is below the acceptance
# threshold. The comparison is inclusive (>= .95) so that validation measures
# exactly the same decision rule as the `>= .95` filter used when the final
# predictions are selected. Pairs with no prediction have a NaN probability,
# fail the condition and stay 0.
merged_df["predicted_match"] = merged_df["predicted_match"].where(
    (merged_df.match_probability >= .95),
    0
)

In [46]:
# Inspect the labelled pairs next to the model's probability and predicted label.
merged_df.head(50)

Unnamed: 0,central_index_key,utility_id_eia,sec_company_name,eia_company_name,match,record_id_l,record_id_r,match_probability,gamma_company_name_no_legal,_merge,predicted_match
0,3153,195,alabama power co,,1,1701.0,478.0,1.0,2.0,both,1.0
1,1868941,58702,"fluence energy, inc.",Fluence,0,21792.0,6889.0,0.016529,0.0,both,0.0
2,41091,7140,georgia power co,,1,23416.0,7653.0,0.999997,2.0,both,1.0
3,22198,4062,columbus southern power co /oh/,Columbus Southern Power Co,1,13310.0,4281.0,0.999984,1.0,both,1.0
4,1326160,5416,duke energy corp,,1,17793.0,5564.0,0.927294,2.0,both,0.0
5,30371,54905,"duke energy carolinas, llc",Duke Energy Carolinas LLC,1,17790.0,5558.0,0.999987,2.0,both,1.0
6,869446,57140,berkshire realty co inc /de,Berkshire Wind Power Cooperative Corp,0,7449.0,1712.0,0.001912,0.0,both,0.0
7,92122,18195,southern co,southern co services inc,0,50964.0,17068.0,0.007216,0.0,both,0.0
8,92122,17650,southern co,Southern Power Co,0,50963.0,17089.0,0.034232,0.0,both,0.0
9,75488,14328,pacific gas & electric co,,1,41598.0,13933.0,0.999948,2.0,both,1.0


In [47]:
# Score the thresholded predictions against the hand labels.
y_true = merged_df['match']
y_pred = merged_df['predicted_match']

# Confusion matrix (rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_true, y_pred)

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
# ROC AUC is left disabled.
# NOTE(review): match_probability is NaN for labelled pairs that were never
# scored, so those would need filling or dropping before enabling this.
# roc_auc = roc_auc_score(merged_df['match'], merged_df['match_probability'])

In [48]:
# Display the three summary metrics as a tuple.
precision, recall, accuracy

(np.float64(0.8666666666666667), np.float64(0.8125), 0.7916666666666666)

In [49]:
# Render the confusion matrix with readable labels: rows are the true class,
# columns are the predicted class.
pd.DataFrame(
    conf_matrix,
    index=["Negative", "Positive"],
    columns=["Predicted Negative", "Predicted Positive"]
)

Unnamed: 0,Predicted Negative,Predicted Positive
Negative,6,2
Positive,3,13


In [50]:
# Labelled pairs where the thresholded prediction disagrees with the hand label.
incorrect_df = merged_df[merged_df.match != merged_df.predicted_match]

In [51]:
# Display the misclassified validation pairs.
incorrect_df

Unnamed: 0,central_index_key,utility_id_eia,sec_company_name,eia_company_name,match,record_id_l,record_id_r,match_probability,gamma_company_name_no_legal,_merge,predicted_match
4,1326160,5416,duke energy corp,,1,17793.0,5564.0,0.927294,2.0,both,0.0
10,1031296,6526,firstenergy corp,FirstEnergy,0,21579.0,6776.0,0.999998,2.0,both,1.0
11,1031296,54776,firstenergy corp,FirstEnergy Nuclear Generation Corp,0,21579.0,6780.0,0.986543,0.0,both,1.0
13,1031296,32208,firstenergy corp,First Energy Corp,1,,,,,left_only,0.0
21,1032208,61296,sempra energy,Sempra Generation,1,49303.0,16270.0,0.559072,0.0,both,0.0


In [52]:
# Collect the full prediction row for every misclassified validation pair so
# it can be passed to the waterfall chart.
# Each matching row becomes its own record dict via to_dict("records"). This
# stays row-oriented even if a pair matches more than one prediction row
# (squeeze().to_dict() would yield a column-oriented dict in that case), and
# contributes nothing when the pair has no prediction (NaN record ids never
# compare equal).
recs_to_view = []
for sec_rec_id, eia_rec_id in zip(incorrect_df.record_id_l, incorrect_df.record_id_r):
    pair_rows = preds_validation_df[
        (preds_validation_df.record_id_l == sec_rec_id)
        & (preds_validation_df.record_id_r == eia_rec_id)
    ]
    recs_to_view.extend(pair_rows.to_dict("records"))

In [152]:
# Waterfall chart of how each comparison contributed to the match weight of the
# misclassified pairs.
linker.visualisations.waterfall_chart(recs_to_view, filter_nulls=True)

# Save good predictions
Make the predictions one-to-one. First, keep the highest-probability EIA utility ID for each SEC company. Then, keep the highest-probability SEC company for each EIA utility.

In [53]:
# Reduce the accepted predictions (match probability >= .95) to a one-to-one
# mapping: best utility per SEC company first, then best SEC company per utility.
# Many pairs share exactly the same match probability, so the sort adds
# sec_company_id and utility_id_eia as ascending tie-breakers. Without them the
# pair that survives a tie depends on the incoming row order and is not
# reproducible between runs.
one_to_one_preds = (
    preds_validation_df[preds_validation_df.match_probability >= .95]
    .sort_values(
        by=["match_probability", "sec_company_id", "utility_id_eia"],
        ascending=[False, True, True],
    )
    .drop_duplicates(subset="sec_company_id", keep="first")
    .drop_duplicates(subset="utility_id_eia", keep="first")
)

In [54]:
# Number of SEC-EIA pairs kept after enforcing the one-to-one constraint.
len(one_to_one_preds)

534

# Add `utility_id_eia` onto the SEC table to create output table

In [55]:
# Inspect the final one-to-one matches, most confident first.
one_to_one_preds

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,company_name_no_legal_l,company_name_no_legal_r,gamma_company_name_no_legal,tf_company_name_no_legal_l,tf_company_name_no_legal_r,bf_company_name_no_legal,bf_tf_adj_company_name_no_legal,street_address_l,street_address_r,gamma_street_address,tf_street_address_l,tf_street_address_r,bf_street_address,bf_tf_adj_street_address,state_l,state_r,gamma_state,tf_state_l,tf_state_r,bf_state,bf_tf_adj_state,city_l,city_r,gamma_city,tf_city_l,tf_city_r,bf_city,bf_tf_adj_city,company_name_mphone_l,company_name_mphone_r,match_key,record_id_x,sec_company_id,central_index_key,company_name_raw,record_id_y,utility_id_eia
274760,29.211020,1.000000,__splink__input_table_0,__splink__input_table_1,20588,6741,fibermark,fibermark,2,0.000037,0.000037,477874.511191,0.028877,161 wellington rd,161 wellington rd,2,0.000024,0.000024,9888.266177,0.454618,vt,vt,1,0.001537,0.001537,15.866015,33.278930,brattleboro,brattleboro,2,0.000086,0.000086,103.554689,77.162949,FBRMRK,FBRMRK,0,20588,0000887591,0000887591,fibermark inc,6741,6309
340414,27.884373,1.000000,__splink__input_table_0,__splink__input_table_1,51567,17450,st joseph light and power,st joseph light and power,2,0.000024,0.000024,477874.511191,0.043316,520 francis st,520 francis st,2,0.000024,0.000024,9888.266177,0.454618,mo,mo,1,0.010118,0.010118,15.866015,5.054515,st joseph,st joseph,2,0.000049,0.000049,103.554689,135.035162,ST JSF LT ANT PWR,ST JSF LT ANT PWR,0,51567,0000086251,0000086251,st joseph light & power co,17450,17881
165487,27.757345,1.000000,__splink__input_table_0,__splink__input_table_1,58842,19906,wausau paper mills,wausau paper mills,2,0.000024,0.000024,477874.511191,0.043316,one clarks is,one clarks is,2,0.000024,0.000024,9888.266177,0.454618,wi,wi,1,0.008840,0.008840,15.866015,5.785628,wausau,wausau,2,0.000061,0.000061,103.554689,108.028129,WS PPR MLS,WS PPR MLS,0,58842,0000105076,0000105076,wausau paper mills co,19906,20190
241593,27.526521,1.000000,__splink__input_table_0,__splink__input_table_1,24650,8047,green mountain power,green mountain power,2,0.000037,0.000037,477874.511191,0.028877,163 acorn ln,163 acorn ln,2,0.000037,0.000037,9888.266177,0.303079,vt,vt,1,0.001537,0.001537,15.866015,33.278930,colchester,colchester,2,0.000183,0.000183,103.554689,36.009376,KRN MNTN PWR,KRN MNTN PWR,0,24650,0000043704,0000043704,green mountain power corp,8047,7601
163815,27.519613,1.000000,__splink__input_table_0,__splink__input_table_1,39816,13109,northwestern public service,northwestern public service,2,0.000073,0.000073,477874.511191,0.014439,33 third st se,33 third st se,2,0.000037,0.000037,9888.266177,0.303079,sd,sd,1,0.001930,0.001930,15.866015,26.495963,huron,huron,2,0.000073,0.000073,103.554689,90.023441,NR0WSTRN PBLK SRFS,NR0WSTRN PBLK SRFS,0,39816,0000073088,0000073088,northwestern public service co,13109,13809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218776,4.337127,0.952856,__splink__input_table_0,__splink__input_table_1,32941,17608,lifestance health group,stirling energy systems solar one,0,0.000012,0.000037,0.986046,1.000000,4800 n scottsdale rd,4800 n scottsdale rd,2,0.000110,0.000110,9888.266177,0.101026,az,az,1,0.012872,0.012872,15.866015,3.973129,scottsdale,scottsdale,2,0.004989,0.004989,103.554689,1.323874,LFSTNS HL0 KRP,STRLNK ENRJ SSTMS SLR ON,1,32941,0001845257,0001845257,"lifestance health group, inc.",17608,56166
145930,4.321967,0.952382,__splink__input_table_0,__splink__input_table_1,28535,9121,imperial holly,imperial holly,2,0.000024,0.000024,477874.511191,0.043316,one imperial sq ste 200,p o box 9,0,0.000024,0.000159,0.881656,1.000000,tx,tx,1,0.079841,0.079841,15.866015,0.640571,sugar land,sugarland,1,0.000355,0.000098,45.415672,1.000000,IMPRL HL,IMPRL HL,0,28535,0000831327,0000831327,imperial holly corp,9121,9223
6194,4.272164,0.950792,__splink__input_table_0,__splink__input_table_1,32403,16195,lease investment flight trust,se solar trust v c,0,0.000012,0.000012,0.986046,1.000000,1100 north market st,1100 north market st,2,0.000061,0.000061,9888.266177,0.181847,de,de,1,0.011717,0.011717,15.866015,4.365022,wilmington,wilmington,2,0.010321,0.010321,103.554689,0.639977,LS INFSTMNT FLT TRST,S SLR TRST F K,1,32403,0001158389,0001158389,lease investment flight trust,16195,56900
1135,4.272164,0.950792,__splink__input_table_0,__splink__input_table_1,22415,7605,fresenius kabi pharmaceuticals holding,genon sabine delaware,0,0.000012,0.000012,0.986046,1.000000,2711 centerville rd,2711 centerville rd,2,0.000061,0.000061,9888.266177,0.181847,de,de,1,0.011717,0.011717,15.866015,4.365022,wilmington,wilmington,2,0.010321,0.010321,103.554689,0.639977,FRSNS KB FRMSTKLS HLTNK,JNN SBN TLWR,1,22415,0001439449,0001439449,"fresenius kabi pharmaceuticals holding, inc.",7605,56922


In [320]:
# Write the one-to-one matches to a parquet file in the current working directory.
one_to_one_preds.to_parquet("one_to_one_preds.parquet")