In [1]:
%load_ext autoreload
%autoreload 3

In [262]:
import json
import os
from pathlib import Path

import pandas as pd
from splink import block_on, DuckDBAPI, Linker, SettingsCreator
from splink.blocking_analysis import count_comparisons_from_blocking_rule, cumulative_comparisons_to_be_scored_from_blocking_rules_chart, n_largest_blocks
import splink.comparison_library as cl
from splink.exploratory import completeness_chart, profile_columns
from upath import UPath

from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df

# Inputs

Questions:
* What's the best way to dagsterize this to get EIA data from PUDL?

### EIA

In [12]:
# Yearly EIA utilities output table, read directly from the PUDL "stable" S3 release.
raw_eia_df = pd.read_parquet("s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet")

In [13]:
# EIA-861 yearly mergers table; supplies the merge_address / merge_city / merge_state
# columns that are joined onto the EIA-861 utilities further down.
mergers_df = pd.read_parquet("s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet")

In [14]:
# EIA-861 `assn_utility` table; merged below on (report_date, utility_id_eia).
raw_eia861_df = pd.read_parquet("s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet")

In [104]:
# Work on a copy so the raw download in raw_eia_df stays untouched.
eia_df = raw_eia_df.copy()

In [16]:
# Harvest (report_date, utility_id_eia, utility_name_eia) from every EIA-861 table
# that carries a utility name. Only the three needed columns are requested from
# each parquet file instead of loading the whole table and slicing afterwards.
EIA861_NAME_TABLES = [
    "core_eia861__yearly_utility_data_misc",
    "core_eia861__yearly_operational_data_misc",
    "core_eia861__yearly_demand_side_management_misc",
    "core_eia861__yearly_energy_efficiency",
]
EIA861_NAME_COLS = ["report_date", "utility_id_eia", "utility_name_eia"]

harvested_df = pd.concat(
    [
        pd.read_parquet(
            f"s3://pudl.catalyst.coop/stable/{table_name}.parquet",
            columns=EIA861_NAME_COLS,
        )
        for table_name in EIA861_NAME_TABLES
    ]
)

In [17]:
# Attach the harvested utility names to the EIA-861 association records, then keep
# only the first row seen for each (report_date, utility_id_eia) pair.
eia861_key_cols = ["report_date", "utility_id_eia"]
eia861_df = (
    raw_eia861_df
    .merge(harvested_df, on=eia861_key_cols, how="left")
    .drop_duplicates(subset=eia861_key_cols)
)

In [18]:
# Only merger rows that name a new parent are usable for the join below.
mergers_df = mergers_df.loc[mergers_df["new_parent"].notna()]

# Left-join merger address details onto utilities whose name equals the merger's
# new parent in the same report year, rename the address columns, and collapse to
# one row per (report_date, utility_id_eia) using the first non-null value of
# each column.
merger_cols = ["report_date", "new_parent", "merge_address", "merge_city", "merge_state"]
eia861_df = (
    eia861_df.merge(
        mergers_df[merger_cols],
        how="left",
        left_on=["report_date", "utility_name_eia"],
        right_on=["report_date", "new_parent"],
    )
    .rename(columns={"merge_address": "street_address", "merge_city": "city"})
    .groupby(["report_date", "utility_id_eia"])
    .first()
    .reset_index()
)

In [20]:
# Wherever the mergers table supplied a state, let it override the existing one;
# rows without a merger state keep their original value.
eia861_df["state"] = eia861_df["state"].mask(
    eia861_df["merge_state"].notna(), eia861_df["merge_state"]
)
# The join helper columns are no longer needed.
eia861_df = eia861_df.drop(columns=["new_parent", "merge_state"])

In [105]:
# Combine the yearly utilities table with the EIA-861-derived records.
# Built from raw_eia_df (not from eia_df itself) so re-running this cell always
# starts from the same input instead of feeding its own output back in.
eia_df = (
    pd.concat([raw_eia_df, eia861_df])
    # raw_eia_df rows come first, so they win when a utility/date pair is in both
    .drop_duplicates(subset=["utility_id_eia", "report_date"], keep="first")
    # not sure at what point this stops being a datetime
    .assign(report_date=lambda df: df["report_date"].astype("datetime64[ns]"))
    # there are nulls from non harvested 861 utilities
    .dropna(subset=["utility_name_eia"])
)

### SEC 10K Basic Info

In [22]:
# Cloud-storage directory holding the SEC 10-K basic company info parquet outputs.
sec_path = UPath("gs://sec10k-outputs/v2/basic_10k_company_info")

In [28]:
# Read every parquet file under sec_path and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all previously
# loaded rows on every iteration; collecting the frames first avoids that.
# Files are visited in name order so the row order is reproducible between runs.
sec_parquet_files = sorted(
    (f for f in sec_path.iterdir() if f.name.endswith(".parquet")),
    key=lambda f: f.name,
)
sec_frames = [pd.read_parquet(f) for f in sec_parquet_files]
# pd.concat raises on an empty list, so keep the empty-frame fallback.
raw_sec_df = pd.concat(sec_frames) if sec_frames else pd.DataFrame()

In [29]:
# Preview the long-format frame: one `value` per (filename, filer_count, block, block_count, key).
raw_sec_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,value
filename,filer_count,block,block_count,key,Unnamed: 5_level_1
edgar/data/100240/0000950144-94-000787.txt,0,company_data,0,company_conformed_name,turner broadcasting system inc
edgar/data/100240/0000950144-94-000787.txt,0,company_data,0,central_index_key,0000100240
edgar/data/100240/0000950144-94-000787.txt,0,company_data,0,standard_industrial_classification,4833
edgar/data/100240/0000950144-94-000787.txt,0,company_data,0,irs_number,580950695
edgar/data/100240/0000950144-94-000787.txt,0,company_data,0,state_of_incorporation,ga
...,...,...,...,...,...
edgar/data/936528/0000936528-23-000207.txt,0,former_company,0,date_of_name_change,20230928
edgar/data/936528/0000936528-23-000207.txt,0,former_company,1,former_conformed_name,wafd inc
edgar/data/936528/0000936528-23-000207.txt,0,former_company,1,date_of_name_change,20230927
edgar/data/936528/0000936528-23-000207.txt,0,former_company,2,former_conformed_name,washington federal inc


In [87]:
# Reshape from long to wide: one row per filing, one column per key, taking the
# first value when a key repeats within a filing.
# NOTE(review): this rebinds raw_sec_df, so the cell cannot be re-run without
# reloading the long-format data first.
raw_sec_df = (
    raw_sec_df
    .reset_index()
    .pivot_table(index="filename", columns="key", values="value", aggfunc="first")
    .rename_axis(columns=None)  # drop the leftover "key" label on the column axis
)

### Ex. 21

In [58]:
# Cloud-storage directory holding the Ex. 21 company ownership parquet outputs.
ex21_path = UPath("gs://sec10k-outputs/v2/ex21_company_ownership_info")

In [91]:
# Load every Ex. 21 parquet file, tag each with the report year taken from the
# first four characters of its file name, and concatenate once at the end.
#
# The year is parsed straight to int. Writing the string first and then assigning
# the converted values back through `.loc[:, ...]` can leave the column with
# object dtype, depending on the pandas version.
ex21_parquet_files = sorted(
    (f for f in ex21_path.iterdir() if f.name.endswith(".parquet")),
    key=lambda f: f.name,
)
ex21_frames = [
    pd.read_parquet(f).assign(report_year=int(f.name[:4]))
    for f in ex21_parquet_files
]
# pd.concat raises on an empty list, so keep the empty-frame fallback.
raw_ex21_df = pd.concat(ex21_frames) if ex21_frames else pd.DataFrame()

# Preprocessing

In [153]:
# cleaning on both sides
# SEC side: run the project's preprocessing helper on the pivoted 10-K basic info.
sec_clean_df = prepare_sec10k_basic_info_df(raw_sec_df)

In [157]:
# Ex. 21 side: run the project's preprocessing helper on the concatenated subsidiary records.
ex21_clean_df = prepare_ex21_df(raw_ex21_df)

  df = df.fillna(np.nan)


In [224]:
# EIA side of the preprocessing.
# NOTE(review): `prepare_eia_df` is not among the names imported in the import cell,
# so this line raises NameError on a fresh kernel. It presumably lives next to the
# other prepare_* helpers in the preprocessing module — confirm and add it to the
# import cell.
eia_clean_df = prepare_eia_df(eia_df)

In [228]:
# Columns selected from both the SEC and EIA cleaned frames to build the linker inputs.
SHARED_COLS = [
    # identifiers and reporting period
    "record_id",
    "report_date",
    "report_year",
    # name and contact details
    "company_name",
    "street_address",
    "street_address_2",
    "city",
    "state",  # could use state of incorporation from SEC
    "zip_code",
    "phone_number",
    # phonetic encoding of the company name (used by the blocking rules)
    "company_name_mphone",
]

In [None]:
# strip legal terms and then make a list column from company name
# use this for blocking and comparison levels
# NOTE(review): this cell has never been executed, and eia_match_df is only created
# in a later cell, so it fails if the notebook is run top to bottom. Only the
# whitespace split is implemented here; stripping legal terms is still to do.
eia_match_df["company_name_mphone_list"] = eia_match_df["company_name_mphone"].str.split()

In [None]:
# TODO: create a list column for the address information as well

In [158]:
# Preview the first three SEC rows, restricted to the shared linkage columns.
sec_clean_df[SHARED_COLS].head(3)

Unnamed: 0,report_date,report_year,company_name,street_address,street_address_2,city,state,zip_code,phone_number
0,2000-03-30,2000,meta group incorporated,208 harbor dr,,stamford,ct,06912-0061,2039736700
1,2001-04-02,2001,meta group incorporated,208 harbor dr,,stamford,ct,06912-0061,2039736700
2,2002-04-01,2002,meta group incorporated,208 harbor dr,,stamford,ct,06912-0061,2039736700


In [159]:
# Preview the first three EIA rows that have a street address.
# The mask is built from eia_clean_df itself rather than from eia_match_df, which
# is only defined in a later cell (and is just a column subset of eia_clean_df).
eia_clean_df.loc[eia_clean_df["street_address"].notna(), SHARED_COLS].head(3)

Unnamed: 0,report_date,report_year,company_name,street_address,street_address_2,city,state,zip_code,phone_number
33,2023-01-01,2023,desert willow energy storage,100 bayview circle,,newport beach,ca,,
35,2023-01-01,2023,portage solar plant,n8917,,portage,wi,53901.0,
37,2023-01-01,2023,nsf energy one limited liability company,1241 university ave,,rochester,ny,14607.0,


In [160]:
# Preview the first three cleaned Ex. 21 subsidiary records.
ex21_clean_df.head(3)

Unnamed: 0,record_id,id,company_name_raw,loc_of_incorporation,own_per,report_year,company_name,company_name_mphone
0,0,14060-0000916131-94-000015,brenton bank and trust company,iowa,,1994,brenton bank and trust company,BRNTN BNK ANT TRST KMPN
1,1,14060-0000916131-94-000015,adel,iowa,,1994,adel,ATL
2,2,14060-0000916131-94-000015,"brenton savings bank, fsb united states","ames, iowa",,1994,"brenton savings bank, fsb united states",BRNTN SFNKS BNK FSB UNTT STTS


In [229]:
# Show the current shared-column list before it is used to subset both sides.
SHARED_COLS

['record_id',
 'report_date',
 'report_year',
 'company_name',
 'street_address',
 'street_address_2',
 'city',
 'state',
 'zip_code',
 'phone_number',
 'company_name_mphone']

In [231]:
# EIA input for the linker, restricted to the shared columns.
# Taken as an explicit copy because another cell assigns a new column
# (company_name_mphone_list) into eia_match_df; assigning into a plain slice of
# eia_clean_df would trigger pandas' SettingWithCopyWarning.
eia_match_df = eia_clean_df[SHARED_COLS].copy()

In [232]:
# SEC input for the linker, restricted to the shared columns.
sec_match_df = sec_clean_df[SHARED_COLS]

# Link in Ex. 21 records

In [165]:
# run the Ex.21 to SEC model
# Load the saved model settings (JSON) for the Ex. 21 to SEC linker.
filepath = Path("../sec_ex21_model_settings/2023_model.json")
sec_ex21_settings = json.loads(filepath.read_text())

In [192]:
# SEC side of a 2016-2017 test run of the Ex. 21 model.
# Selected from sec_clean_df: sec_match_df is limited to SHARED_COLS, which has no
# "loc_of_incorporation" column, so indexing it for that column raises KeyError
# when the notebook is run top to bottom.
# NOTE(review): assumes sec_clean_df carries "loc_of_incorporation" — confirm.
sec_test_df = sec_clean_df.loc[
    sec_clean_df["report_year"].isin([2016, 2017]),
    ["record_id", "report_year", "company_name", "loc_of_incorporation", "company_name_mphone"],
]

In [193]:
# Ex. 21 side of the 2016-2017 test run.
# `ex21_match_df` is not defined in any visible cell; ex21_clean_df already has
# every column selected here, so it is used directly.
ex21_test_df = ex21_clean_df.loc[
    ex21_clean_df["report_year"].isin([2016, 2017]),
    ["record_id", "report_year", "company_name", "loc_of_incorporation", "company_name_mphone"],
]

In [194]:
# Number of SEC records in the test subset.
len(sec_test_df)

14125

In [195]:
# Number of Ex. 21 records in the test subset.
len(ex21_test_df)

233101

In [196]:
# Spot-check the SEC test subset.
sec_test_df.head(3)

Unnamed: 0,record_id,report_year,company_name,loc_of_incorporation,company_name_mphone
23,23,2016,nicholas financial incorporated,florida,NXLS FNNXL INKRPRTT
24,24,2017,nicholas financial incorporated,florida,NXLS FNNXL INKRPRTT
68,68,2016,sandisk corporation,delaware,SNTSK KRPRXN


In [197]:
# Spot-check the Ex. 21 test subset.
ex21_test_df.head(3)

Unnamed: 0,record_id,report_year,company_name,loc_of_incorporation,company_name_mphone
2832746,0,2016,"capstone turbine singapore pte., limited",singapore,KPSTN TRBN SNKPR PT LMTT
2832747,1,2016,"capstone turbine international, incorporated",delaware,KPSTN TRBN INTRNXNL INKRPRTT
2832748,2,2016,"capstone turbine financial services, limited l...",delaware,KPSTN TRBN FNNXL SRFSS LMTT LBLT KMPN


In [198]:
# can we just load this linker and make predictions? what happens with blocking?
# Build a linker over the two test subsets from the loaded settings, on a fresh DuckDB backend.
sec_ex21_linker = Linker([sec_test_df, ex21_test_df], sec_ex21_settings, db_api=DuckDBAPI())

In [199]:
# Score candidate pairs, requesting only those with match probability of at least 0.6.
sec_ex21_preds = sec_ex21_linker.inference.predict(threshold_match_probability=0.6)

Blocking time: 0.44 seconds


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Predict time: 115.79 seconds


In [200]:
# Pull the splink prediction table into pandas for inspection.
sec_ex21_preds_df = sec_ex21_preds.as_pandas_dataframe()

In [201]:
# TODO: this needs to be improved, maybe just do a fuzzy match on string name?
# Display the predicted SEC / Ex. 21 pairs (pandas truncates the rendering).
sec_ex21_preds_df

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,company_name_l,company_name_r,gamma_company_name,tf_company_name_l,tf_company_name_r,bf_company_name,bf_tf_adj_company_name,loc_of_incorporation_l,loc_of_incorporation_r,gamma_loc_of_incorporation,tf_loc_of_incorporation_l,tf_loc_of_incorporation_r,bf_loc_of_incorporation,bf_tf_adj_loc_of_incorporation,company_name_mphone_l,company_name_mphone_r,report_year_l,report_year_r
0,11.726954,0.999705,__splink__input_table_0,__splink__input_table_1,95551,5939,pendrell corporation,pentzer corporation,3,0.000008,0.000004,35295.437753,1.0,washington,washington,3,0.003427,0.003427,2.321780,60.034545,PNTRL KRPRXN,PNTSR KRPRXN,2017,2017
1,0.981720,0.663845,__splink__input_table_0,__splink__input_table_1,80041,1485,"spok holdings, incorporated","autohaus holdings, incorporated",2,0.000008,0.000004,2126.980572,1.0,delaware,delaware,3,0.354513,0.354513,2.321780,0.580388,SPK HLTNKS INKRPRTT,ATHS HLTNKS INKRPRTT,2017,2017
2,4.604002,0.960504,__splink__input_table_0,__splink__input_table_1,72068,2731,ashford hospitality trust incorporated,"ashford hospitality trust, incorporated",3,0.000008,0.000004,35295.437753,1.0,maryland,,-1,0.010087,,1.000000,1.000000,AXFRT HSPTLT TRST INKRPRTT,AXFRT HSPTLT TRST INKRPRTT,2017,2017
3,3.901062,0.937263,__splink__input_table_0,__splink__input_table_1,58652,1115,"tx holdings, incorporated","tex holdings, incorporated",3,0.000008,0.000004,35295.437753,1.0,georgia,delaware,0,0.005596,0.354513,0.614319,1.000000,TKS HLTNKS INKRPRTT,TKS HLTNKS INKRPRTT,2017,2017
4,4.604002,0.960504,__splink__input_table_0,__splink__input_table_1,82946,1757,"pharma bio serv, incorporated","pharma bio serv us, incorporated",3,0.000008,0.000004,35295.437753,1.0,,delaware,-1,,0.354513,1.000000,1.000000,FRM B SRF INKRPRTT,FRM B SRF US INKRPRTT,2017,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9343,0.981720,0.663845,__splink__input_table_0,__splink__input_table_1,248688,1135,transenterix incorporated,"trane brands, incorporated",2,0.000008,0.000004,2126.980572,1.0,delaware,delaware,3,0.354513,0.354513,2.321780,0.580388,TRNSNTRKS INKRPRTT,TRN BRNTS INKRPRTT,2017,2017
9344,3.901062,0.937263,__splink__input_table_0,__splink__input_table_1,260283,3506,cree incorporated,j.crew incorporated,3,0.000008,0.000004,35295.437753,1.0,north carolina,delaware,0,0.004926,0.354513,0.614319,1.000000,KR INKRPRTT,JKR INKRPRTT,2017,2017
9345,0.981720,0.663845,__splink__input_table_0,__splink__input_table_1,232258,3973,"applied minerals, incorporated","applied materials spv2, incorporated",2,0.000008,0.000008,2126.980572,1.0,delaware,delaware,3,0.354513,0.354513,2.321780,0.580388,APLT MNRLS INKRPRTT,APLT MTRLS SPF INKRPRTT,2017,2016
9346,3.901062,0.937263,__splink__input_table_0,__splink__input_table_1,232258,3970,"applied minerals, incorporated","applied materials japan, incorporated",3,0.000008,0.000008,35295.437753,1.0,delaware,japan,0,0.354513,0.005795,0.614319,1.000000,APLT MNRLS INKRPRTT,APLT MTRLS JPN INKRPRTT,2017,2016


In [34]:
# TODO: add the Ex. 21 subsidiaries that don't get a matching CIK to the SEC side:
#   1. run the model on all the data
#   2. save the mapping of subsidiaries that score above a certain threshold
#      (unclear why the blocking isn't working)
#   3. get the subsidiaries that score below that threshold
#   4. transform them to have columns that match the SEC df
#   5. add them to the SEC side

# Link SEC and EIA

## Exploratory Analysis

In [205]:
# DuckDB backend passed to the completeness charts and blocking analysis below.
db_api = DuckDBAPI()

In [233]:
# Column completeness chart for the SEC side.
completeness_chart(sec_match_df, db_api=db_api)

In [234]:
# Column completeness chart for the EIA side.
completeness_chart(eia_match_df, db_api=db_api)

In [209]:
# could sub in zip code for street address?
# Columns profiled on both sides in the cells below.
match_cols = ["company_name", "state", "city", "street_address", "zip_code"]

In [210]:
# Value-distribution profile of the SEC match columns (10 most and 5 least frequent values).
# NOTE(review): creates a new DuckDBAPI() rather than reusing `db_api` — confirm that is intentional.
profile_columns(sec_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)

In [211]:
# Value-distribution profile of the EIA match columns (10 most and 5 least frequent values).
# NOTE(review): creates a new DuckDBAPI() rather than reusing `db_api` — confirm that is intentional.
profile_columns(eia_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)

## Blocking

In [300]:
# Candidate blocking rules (SQL join conditions between the left and right record).
# Every rule requires the same report year on both sides.

# same first three characters of the phonetic company name
br0 = "l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)"
# identical street address
br1 = "l.report_year = r.report_year and l.street_address = r.street_address"
# br2 = "l.report_year = r.report_year and substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and l.city = r.city"
# identical city and zip code. br3 is passed to n_largest_blocks but had no
# definition in the notebook; this one is reconstructed from that cell's output,
# whose block keys are (year, city, zip code).
# NOTE(review): confirm this matches the rule that was originally run.
br3 = "l.report_year = r.report_year and l.city = r.city and l.zip_code = r.zip_code"
# identical phone number
br4 = "l.report_year = r.report_year and l.phone_number = r.phone_number"

In [257]:
# How many SEC-EIA candidate pairs does br0 generate?
# NOTE(review): if the rendered output below shows a different substring length
# than br0 currently uses, the output is stale — re-run to refresh it.
counts = count_comparisons_from_blocking_rule(
    table_or_tables=[sec_match_df, eia_match_df],
    blocking_rule=br0,
    link_type="link_only",
    unique_id_column_name="record_id",
    db_api=db_api,
)

counts

{'number_of_comparisons_generated_pre_filter_conditions': 618634,
 'number_of_comparisons_to_be_scored_post_filter_conditions': 618634,
 'filter_conditions_identified': '',
 'equi_join_conditions_identified': 'l.report_year = r.report_year AND SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',
 'link_type_join_condition': 'where l."source_dataset" || \'-__-\' || l."record_id" < r."source_dataset" || \'-__-\' || r."record_id" and l."source_dataset" != r."source_dataset"'}

In [259]:
# Find the three largest blocks produced by br3, to spot key values that create
# a disproportionate number of candidate pairs.
result = n_largest_blocks(
    table_or_tables=[sec_match_df, eia_match_df],
    blocking_rule=br3,
    link_type="link_only",
    n_largest=3,
    db_api=db_api,
)

result.as_pandas_dataframe()

Unnamed: 0,key_0,key_1,key_2,count_l,count_r,block_count
0,2023,boston,2110,113,134,15142
1,2022,boston,2110,116,110,12760
2,2021,boston,2110,113,88,9944


In [302]:
# Chart the cumulative number of comparisons scored as each rule is added, in order.
blocking_rules_for_analysis = [br0, br1]

cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
    table_or_tables=[sec_match_df, eia_match_df],
    blocking_rules=blocking_rules_for_analysis,
    link_type="link_only",
    unique_id_column_name="record_id",
    db_api=db_api,
)

## Create Model

In [382]:
# Compare company names by Jaccard similarity at the library's default thresholds.
# cl.NameComparison("company_name", dmeta_col_name="company_name_mphone_list") was
# tried first, but passing dmeta_col_name was breaking it for some reason.
company_name_comparison = cl.JaccardAtThresholds("company_name")

# Print the generated comparison levels as a sanity check.
print(company_name_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'JaccardAtThresholds' of "company_name".
Similarity is assessed using the following ComparisonLevels:
    - 'company_name is NULL' with SQL rule: "company_name_l" IS NULL OR "company_name_r" IS NULL
    - 'Exact match on company_name' with SQL rule: "company_name_l" = "company_name_r"
    - 'Jaccard distance of 'company_name >= 0.9'' with SQL rule: jaccard("company_name_l", "company_name_r") >= 0.9
    - 'Jaccard distance of 'company_name >= 0.7'' with SQL rule: jaccard("company_name_l", "company_name_r") >= 0.7
    - 'All other comparisons' with SQL rule: ELSE



In [373]:
# Compare street addresses by Levenshtein distance at the library's default
# thresholds; an explicit [1, 2, 3] threshold list was considered but left disabled.
address_comparison = cl.LevenshteinAtThresholds("street_address")

# Print the generated comparison levels as a sanity check.
print(address_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'LevenshteinAtThresholds' of "street_address".
Similarity is assessed using the following ComparisonLevels:
    - 'street_address is NULL' with SQL rule: "street_address_l" IS NULL OR "street_address_r" IS NULL
    - 'Exact match on street_address' with SQL rule: "street_address_l" = "street_address_r"
    - 'Levenshtein distance of street_address <= 1' with SQL rule: levenshtein("street_address_l", "street_address_r") <= 1
    - 'Levenshtein distance of street_address <= 2' with SQL rule: levenshtein("street_address_l", "street_address_r") <= 2
    - 'All other comparisons' with SQL rule: ELSE



In [267]:
# Exact match on zip code, with term-frequency adjustments enabled.
zip_code_comparison = cl.ExactMatch("zip_code").configure(term_frequency_adjustments=True)

In [268]:
# Exact match on state, with term-frequency adjustments enabled.
state_comparison = cl.ExactMatch("state").configure(term_frequency_adjustments=True)

In [269]:
# Compare cities with a single fuzzy level (Jaro-Winkler >= 0.9) below exact match.
# Passing dmeta_col_name="company_name_mphone" was breaking it for some reason, so
# no phonetic column is supplied.
city_comparison = cl.NameComparison("city", jaro_winkler_thresholds=[0.9])

# Print the generated comparison levels as a sanity check.
print(city_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'NameComparison' of "city".
Similarity is assessed using the following ComparisonLevels:
    - 'city is NULL' with SQL rule: "city_l" IS NULL OR "city_r" IS NULL
    - 'Exact match on city' with SQL rule: "city_l" = "city_r"
    - 'Jaro-Winkler distance of city >= 0.9' with SQL rule: jaro_winkler_similarity("city_l", "city_r") >= 0.9
    - 'All other comparisons' with SQL rule: ELSE



In [383]:
# Assemble the SEC-EIA model: link-only (no within-dataset deduplication), keyed
# on record_id, scoring the five comparisons defined above for pairs produced by
# the br0 and br1 blocking rules.
settings = SettingsCreator(
    link_type="link_only",
    unique_id_column_name="record_id",
    blocking_rules_to_generate_predictions=[br0, br1],
    comparisons=[
        company_name_comparison,
        address_comparison,
        zip_code_comparison,
        state_comparison,
        city_comparison,
    ],
    # keep the intermediate calculation columns in the prediction output
    retain_intermediate_calculation_columns=True,
)

# SEC is the first input table and EIA the second.
linker = Linker([sec_match_df, eia_match_df], settings, db_api=DuckDBAPI())

In [384]:
# High-confidence rules used to estimate the prior probability that two random
# records match. `recall` is the assumed share of true matches these rules catch.
deterministic_rules = [
    # each column is listed once; repeating it only duplicates the equality condition
    block_on("company_name"),
    block_on("phone_number"),
    block_on("street_address"),
    "jaccard(r.company_name, l.company_name) >= .9 and l.city = r.city",
    "substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city",
]

linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.9)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Probability two random records match is estimated to be  2.18e-05.
This means that amongst all possible pairwise record comparisons, one in 45,828.17 are expected to match.  With 40,620,617,120 total possible comparisons, we expect a total of around 886,367.78 matching pairs


In [385]:
# Estimate the u probabilities by random sampling, capped at 1e7 record pairs.
# NOTE(review): no seed is passed — confirm whether run-to-run reproducibility matters here.
linker.training.estimate_u_using_random_sampling(max_pairs=1e7)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - company_name (no m values are trained).
    - street_address (no m values are trained).
    - zip_code (no m values are trained).
    - state (no m values are trained).
    - city (no m values are trained).


In [386]:
# EM session 1: block on exact company name. m probabilities are estimated for
# every comparison except company_name, which is fixed by the blocking rule.
# The column is passed once; repeating it only duplicated the join condition.
training_blocking_rule = block_on("company_name")
# NOTE(review): the variable name mentions fname/sname although this session
# blocks on company name; kept unchanged in case other cells reference it.
training_session_fname_sname = (
    linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."company_name" = r."company_name") AND (l."company_name" = r."company_name")

Parameter estimates will be made for the following comparison(s):
    - street_address
    - zip_code
    - state
    - city

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - company_name

Iteration 1: Largest change in params was 0.804 in the m_probability of street_address, level `All other comparisons`
Iteration 2: Largest change in params was 0.0737 in the m_probability of state, level `Exact match on state`
Iteration 3: Largest change in params was -0.039 in the m_probability of state, level `All other comparisons`
Iteration 4: Largest change in params was 0.021 in the m_probability of city, level `All other comparisons`
Iteration 5: Largest change in params was 0.00805 in the m_probability of city, level `All other comparisons`
Iter

In [387]:
# EM session 2: block on exact street address, so the company_name m
# probabilities (skipped when blocking on company name) can be estimated.
# The column is passed once; repeating it only duplicated the join condition.
training_blocking_rule = block_on("street_address")
# NOTE(review): this reuses the name of the previous session's result, which is
# overwritten here; kept unchanged in case other cells reference it.
training_session_fname_sname = (
    linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."street_address" = r."street_address") AND (l."street_address" = r."street_address")

Parameter estimates will be made for the following comparison(s):
    - company_name
    - zip_code
    - state
    - city

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - street_address

Iteration 1: Largest change in params was -0.929 in the m_probability of company_name, level `Exact match on company_name`
Iteration 2: Largest change in params was 0.0355 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.00843 in the m_probability of state, level `All other comparisons`
Iteration 4: Largest change in params was -0.00612 in the m_probability of state, level `Exact match on state`
Iteration 5: Largest change in params was -0.00431 in the m_probability of state, level `Exact match on state`
Iterat

In [388]:
# Chart the trained match weights for each comparison level.
linker.visualisations.match_weights_chart()

In [389]:
# company_name doesn't look good here
# Chart the estimated m and u probabilities for each comparison level.
linker.visualisations.m_u_parameters_chart()

In [285]:
# Persist the trained model to model_test.json, replacing any existing file.
# NOTE(review): this rebinds `settings`, the name the model-creation cell uses for
# its SettingsCreator, to the return value of save_model_to_json — consider a
# distinct name so re-running cells out of order cannot mix the two.
settings = linker.misc.save_model_to_json(
    "model_test.json", overwrite=True
)

## Make Predictions

In [390]:
# Score the blocked SEC-EIA pairs, requesting only those with match probability of at least 0.5.
df_predictions = linker.inference.predict(threshold_match_probability=0.5)

Blocking time: 0.28 seconds


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Predict time: 3.06 seconds


In [391]:
# Pull the splink prediction table into pandas for inspection.
preds_df = df_predictions.as_pandas_dataframe()

In [392]:
# Display predictions ordered from least to most confident, so borderline pairs
# appear first and near-certain matches last.
preds_df.sort_values(by="match_probability")

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,company_name_l,company_name_r,gamma_company_name,bf_company_name,street_address_l,street_address_r,gamma_street_address,bf_street_address,zip_code_l,zip_code_r,gamma_zip_code,tf_zip_code_l,tf_zip_code_r,bf_zip_code,bf_tf_adj_zip_code,state_l,state_r,gamma_state,tf_state_l,tf_state_r,bf_state,bf_tf_adj_state,city_l,city_r,gamma_city,tf_city_l,tf_city_r,bf_city,bf_tf_adj_city,company_name_mphone_l,company_name_mphone_r,report_year_l,report_year_r,match_key
11211,0.054332,0.509414,__splink__input_table_0,__splink__input_table_1,85762,68295,citi trends incorporated,georgia pacific corporation,1,1.462842,104 coleman boulevard,,-1,1.000000,31408,31326,0,0.000045,0.000103,0.402918,1.000000,ga,ga,1,0.023374,0.023374,22.598054,1.815434,savannah,savannah,2,0.000454,0.000454,215.559681,9.129471,ST TRNTS INKRPRTT,JRJ PSFK KRPRXN,2021,2008,0
11666,0.098035,0.516982,__splink__input_table_0,__splink__input_table_1,94615,75114,"chicopee bancorp, incorporated",chicopee city of,0,0.845800,70 center street,725 front street,0,0.844089,01013,01021,0,0.000036,0.000061,0.402918,1.000000,ma,ma,1,0.042950,0.042950,22.598054,0.987961,chicopee,chicopee,2,0.000117,0.000117,215.559681,35.431042,XKP BNKRP INKRPRTT,XKP ST OF,2012,2012,0
11665,0.098035,0.516982,__splink__input_table_0,__splink__input_table_1,94614,75115,"chicopee bancorp, incorporated",chicopee city of,0,0.845800,70 center street,725 front street,0,0.844089,01013,01021,0,0.000036,0.000061,0.402918,1.000000,ma,ma,1,0.042950,0.042950,22.598054,0.987961,chicopee,chicopee,2,0.000117,0.000117,215.559681,35.431042,XKP BNKRP INKRPRTT,XKP ST OF,2011,2011,0
11668,0.098035,0.516982,__splink__input_table_0,__splink__input_table_1,94618,75118,"chicopee bancorp, incorporated",chicopee city of,0,0.845800,70 center street,725 front street,0,0.844089,01013,01021,0,0.000036,0.000061,0.402918,1.000000,ma,ma,1,0.042950,0.042950,22.598054,0.987961,chicopee,chicopee,2,0.000117,0.000117,215.559681,35.431042,XKP BNKRP INKRPRTT,XKP ST OF,2008,2008,0
11669,0.098035,0.516982,__splink__input_table_0,__splink__input_table_1,94620,75116,"chicopee bancorp, incorporated",chicopee city of,0,0.845800,70 center street,p o box 405,0,0.844089,01013,01021,0,0.000036,0.000061,0.402918,1.000000,ma,ma,1,0.042950,0.042950,22.598054,0.987961,chicopee,chicopee,2,0.000117,0.000117,215.559681,35.431042,XKP BNKRP INKRPRTT,XKP ST OF,2010,2010,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10043,45.026591,1.000000,__splink__input_table_0,__splink__input_table_1,177698,67483,green mountain power corporation,green mountain power corporation,3,9751.372250,163 acorn lane,163 acorn lane,3,24859.333063,05446,05446,1,0.000143,0.000143,1447.988342,2.894003,vt,vt,1,0.002680,0.002680,22.598054,15.835981,colchester,colchester,2,0.000198,0.000198,215.559681,20.959208,KRN MNTN PWR KRPRXN,KRN MNTN PWR KRPRXN,2001,2001,0
10051,45.026591,1.000000,__splink__input_table_0,__splink__input_table_1,177702,67479,green mountain power corporation,green mountain power corporation,3,9751.372250,163 acorn lane,163 acorn lane,3,24859.333063,05446,05446,1,0.000143,0.000143,1447.988342,2.894003,vt,vt,1,0.002680,0.002680,22.598054,15.835981,colchester,colchester,2,0.000198,0.000198,215.559681,20.959208,KRN MNTN PWR KRPRXN,KRN MNTN PWR KRPRXN,2005,2005,0
10050,45.026591,1.000000,__splink__input_table_0,__splink__input_table_1,177701,67480,green mountain power corporation,green mountain power corporation,3,9751.372250,163 acorn lane,163 acorn lane,3,24859.333063,05446,05446,1,0.000143,0.000143,1447.988342,2.894003,vt,vt,1,0.002680,0.002680,22.598054,15.835981,colchester,colchester,2,0.000198,0.000198,215.559681,20.959208,KRN MNTN PWR KRPRXN,KRN MNTN PWR KRPRXN,2004,2004,0
10049,45.026591,1.000000,__splink__input_table_0,__splink__input_table_1,177699,67482,green mountain power corporation,green mountain power corporation,3,9751.372250,163 acorn lane,163 acorn lane,3,24859.333063,05446,05446,1,0.000143,0.000143,1447.988342,2.894003,vt,vt,1,0.002680,0.002680,22.598054,15.835981,colchester,colchester,2,0.000198,0.000198,215.559681,20.959208,KRN MNTN PWR KRPRXN,KRN MNTN PWR KRPRXN,2002,2002,0
