In [1]:
%load_ext autoreload
%autoreload 3

In [260]:
import numpy as np
import pandas as pd
from upath import UPath

from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, convert_ex21_id_to_filename
from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df, add_sec_company_id_to_subsidiaries

[autoreload of mozilla_sec_eia.models.sec10k.utils.cloud failed: Traceback (most recent call last):
  File "/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 274, in check
    superreload(m, reload, self.old_objects, self.shell)
  File "/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 335, in update_class
    if (old_obj == new_obj) is True:
        ^^^^^^^^^^^^^^^^^^
  File "/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/_collections_abc.py", line 834, in __eq__
    return dict(self.items())

# Read in Inputs

In [3]:
# for now try just training on 2023
raw_sec_df = pd.concat([pd.read_parquet("gs://sec10k-outputs/v2/basic_10k_company_info/2023q1.parquet"),
                        pd.read_parquet("gs://sec10k-outputs/v2/basic_10k_company_info/2023q2.parquet"),
                        pd.read_parquet("gs://sec10k-outputs/v2/basic_10k_company_info/2023q3.parquet"),
                        pd.read_parquet("gs://sec10k-outputs/v2/basic_10k_company_info/2023q4.parquet"),
                       ]
                      )

In [4]:
raw_sec_df = raw_sec_df.reset_index().pivot_table(values="value", index="filename", columns="key", aggfunc="first")
raw_sec_df.columns.name = None

In [5]:
ex21_path = UPath("gs://sec10k-outputs/v2/ex21_company_ownership_info")

In [6]:
raw_ex21_df = pd.DataFrame()
for file in ex21_path.iterdir():
    if file.name.split(".")[-1] == "parquet":
        report_year = file.name[:4]
        # for now just train with 2023
        if report_year != "2023":
            continue
        year_quarter_df = pd.read_parquet(ex21_path / file.name)
        year_quarter_df.loc[:, "report_year"] = report_year
        year_quarter_df.loc[:, "report_year"] = pd.to_datetime(year_quarter_df["report_year"], format="%Y").dt.year
        raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])

# Preprocessing

In [None]:
sec_df = prepare_sec10k_basic_info_df(raw_sec_df)

In [157]:
ex21_df = prepare_ex21_df(raw_ex21_df)

  )


In [69]:
ex21_df.record_id.is_unique

True

In [70]:
sec_df.record_id.is_unique

True

In [None]:
# sec_df has filename as unique ID
sec_df.filename.is_unique

Note: not removing paragraph layout docs, but maybe should

# Try to just match on cleaned name and location

In [170]:
sec_match_df = sec_df.drop_duplicates(subset=["central_index_key", "company_name", "loc_of_incorporation"])

In [179]:
merged_df = sec_match_df.merge(ex21_df, how="inner", on="company_name", suffixes=("_sec", "_ex21"))

In [185]:
merged_df["loc_of_incorporation_sec"]

0       florida
1      delaware
2      missouri
3      delaware
4           NaN
         ...   
515    delaware
516    delaware
517    delaware
518    delaware
519    delaware
Name: loc_of_incorporation_sec, Length: 520, dtype: object

In [209]:
merged_df.loc[:, "loc_tokens_sec"] = merged_df["loc_of_incorporation_sec"].fillna("").str.lower().str.split()
merged_df.loc[:, "loc_tokens_ex21"] = merged_df["loc_of_incorporation_ex21"].fillna("").str.lower().str.split()
merged_df["loc_overlap"] = merged_df.apply(
    lambda row: len(set(row["loc_tokens_sec"]) & set(row["loc_tokens_ex21"])), axis=1
)

# Select the row with the highest word overlap for each CIK and company name
closest_match = merged_df.loc[merged_df.groupby(["central_index_key", "company_name"])['loc_overlap'].idxmax()].reset_index(drop=True)

In [210]:
# this should be 0
closest_match.duplicated(subset=["company_name", "loc_of_incorporation_ex21"]).value_counts()

False    480
Name: count, dtype: int64

In [200]:
# it's okay if there's duplication here, but not ideal
# multiple subsidiaries can point to the same CIK
closest_match.central_index_key.duplicated().value_counts()

central_index_key
False    480
Name: count, dtype: int64

In [201]:
closest_match.head(2)

Unnamed: 0,record_id_sec,filename,phone_number,central_index_key,city,company_name_raw_sec,date_of_name_change,film_number,fiscal_year_end,form_type,former_conformed_name,irs_number,organization_name,sec_act,sec_file_number,standard_industrial_classification,state,state_of_incorporation,street_address,street_address_2,zip_code,report_date,report_year_sec,loc_of_incorporation_sec,company_name,company_name_no_legal_sec,company_name_mphone_sec,record_id_ex21,id,company_name_raw_ex21,loc_of_incorporation_ex21,own_per,report_year_ex21,company_name_no_legal_ex21,company_name_mphone_ex21,loc_tokens_sec,loc_tokens_ex21,loc_overlap
0,7990,edgar/data/910638/0000910638-23-000009.txt,8033263900,910638,rock hill,3d systems corp,19930816,23738595,1231,10-k,3 d systems corp,954431352,,1934 act,001-34220,services-prepackaged software [7372],sc,de,333 three d systems circle,,29730,2023-03-16,2023,delaware,3d systems corporation,3d systems,T SSTMS,150739,910638-0000910638-23-000009,3d systems corporation,delaware,,2023,3d systems,T SSTMS,[delaware],[delaware],1
1,7526,edgar/data/824142/0000824142-23-000019.txt,9185832266,824142,tulsa,"aaon, inc.",19920703,23675207,1231,10-k,aaon inc,870448736,,1934 act,000-18953,air cond & warm air heating equip & comm & ind...,ok,nv,2425 south yukon ave.,,74107,2023-02-27,2023,nevada,aaon incorporated,aaon,N,142821,824142-0000824142-23-000019,"aaon, inc",oklahoma,,2023,aaon,N,[nevada],[oklahoma],0


In [241]:
ex21_with_cik = ex21_df.merge(
    closest_match[["company_name", "central_index_key", "loc_of_incorporation_ex21"]].rename(columns={"loc_of_incorporation_ex21": "loc_of_incorporation"}),
    how="left",
    on=["company_name", "loc_of_incorporation"],
).rename(columns={"central_index_key": "subsidiary_cik"})

In [242]:
ex21_with_cik = ex21_with_cik.merge(closest_match[["company_name", "central_index_key"]],
                                    how="left",
                                    on="company_name"
                                   ).rename(columns={"central_index_key": "company_name_merge_cik"})

In [243]:
ex21_with_cik.subsidiary_cik.isnull().value_counts()

subsidiary_cik
True     191387
False       480
Name: count, dtype: int64

In [244]:
# if a subsidiary doesn't have a CIK and has a null location
# but its name was assigned a CIK (with a different location)
# then assign that CIK to the subsidiary
ex21_with_cik["subsidiary_cik"] = ex21_with_cik["subsidiary_cik"].where(
    ~(ex21_with_cik.subsidiary_cik.isnull()) | ~(ex21_with_cik.loc_of_incorporation.isnull()), 
    ex21_with_cik["company_name_merge_cik"]
)

In [245]:
ex21_with_cik.subsidiary_cik.isnull().value_counts()

subsidiary_cik
True     191386
False       481
Name: count, dtype: int64

In [252]:
archive = GCSArchive()
md = archive.get_metadata()

In [261]:
ex21_with_cik.loc[:, "filename"] = convert_ex21_id_to_filename(ex21_with_cik)

In [263]:
ex21_with_cik = ex21_with_cik.merge(md["cik"], how="left", left_on="filename", right_index=True).rename(columns={"cik": "parent_cik"})

In [264]:
ex21_with_cik = add_sec_company_id_to_subsidiaries(ex21_with_cik)

In [265]:
ex21_with_cik

Unnamed: 0,record_id,id,company_name_raw,loc_of_incorporation,own_per,report_year,company_name,company_name_no_legal,company_name_mphone,subsidiary_cik,company_name_merge_cik,filename,parent_cik,sec_company_id
164482,164482,1000045-0000950170-23-030037,"nicholas data services, inc",florida,100.0,2023,nicholas data services incorporated,nicholas data services,NXLS TT SRFSS,,,edgar/data/1000045/0000950170-23-030037.txt,1000045,1000045_1
164481,164481,1000045-0000950170-23-030037,"nicholas financial, inc",florida,100.0,2023,nicholas financial incorporated,nicholas financial,NXLS FNNXL,0001000045,0001000045,edgar/data/1000045/0000950170-23-030037.txt,1000045,0001000045
89,89,1000209-0000950170-23-007273,medallion bank,utah,,2023,medallion bank,medallion bank,MTLN BNK,,,edgar/data/1000209/0000950170-23-007273.txt,1000209,1000209_1
88,88,1000209-0000950170-23-007273,freshstart venture capital corp,new york,,2023,freshstart venture capital corporation,freshstart venture capital,FRXSTRT FNTR KPTL,,,edgar/data/1000209/0000950170-23-007273.txt,1000209,1000209_2
87,87,1000209-0000950170-23-007273,"medallion capital, inc",minnesota,,2023,medallion capital incorporated,medallion capital,MTLN KPTL,,,edgar/data/1000209/0000950170-23-007273.txt,1000209,1000209_3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161957,161957,9984-0000009984-23-000060,barnes molding solutions korea limited,korea,,2023,barnes molding solutions korea limited,barnes molding solutions korea,BRNS MLTNK SLXNS KR,,,edgar/data/9984/0000009984-23-000060.txt,9984,9984_99
161956,161956,9984-0000009984-23-000060,"barnes molding solutions (jiangsu) co., ltd",china,,2023,barnes molding solutions company limited,barnes molding solutions,BRNS MLTNK SLXNS,,,edgar/data/9984/0000009984-23-000060.txt,9984,9984_100
161955,161955,9984-0000009984-23-000060,barnes korea ltd,korea,,2023,barnes korea limited,barnes korea,BRNS KR,,,edgar/data/9984/0000009984-23-000060.txt,9984,9984_101
161965,161965,9984-0000009984-23-000060,gimatic automation india pvt ltd,india,,2023,gimatic automation india pvt limited,gimatic automation india pvt,JMTK ATMXN INT PFT,,,edgar/data/9984/0000009984-23-000060.txt,9984,9984_102


In [266]:
ex21_with_cik.to_parquet("ex21_2023.parquet")

# Match Ex. 21 Subsidiaries to a SEC filer

## Preprocessing

In [85]:
sec_df[(sec_df["state"] != sec_df["state_of_incorporation"]) & (~sec_df["state_of_incorporation"].isnull())][["state", "state_of_incorporation"]]

Unnamed: 0,state,state_of_incorporation
1,ny,de
2,ny,de
5,ca,md
6,ga,de
7,nj,de
...,...,...
8265,ny,de
8266,tx,de
8267,ny,oh
8268,tx,de


In [24]:
sec_match_df = sec_df.copy()

In [25]:
sec_match_df["loc_of_incorporation"].isnull().value_counts()

loc_of_incorporation
False    6382
True      749
Name: count, dtype: int64

In [26]:
ex21_match_df = ex21_df.copy()

Remove clearly "invalid" strings and fill nulls

In [71]:
ex21_match_df.company_name.value_counts().head(20)

company_name
rush truck center                                          120
encompass health rehabilitation hospital                    79
rush peterbilt truck center                                 57
branch                                                      52
sci funeral services llc iowa limited liability company     33
partnership limited partnership                             32
alderwoods group llc de limited liability company           27
encompass health rehabilitation hospital of                 26
u haul co of                                                26
at and t                                                    25
corporation                                                 21
amh portfolio management                                    20
rush bus center                                             20
limited partnership limited partnership                     18
therapy limited partnership                                 15
rush isuzu trucks                         

In [72]:
ex21_match_df = ex21_match_df[["record_id", "report_year", "company_name", "loc_of_incorporation", "company_name_mphone"]]
sec_match_df = sec_match_df[["record_id", "report_year", "company_name", "loc_of_incorporation", "company_name_mphone"]]

In [73]:
# TEMP
sec_match_df.loc[:, "company_name_mphone_list"] = sec_match_df["company_name_mphone"].str.split(" ")
ex21_match_df.loc[:, "company_name_mphone_list"] = ex21_match_df["company_name_mphone"].str.split(" ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sec_match_df.loc[:, "company_name_mphone_list"] = sec_match_df["company_name_mphone"].str.split(" ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ex21_match_df.loc[:, "company_name_mphone_list"] = ex21_match_df["company_name_mphone"].str.split(" ")


In [74]:
sec_match_df.loc[:, "loc_list"] = sec_match_df["loc_of_incorporation"].str.replace(",", '').str.split(" ")
ex21_match_df.loc[:, "loc_list"] = ex21_match_df["loc_of_incorporation"].str.replace(",", '').str.split(" ")

## Exploratory Analysis

In [75]:
from splink.exploratory import completeness_chart, profile_columns
from splink import DuckDBAPI

db_api = DuckDBAPI()

In [76]:
match_cols = ["report_year", "company_name", "loc_of_incorporation"]

In [77]:
# sometimes this will show up as 100% complete in loc_of_incorporation, not sure why
completeness_chart([ex21_match_df[match_cols], sec_match_df[match_cols]], db_api=db_api)

There is strong skew in the location of incorporation field with around 40-50% of the values being Delaware in both datasets. We therefore want to use `term_frequency_adjustments` in our linkage model.

In [78]:
profile_columns(ex21_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)

In [79]:
profile_columns(sec_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)

## Blocking

Can maybe do the subsidiary to filers match without blocking but probably want a blocking rule. 

TODO: can we block on nearest 5 report years instead of exact match report year?

In [36]:
from splink import block_on
from splink.blocking_analysis import count_comparisons_from_blocking_rule, n_largest_blocks

In [80]:
br = "substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)"

In [81]:
# br0 = block_on("report_year", "report_year")
# br1 = "jaccard(l.company_name, r.company_name) < .1"
# br2 = block_on("company_name", "company_name")

counts = count_comparisons_from_blocking_rule(
    table_or_tables=[sec_match_df, ex21_match_df],
    blocking_rule=br,
    link_type="link_only",
    unique_id_column_name='record_id',
    db_api=db_api,
)

counts

{'number_of_comparisons_generated_pre_filter_conditions': 531298,
 'number_of_comparisons_to_be_scored_post_filter_conditions': 531298,
 'filter_conditions_identified': '',
 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',
 'link_type_join_condition': 'where l."source_dataset" || \'-__-\' || l."record_id" < r."source_dataset" || \'-__-\' || r."record_id" and l."source_dataset" != r."source_dataset"'}

In [82]:
result = n_largest_blocks(
    table_or_tables=[sec_match_df, ex21_match_df],
    blocking_rule=br,
    link_type="link_only",
    db_api=db_api,
    n_largest=3
)

result.as_pandas_dataframe()

Unnamed: 0,key_0,count_l,count_r,block_count
0,AMRK,56,625,35000
1,FRST,56,555,31080
2,INTR,30,659,19770


In [84]:
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)

blocking_rules_for_analysis = [
    br
]


cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
    table_or_tables=[sec_match_df, ex21_match_df],
    blocking_rules=blocking_rules_for_analysis,
    db_api=db_api,
    unique_id_column_name='record_id',
    link_type="link_only",
)

## Create Model

Maybe want to deduplicate the Ex. 21 data first, then conduct a link to SEC filers?

In [44]:
import splink.comparison_library as cl
from splink import Linker, SettingsCreator

In [120]:
company_name_comparison = cl.ExactMatch(
    "company_name",
)
print(company_name_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'ExactMatch' of "company_name".
Similarity is assessed using the following ComparisonLevels:
    - 'company_name is NULL' with SQL rule: "company_name_l" IS NULL OR "company_name_r" IS NULL
    - 'Exact match on company_name' with SQL rule: "company_name_l" = "company_name_r"
    - 'All other comparisons' with SQL rule: ELSE



In [85]:
company_name_comparison = cl.LevenshteinAtThresholds(
    "company_name",
    distance_threshold_or_thresholds=[1]
)
print(company_name_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'LevenshteinAtThresholds' of "company_name".
Similarity is assessed using the following ComparisonLevels:
    - 'company_name is NULL' with SQL rule: "company_name_l" IS NULL OR "company_name_r" IS NULL
    - 'Exact match on company_name' with SQL rule: "company_name_l" = "company_name_r"
    - 'Levenshtein distance of company_name <= 1' with SQL rule: levenshtein("company_name_l", "company_name_r") <= 1
    - 'All other comparisons' with SQL rule: ELSE



In [81]:
company_name_comparison = cl.ArrayIntersectAtSizes(
    "company_name_mphone_list",
    size_threshold_or_thresholds=[3,2,1]
)
print(company_name_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'ArrayIntersectAtSizes' of "company_name_mphone_list".
Similarity is assessed using the following ComparisonLevels:
    - 'company_name_mphone_list is NULL' with SQL rule: "company_name_mphone_list_l" IS NULL OR "company_name_mphone_list_r" IS NULL
    - 'Array intersection size >= 3' with SQL rule: array_length(list_intersect("company_name_mphone_list_l", "company_name_mphone_list_r")) >= 3
    - 'Array intersection size >= 2' with SQL rule: array_length(list_intersect("company_name_mphone_list_l", "company_name_mphone_list_r")) >= 2
    - 'Array intersection size >= 1' with SQL rule: array_length(list_intersect("company_name_mphone_list_l", "company_name_mphone_list_r")) >= 1
    - 'All other comparisons' with SQL rule: ELSE



In [122]:
# try with Levenshtein too
location_comparison = cl.JaroWinklerAtThresholds(
    "loc_of_incorporation",
    score_threshold_or_thresholds=[0.9]
)
print(location_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'JaroWinklerAtThresholds' of "loc_of_incorporation".
Similarity is assessed using the following ComparisonLevels:
    - 'loc_of_incorporation is NULL' with SQL rule: "loc_of_incorporation_l" IS NULL OR "loc_of_incorporation_r" IS NULL
    - 'Exact match on loc_of_incorporation' with SQL rule: "loc_of_incorporation_l" = "loc_of_incorporation_r"
    - 'Jaro-Winkler distance of loc_of_incorporation >= 0.9' with SQL rule: jaro_winkler_similarity("loc_of_incorporation_l", "loc_of_incorporation_r") >= 0.9
    - 'All other comparisons' with SQL rule: ELSE



In [121]:
location_comparison = cl.ArrayIntersectAtSizes(
    "loc_list",
    size_threshold_or_thresholds=[2,1]
)
print(location_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'ArrayIntersectAtSizes' of "loc_list".
Similarity is assessed using the following ComparisonLevels:
    - 'loc_list is NULL' with SQL rule: "loc_list_l" IS NULL OR "loc_list_r" IS NULL
    - 'Array intersection size >= 2' with SQL rule: array_length(list_intersect("loc_list_l", "loc_list_r")) >= 2
    - 'Array intersection size >= 1' with SQL rule: array_length(list_intersect("loc_list_l", "loc_list_r")) >= 1
    - 'All other comparisons' with SQL rule: ELSE



In [123]:
settings = SettingsCreator(
    link_type="link_only",
    unique_id_column_name="record_id",
    comparisons=[
        company_name_comparison,
        location_comparison.configure(term_frequency_adjustments=True)
    ],
    blocking_rules_to_generate_predictions=[
        br
    ],
    retain_intermediate_calculation_columns=True,
)

linker = Linker([sec_match_df, ex21_match_df], settings, db_api=DuckDBAPI())

Estimate probability two random records match

In [124]:
deterministic_rules = [
    block_on("company_name_mphone", "company_name_mphone"),
    "jaccard(r.company_name, l.company_name) >= .95 and l.loc_of_incorporation = r.loc_of_incorporation",
    "substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and jaccard(r.company_name, l.company_name) >= .95",
    # "substr(l.company_name_mphone,1,5) = substr(r.company_name_mphone,1,5) and l.loc_of_incorporation = r.loc_of_incorporation"
]

linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.95)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Probability two random records match is estimated to be  8.21e-05.
This means that amongst all possible pairwise record comparisons, one in 12,184.39 are expected to match.  With 1,368,717,009 total possible comparisons, we expect a total of around 112,333.68 matching pairs


In [125]:
linker.training.estimate_u_using_random_sampling(max_pairs=1e7)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - company_name (no m values are trained).
    - loc_of_incorporation (no m values are trained).


In [126]:
training_blocking_rule = block_on("company_name_mphone", "company_name_mphone")
training_session_fname_sname = (
    linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."company_name_mphone" = r."company_name_mphone") AND (l."company_name_mphone" = r."company_name_mphone")

Parameter estimates will be made for the following comparison(s):
    - company_name
    - loc_of_incorporation

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Iteration 1: Largest change in params was -0.38 in the m_probability of loc_of_incorporation, level `Exact match on loc_of_incorporation`
Iteration 2: Largest change in params was 0.027 in the m_probability of loc_of_incorporation, level `All other comparisons`
Iteration 3: Largest change in params was -0.000274 in the m_probability of company_name, level `Exact match on company_name`
Iteration 4: Largest change in params was -0.00056 in the m_probability of company_name, level `Exact match on company_name`
Iteration 5: Largest change in params was 0.00112 

In [127]:
linker.visualisations.match_weights_chart()

In [128]:
linker.visualisations.m_u_parameters_chart()

In [107]:
settings = linker.misc.save_model_to_json(
    "../sec_ex21_model_settings/2023_model.json", overwrite=True
)

Log model in MLFlow.

## Make predictions

In [129]:
df_predictions = linker.inference.predict(threshold_match_probability=0.5)

Blocking time: 0.20 seconds
Predict time: 0.12 seconds


In [130]:
preds_df = df_predictions.as_pandas_dataframe()

In [131]:
preds_df = preds_df.merge(sec_df[["record_id", "company_name_raw"]], how="left", left_on="record_id_l", right_on="record_id").rename(columns={"company_name_raw": "company_name_sec"})

In [132]:
preds_df = preds_df.merge(ex21_df[["record_id", "company_name_raw"]], how="left", left_on="record_id_r", right_on="record_id").rename(columns={"company_name_raw": "company_name_ex21"})

In [133]:
preds_df.sort_values(by="match_probability").iloc[0:50]

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,company_name_l,company_name_r,gamma_company_name,bf_company_name,loc_of_incorporation_l,loc_of_incorporation_r,gamma_loc_of_incorporation,tf_loc_of_incorporation_l,tf_loc_of_incorporation_r,bf_loc_of_incorporation,bf_tf_adj_loc_of_incorporation,company_name_mphone_l,company_name_mphone_r,record_id_x,company_name_sec,record_id_y,company_name_ex21
0,6.816691,0.991207,__splink__input_table_0,__splink__input_table_1,8180,159390,national instruments corporation,national instruments corporation,1,2492261.0,delaware,republic of korea,0,0.372842,0.000234,0.551065,1.0,NXNL INSTRMNTS,NXNL INSTRMNTS,8180,national instruments corp,159390,national instruments (korea) corporation
176,6.816691,0.991207,__splink__input_table_0,__splink__input_table_1,6034,107265,afternext healthtech acquisition corporation,afternext healthtech acquisition corporation,1,2492261.0,e9,cayman islands,0,0.001069,0.015387,0.551065,1.0,AFTRNKST HL0TX AKKSXN,AFTRNKST HL0TX AKKSXN,6034,afternext healthtech acquisition corp.,107265,afternext healthtech acquisition corp
178,6.816691,0.991207,__splink__input_table_0,__splink__input_table_1,6799,117610,gap incorporated,gap incorporated,1,2492261.0,delaware,puerto rico,0,0.372842,0.001548,0.551065,1.0,KP,KP,6799,gap inc,117610,"gap (puerto rico), inc"
183,6.816691,0.991207,__splink__input_table_0,__splink__input_table_1,5811,170135,rockley photonics holdings limited,rockley photonics holdings limited,1,2492261.0,e9,cayman islands,0,0.001069,0.015387,0.551065,1.0,RKL FTNKS HLTNKS,RKL FTNKS HLTNKS,5811,rockley photonics holdings ltd,170135,rockley photonics holdings limited
184,6.816691,0.991207,__splink__input_table_0,__splink__input_table_1,6799,117608,gap incorporated,gap incorporated,1,2492261.0,delaware,california,0,0.372842,0.015978,0.551065,1.0,KP,KP,6799,gap inc,117608,gap (itm) inc
186,6.816691,0.991207,__splink__input_table_0,__splink__input_table_1,6799,117605,gap incorporated,gap incorporated,1,2492261.0,delaware,canada,0,0.372842,0.012191,0.551065,1.0,KP,KP,6799,gap inc,117605,gap (canada) inc
412,6.816691,0.991207,__splink__input_table_0,__splink__input_table_1,1524,165843,aircastle limited,aircastle limited,1,2492261.0,d0,ireland,0,0.00015,0.008315,0.551065,1.0,ARKSTL,ARKSTL,1524,aircastle ltd,165843,aircastle (ireland) limited
189,6.816691,0.991207,__splink__input_table_0,__splink__input_table_1,6753,115383,arthur j gallagher and company,arthur j gallagher and company,1,2492261.0,illinois,delaware,0,0.006115,0.372842,0.551065,1.0,AR0R J KLKHR ANT,AR0R J KLKHR ANT,6753,arthur j. gallagher & co.,115383,arthur j. gallagher & co
193,6.816691,0.991207,__splink__input_table_0,__splink__input_table_1,6651,110797,flowserve corporation,flowserve corporation,1,2492261.0,new york,mauritius,0,0.009913,0.001075,0.551065,1.0,FLSRF,FLSRF,6651,flowserve corp,110797,flowserve (mauritius) corporation
406,6.816691,0.991207,__splink__input_table_0,__splink__input_table_1,578,24844,united parcel service incorporated,united parcel service incorporated,1,2492261.0,delaware,ohio,0,0.372842,0.008136,0.551065,1.0,UNTT PRSL SRFS,UNTT PRSL SRFS,578,united parcel service inc,24844,"united parcel service, inc"


In [109]:
preds_df[preds_df.match_probability > .9]

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,company_name_l,company_name_r,gamma_company_name,bf_company_name,loc_of_incorporation_l,loc_of_incorporation_r,gamma_loc_of_incorporation,tf_loc_of_incorporation_l,tf_loc_of_incorporation_r,bf_loc_of_incorporation,bf_tf_adj_loc_of_incorporation,company_name_mphone_l,company_name_mphone_r
0,6.339909,0.987805,__splink__input_table_0,__splink__input_table_1,8180,159390,national instruments corporation,national instruments corporation,2,1774257.0,delaware,republic of korea,0,0.372842,0.000234,0.55623,1.0,NXNL INSTRMNTS,NXNL INSTRMNTS
1,6.339909,0.987805,__splink__input_table_0,__splink__input_table_1,7912,154757,enbridge incorporated,enbridge incorporated,2,1774257.0,a0,alberta,0,3.3e-05,0.00088,0.55623,1.0,ENBRJ,ENBRJ
2,6.339909,0.987805,__splink__input_table_0,__splink__input_table_1,7557,140921,spectrum pharmaceuticals incorporated,spectrum pharmaceuticals incorporated,2,1774257.0,delaware,cayman islands,0,0.372842,0.015387,0.55623,1.0,SPKTRM FRMSTKLS,SPKTRM FRMSTKLS
3,7.717639,0.995272,__splink__input_table_0,__splink__input_table_1,8057,152329,american eagle outfitters incorporated,american eagle outfitters incorporated,2,1774257.0,delaware,delaware,2,0.372842,0.372842,2.487467,0.581079,AMRKN EKL OTFTRS,AMRKN EKL OTFTRS
4,14.126362,0.999944,__splink__input_table_0,__splink__input_table_1,7315,28974,pruco life insurance company,pruco life insurance company,2,1774257.0,arizona,arizona,2,0.004388,0.004388,2.487467,49.36883,PRK LF INSRNS,PRK LF INSRNS
5,7.186156,0.99318,__splink__input_table_0,__splink__input_table_1,7419,142779,national presto industries incorporated,national presto industries incorporated,2,1774257.0,wisconsin,,-1,0.00411,,1.0,1.0,NXNL PRST INTSTRS,NXNL PRST INTSTRS
6,6.339909,0.987805,__splink__input_table_0,__splink__input_table_1,7387,142016,national bankshares incorporated,national bankshares incorporated,2,1774257.0,virginia,commonwealth virginia,0,0.006276,2.2e-05,0.55623,1.0,NXNL BNKXRS,NXNL BNKXRS
7,13.610142,0.99992,__splink__input_table_0,__splink__input_table_1,7387,127697,national bankshares incorporated,national bankshares incorporated,2,1774257.0,virginia,virginia,2,0.006276,0.006276,2.487467,34.518756,NXNL BNKXRS,NXNL BNKXRS
8,7.717639,0.995272,__splink__input_table_0,__splink__input_table_1,8258,162906,thermo fisher scientific incorporated,thermo fisher scientific incorporated,2,1774257.0,delaware,delaware,2,0.372842,0.372842,2.487467,0.581079,0RM FXR SSNTFK,0RM FXR SSNTFK
9,12.101855,0.999773,__splink__input_table_0,__splink__input_table_1,7428,60197,general motors financial company incorporated,general motors financial company incorporated,2,1774257.0,texas,texas,2,0.017854,0.017854,2.487467,12.134323,JNRL MTRS FNNXL,JNRL MTRS FNNXL


In [79]:
preds_df[preds_df.match_probability >= .9][["match_probability", "company_name_l", "company_name_r", "loc_list_l", "loc_list_r", "company_name_mphone_l", "company_name_mphone_r"]].iloc[150:200]

Unnamed: 0,match_probability,company_name_l,company_name_r,loc_list_l,loc_list_r,company_name_mphone_l,company_name_mphone_r
465,0.914612,conns incorporated,invenco incorporated,[delaware],[delaware],KNS,INFNK
466,0.914612,vishay intertechnology incorporated,"vishay precision foil, incorporated",[delaware],[delaware],FX INTRTXNLJ,FX PRSXN FL
467,0.980607,"vishay precision group, incorporated","vishay precision foil, incorporated",[delaware],[delaware],FX PRSXN KRP,FX PRSXN FL
470,0.975104,jones lang lasalle incorporated,jones lang lasalle limited,[maryland],"[hong, kong]",JNS LNK LSL,JNS LNK LSL
471,0.951657,"nrg energy, incorporated","nrg energy, incorporated",[delaware],[delaware],NRK ENRJ,NRK ENRJ
472,0.914612,firstenergy corporation,firstenergy ventures corporation,[ohio],[ohio],FRSTNRJ,FRSTNRJ FNTRS
478,0.914612,"hudson pacific properties, incorporated","hudson pacific services, incorporated",[maryland],[maryland],HTSN PSFK PRPRTS,HTSN PSFK SRFSS
479,0.980607,"hudson pacific properties, incorporated","hudson pacific properties, limited partnership",[maryland],[maryland],HTSN PSFK PRPRTS,HTSN PSFK PRPRTS
481,0.914612,"digital ally, incorporated","digital ally international, incorporated",[nevada],[nevada],TJTL AL,TJTL AL INTRNXNL
489,0.976947,cco holdings limited liability company,"rhfw holdings, limited liability company",,[delaware],KK HLTNKS,RHF HLTNKS
