In [1]:
# Enable IPython's autoreload extension so edits to imported modules (e.g. the
# local project package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 3

In [27]:
import numpy as np
import pandas as pd
from upath import UPath

# from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
# from pudl.analysis.record_linkage import name_cleaner
from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df

# Read in Inputs

In [3]:
# Yearly EIA utilities table read from the `stable` prefix of the PUDL S3 bucket.
# Kept unmodified as `raw_eia_df`; later cells work on a copy.
raw_eia_df = pd.read_parquet("s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet")

In [4]:
# Working copy, so the transformations below never touch the raw frame.
eia_df = raw_eia_df.copy()

In [5]:
# EIA-861 yearly mergers table; later cells use its new_parent and
# merge_address / merge_city / merge_state columns.
mergers_df = pd.read_parquet("s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet")

In [6]:
# EIA-861 utility association table; joined below on (report_date, utility_id_eia).
raw_eia861_df = pd.read_parquet("s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet")

In [7]:
# Utility IDs present in the EIA-861 association table but absent from the
# yearly utilities table.
missing_utils = raw_eia861_df.loc[
    ~raw_eia861_df["utility_id_eia"].isin(raw_eia_df["utility_id_eia"].unique()),
    "utility_id_eia",
].unique()

In [8]:
# Harvest (report_date, utility_id_eia, utility_name_eia) from the EIA-861 tables
# that carry utility names. The tables are stacked in the order listed.
# `columns=` makes read_parquet fetch only the three needed columns instead of
# downloading each full table and slicing it afterwards.
harvested_df = pd.concat(
    [
        pd.read_parquet(
            f"s3://pudl.catalyst.coop/stable/{table_name}.parquet",
            columns=["report_date", "utility_id_eia", "utility_name_eia"],
        )
        for table_name in (
            "core_eia861__yearly_utility_data_misc",
            "core_eia861__yearly_operational_data_misc",
            "core_eia861__yearly_demand_side_management_misc",
            "core_eia861__yearly_energy_efficiency",
        )
    ]
)

In [9]:
# Attach harvested utility names to the association table, then collapse back to
# one row per (report_date, utility_id_eia), keeping the first match.
eia861_df = (
    raw_eia861_df
    .merge(harvested_df, how="left", on=["report_date", "utility_id_eia"])
    .drop_duplicates(subset=["report_date", "utility_id_eia"])
)

In [10]:
# Only merger rows that name a new parent are usable as join targets. pandas
# matches null join keys to each other, so this filter also stops utilities with
# a missing name from picking up merger rows with a missing new_parent.
mergers_df = mergers_df.loc[mergers_df["new_parent"].notna()]

# Pull address details from the mergers table where the utility name equals the
# merger's new parent in the same report year, then reduce to one row per
# (report_date, utility_id_eia). GroupBy.first() takes the first non-null value
# of each column within the group.
eia861_df = (
    eia861_df
    .merge(
        mergers_df[["report_date", "new_parent", "merge_address", "merge_city", "merge_state"]],
        how="left",
        left_on=["report_date", "utility_name_eia"],
        right_on=["report_date", "new_parent"],
    )
    .rename(columns={"merge_address": "street_address", "merge_city": "city"})
    .groupby(["report_date", "utility_id_eia"])
    .first()
    .reset_index()
)

In [11]:
# Rows where the mergers table supplies a state that disagrees with the
# association table's state.
eia861_df[eia861_df["merge_state"].notna() & (eia861_df["state"] != eia861_df["merge_state"])]

Unnamed: 0,report_date,utility_id_eia,state,utility_name_eia,new_parent,street_address,city,merge_state
29933,2009-01-01,17698,LA,Southwestern Electric Power Co,Southwestern Electric Power Co,1 Riverside Plaza,Columbus,OH
33258,2010-01-01,17698,AR,Southwestern Electric Power Co,Southwestern Electric Power Co,1 Riverside Plaza,Columbus,OH
49001,2015-01-01,11788,IA,Consumers Energy,Consumers Energy,One Enrgy Plaza,Jackson,MI
56853,2017-01-01,19157,IA,MiEnergy Cooperative,MiEnergy Cooperative,31110 Cooperative Way,Rushford,MN
70820,2021-01-01,40165,AZ,"Dixie Escalante R E A, Inc","Dixie Escalante R E A, Inc",495 N 3200 W,Flowell,UT


In [12]:
# Prefer the mergers-table state whenever one is available; otherwise keep the
# existing state.
eia861_df["state"] = eia861_df["state"].mask(eia861_df["merge_state"].notna(), eia861_df["merge_state"])

In [13]:
# The merger helper columns are no longer needed once state has been reconciled.
eia861_df = eia861_df.drop(["new_parent", "merge_state"], axis="columns")

In [14]:
# Stack the EIA-861-derived utilities beneath the yearly utilities rows.
eia_df = pd.concat([eia_df, eia861_df])

In [15]:
# One row per (utility_id_eia, report_date). keep="first" retains the earliest
# occurrence, i.e. the row from whichever frame came first in the concat.
eia_df = eia_df.drop_duplicates(subset=["utility_id_eia", "report_date"], keep="first")

In [16]:
# not sure at what point this stops being a datetime
# NOTE(review): presumably the concat of frames whose report_date dtypes differ
# degrades the column; confirm. The cast below forces datetime64[ns].
eia_df["report_date"] = eia_df["report_date"].astype("datetime64[ns]")

In [17]:
# EIA-861 utilities whose name was never harvested have a null name; drop them.
eia_df = eia_df.dropna(subset=["utility_name_eia"])

In [32]:
# For now, train on 2023 only: stack the four quarterly basic-info extracts.
raw_sec_df = pd.concat(
    [
        pd.read_parquet(quarter_path)
        for quarter_path in (
            "gs://sec10k-outputs/v2/basic_10k_company_info/2023q1.parquet",
            "gs://sec10k-outputs/v2/basic_10k_company_info/2023q2.parquet",
            "gs://sec10k-outputs/v2/basic_10k_company_info/2023q3.parquet",
            "gs://sec10k-outputs/v2/basic_10k_company_info/2023q4.parquet",
        )
    ]
)

In [52]:
# Reshape the long key/value records into one row per filing (filename) with one
# column per key, taking the first value when a key repeats within a filing.
# NOTE(review): this overwrites raw_sec_df, so re-running the cell without first
# re-running the load cell will not find the `key` / `value` columns.
raw_sec_df = raw_sec_df.reset_index().pivot_table(
    index="filename",
    columns="key",
    values="value",
    aggfunc="first",
)
# Drop the leftover "key" label from the columns axis.
raw_sec_df.columns.name = None

In [24]:
# Cloud directory that the loading loop below scans for Ex. 21 parquet files.
ex21_path = UPath("gs://sec10k-outputs/v2/ex21_company_ownership_info")

In [35]:
# Load every 2023 Ex. 21 parquet extract and tag it with its report year.
# Frames are collected in a list and concatenated once: growing a DataFrame with
# pd.concat inside the loop re-copies all earlier rows on every iteration and
# starts from an empty frame, which newer pandas warns about.
ex21_frames = []
for file in ex21_path.iterdir():
    if file.suffix != ".parquet":
        continue
    # File names begin with the four-digit report year.
    report_year = file.name[:4]
    # for now just train with 2023
    if report_year != "2023":
        continue
    year_quarter_df = pd.read_parquet(ex21_path / file.name)
    # Assign the year as an integer directly. Writing the string through
    # `.loc[:, ...]` and converting it afterwards can leave an object column.
    year_quarter_df["report_year"] = int(report_year)
    ex21_frames.append(year_quarter_df)

# Fall back to an empty frame (as the original loop did) when nothing matched.
raw_ex21_df = pd.concat(ex21_frames) if ex21_frames else pd.DataFrame()

In [25]:
# Peek at a few utilities that have a street address.
eia_df[eia_df["street_address"].notna()].head(3)

Unnamed: 0,utility_id_eia,utility_id_pudl,utility_name_eia,report_date,street_address,city,state,zip_code,plants_reported_owner,plants_reported_operator,...,contact_lastname,contact_title,phone_number,phone_extension,contact_firstname_2,contact_lastname_2,contact_title_2,phone_number_2,phone_extension_2,data_maturity
33,66292,16386.0,Desert Willow Energy Storage,2023-01-01,100 Bayview Circle,Newport Beach,CA,,,,...,,,,,,,,,,provisional
35,66291,16385.0,Portage Solar Plant,2023-01-01,N8917,Portage,WI,53901.0,,,...,,,,,,,,,,provisional
37,66290,16384.0,NSF Energy One LLC,2023-01-01,1241 University Ave,Rochester,NY,14607.0,,,...,,,,,,,,,,provisional


In [26]:
# Single-year slice of the EIA utilities.
# NOTE(review): the SEC inputs in this notebook are restricted to 2023 while this
# slice is 2020; confirm the year is intentional.
eia_subset = eia_df.loc[eia_df["report_date"] == "2020-01-01"]

In [58]:
# Spot check: North Carolina records whose name contains "Duke Energy Corp".
eia_df[
    eia_df["utility_name_eia"].str.contains("Duke Energy Corp") & (eia_df["state"] == "NC")
].drop_duplicates()

Unnamed: 0,utility_id_eia,utility_id_pudl,utility_name_eia,report_date,street_address,city,state,zip_code,plants_reported_owner,plants_reported_operator,...,contact_lastname,contact_title,phone_number,phone_extension,contact_firstname_2,contact_lastname_2,contact_title_2,phone_number_2,phone_extension_2,data_maturity
71566,5416,90.0,Duke Energy Corp,2010-01-01,P O Box 1006,Charlotte,NC,28202,,,...,,,,,,,,,,final
71568,5416,90.0,Duke Energy Corp,2008-01-01,,Charlotte,NC,28201,,,...,,,,,,,,,,final
71569,5416,90.0,Duke Energy Corp,2007-01-01,,Charlotte,NC,28201,,,...,Ashcraft,Sr. Engineering Technologist,,,Robert,Mc Murry,Dir Carolinas Integrated Resou,,,final
71570,5416,90.0,Duke Energy Corp,2006-01-01,,Charlotte,NC,28201,,,...,Duckworth,Planning Engineer,704-382-4327,382.0,Steven,Jester,"Director, Rate Admn & Cust Inq",704-382-4887,,final
71571,5416,90.0,Duke Energy Corp,2005-01-01,,Charlotte,NC,28201,,,...,Duckworth,Planning Engineer,704-382-4327,382.0,Steven,Jester,"Director, Rate Admn & Cust Inq",704-382-4887,,final
71572,5416,90.0,Duke Energy Corp,2004-01-01,,Charlotte,NC,28201,,,...,Duckworth,Planning Engineer,704-382-4327,0.0,Steven,Jester,"Director, Rate Admn & Cust Inq",704-382-4887,,final
71573,5416,90.0,Duke Energy Corp,2003-01-01,,Charlotte,NC,28201,,,...,Duckworth,Process Leader,,0.0,Steven,Jester,,,,final
71574,5416,90.0,Duke Energy Corp,2002-01-01,,Charlotte,NC,28201,,,...,Scott Henry,Process Leader,,0.0,,,Mgr Reg Policy $ Res,,,final
71575,5416,90.0,Duke Energy Corp,2001-01-01,,Charlotte,NC,28201,,,...,R S Henry,,,0.0,,,Mgr Operating Plann & Analysis,,,final


# Preprocessing

In [82]:
# Run the project's SEC 10-K basic-info preprocessing on the pivoted filings.
# NOTE(review): later cells read record_id, report_year, company_name,
# loc_of_incorporation and company_name_mphone from the result; presumably they
# are produced here. Confirm in the preprocessing module.
sec_df = prepare_sec10k_basic_info_df(raw_sec_df)

In [83]:
# Run the project's Ex. 21 preprocessing on the stacked 2023 extracts.
# NOTE(review): later cells expect the same linkage columns as on sec_df
# (record_id, report_year, company_name, loc_of_incorporation,
# company_name_mphone). Confirm in the preprocessing module.
ex21_df = prepare_ex21_df(raw_ex21_df)

In [84]:
# Sanity check: filename should identify each SEC record uniquely.
sec_df["filename"].is_unique

True

Note: documents with a paragraph layout are not removed here, though perhaps they should be.

# Match Ex. 21 Subsidiaries to a SEC filer

## Preprocessing

In [85]:
# Filers whose state differs from a non-null state of incorporation.
sec_df.loc[
    sec_df["state_of_incorporation"].notna() & (sec_df["state"] != sec_df["state_of_incorporation"]),
    ["state", "state_of_incorporation"],
]

Unnamed: 0,state,state_of_incorporation
1,ny,de
2,ny,de
5,ca,md
6,ga,de
7,nj,de
...,...,...
8265,ny,de
8266,tx,de
8267,ny,oh
8268,tx,de


In [86]:
# Separate copy for the linkage step so sec_df itself stays unchanged.
sec_match_df = sec_df.copy()

In [87]:
# How many SEC filers lack a location of incorporation (True = missing).
sec_match_df["loc_of_incorporation"].isna().value_counts()

loc_of_incorporation
False    6359
True      748
Name: count, dtype: int64

In [88]:
# Separate copy for the linkage step so ex21_df itself stays unchanged.
ex21_match_df = ex21_df.copy()

Remove clearly "invalid" strings and fill nulls

In [89]:
# The most frequent Ex. 21 company names; useful for spotting junk strings.
ex21_match_df["company_name"].value_counts().head(20)

company_name
rush truck center                                           120
encompass health rehabilitation hospital                     79
rush peterbilt truck center                                  57
branch                                                       52
sci funeral services, llc iowa limited liability company     33
partnership limited partnership                              32
alderwoods group, llc de limited liability company           27
encompass health rehabilitation hospital of                  26
u haul co. of                                                26
at&t                                                         25
corporation                                                  21
amh portfolio management                                     20
rush bus center                                              20
limited partnership limited partnership                      18
rapy limited partnership                                     15
rush isuzu trucks          

In [90]:
# Keep only the columns the linkage model uses, in the same order on both sides.
ex21_match_df = ex21_match_df.loc[
    :, ["record_id", "report_year", "company_name", "loc_of_incorporation", "company_name_mphone"]
]
sec_match_df = sec_match_df.loc[
    :, ["record_id", "report_year", "company_name", "loc_of_incorporation", "company_name_mphone"]
]

## Exploratory Analysis

In [91]:
from splink.exploratory import completeness_chart, profile_columns
from splink import DuckDBAPI

# Shared DuckDB backend used by the completeness and blocking-analysis cells below.
db_api = DuckDBAPI()

In [102]:
# Columns inspected by the completeness and profiling charts below.
match_cols = [
    "report_year",
    "company_name",
    "loc_of_incorporation",
]

In [93]:
# Completeness of the linkage columns in the Ex. 21 data.
# sometimes this will show up as 100% non null in loc_of_incorporation, not sure why
# NOTE(review): one unconfirmed suspicion is state left on the shared `db_api` by
# earlier calls. A throwaway backend rules that out and matches how the
# profile_columns cells and the Linker are constructed.
completeness_chart(ex21_match_df[match_cols], db_api=DuckDBAPI())

In [94]:
# Completeness of the linkage columns in the SEC filer data. A fresh backend is
# used so the chart cannot be affected by tables registered on the shared
# `db_api`, consistent with the profile_columns cells.
completeness_chart(sec_match_df[match_cols], db_api=DuckDBAPI())

There is strong skew in the location of incorporation field with around 40-50% of the values being Delaware in both datasets. We therefore want to use `term_frequency_adjustments` in our linkage model.

In [95]:
# Value distribution of each linkage column in the Ex. 21 data.
profile_columns(
    ex21_match_df[match_cols],
    db_api=DuckDBAPI(),
    bottom_n=5,
    top_n=10,
)

In [96]:
# Value distribution of each linkage column in the SEC filer data.
profile_columns(
    sec_match_df[match_cols],
    db_api=DuckDBAPI(),
    bottom_n=5,
    top_n=10,
)

## Blocking

It may be possible to match subsidiaries to filers without blocking, but a blocking rule is probably still desirable.

TODO: can we block on nearest 5 report years instead of exact match report year?

In [97]:
from splink import block_on
from splink.blocking_analysis import count_comparisons_from_blocking_rule, n_largest_blocks

In [103]:
# Candidate pairs must share the report year and the first three characters of
# the metaphone-encoded company name.
br = (
    "l.report_year = r.report_year"
    " and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)"
)

In [104]:
# Alternative blocking rules that were tried and are currently disabled:
#   br0 = block_on("report_year", "report_year")
#   br1 = "jaccard(l.company_name, r.company_name) < .1"
#   br2 = block_on("company_name", "company_name")

# Count the candidate pairs that `br` generates between the two datasets.
counts = count_comparisons_from_blocking_rule(
    table_or_tables=[sec_match_df, ex21_match_df],
    blocking_rule=br,
    link_type="link_only",
    db_api=db_api,
    unique_id_column_name="record_id",
)

counts

{'number_of_comparisons_generated_pre_filter_conditions': 2069828,
 'number_of_comparisons_to_be_scored_post_filter_conditions': 2069828,
 'filter_conditions_identified': '',
 'equi_join_conditions_identified': 'l.report_year = r.report_year AND SUBSTRING(l.company_name_mphone, 1, 3) = SUBSTRING(r.company_name_mphone, 1, 3)',
 'link_type_join_condition': 'where l."source_dataset" || \'-__-\' || l."record_id" < r."source_dataset" || \'-__-\' || r."record_id" and l."source_dataset" != r."source_dataset"'}

In [106]:
# The three largest blocks produced by `br`; oversized blocks dominate the
# number of comparisons.
result = n_largest_blocks(
    table_or_tables=[sec_match_df, ex21_match_df],
    blocking_rule=br,
    link_type="link_only",
    n_largest=3,
    db_api=db_api,
)

result.as_pandas_dataframe()

Unnamed: 0,key_0,key_1,count_l,count_r,block_count
0,2023,STR,68,1297,88196
1,2023,INT,62,1275,79050
2,2023,KRN,60,1290,77400


In [107]:
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)

# Analyse the same rule the model is configured with. Referencing `br` instead
# of a second hand-typed copy keeps this chart from drifting out of sync.
blocking_rules_for_analysis = [
    # block_on("substr(l.company_name_mphone,1,3)", "substr(r.company_name_mphone,1,3)"),
    br,
]


# Cumulative number of comparisons to be scored as each rule in the list is added.
cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
    table_or_tables=[sec_match_df, ex21_match_df],
    blocking_rules=blocking_rules_for_analysis,
    db_api=db_api,
    unique_id_column_name="record_id",
    link_type="link_only",
)

## Create Model

It may be worth deduplicating the Ex. 21 data first and then linking the result to SEC filers.

In [108]:
import splink.comparison_library as cl
from splink import Linker, SettingsCreator

In [109]:
# Name comparison on company_name with Splink's default Jaro-Winkler levels.
# Passing dmeta_col_name="company_name_mphone" was tried and broke the comparison.
# NOTE(review): Splink documents dmeta_col_name as a column of *arrays* of
# dmetaphone values, whereas company_name_mphone looks like a plain string in the
# outputs; presumably that is the cause. Confirm.
company_name_comparison = cl.NameComparison("company_name")
print(company_name_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'NameComparison' of "company_name".
Similarity is assessed using the following ComparisonLevels:
    - 'company_name is NULL' with SQL rule: "company_name_l" IS NULL OR "company_name_r" IS NULL
    - 'Exact match on company_name' with SQL rule: "company_name_l" = "company_name_r"
    - 'Jaro-Winkler distance of company_name >= 0.92' with SQL rule: jaro_winkler_similarity("company_name_l", "company_name_r") >= 0.92
    - 'Jaro-Winkler distance of company_name >= 0.88' with SQL rule: jaro_winkler_similarity("company_name_l", "company_name_r") >= 0.88
    - 'Jaro-Winkler distance of company_name >= 0.7' with SQL rule: jaro_winkler_similarity("company_name_l", "company_name_r") >= 0.7
    - 'All other comparisons' with SQL rule: ELSE



In [110]:
# Fuzzy comparison on location of incorporation using the default Jaro-Winkler
# thresholds. TODO: try with Levenshtein too.
location_comparison = cl.JaroWinklerAtThresholds("loc_of_incorporation")
print(location_comparison.get_comparison("duckdb").human_readable_description)

Comparison 'JaroWinklerAtThresholds' of "loc_of_incorporation".
Similarity is assessed using the following ComparisonLevels:
    - 'loc_of_incorporation is NULL' with SQL rule: "loc_of_incorporation_l" IS NULL OR "loc_of_incorporation_r" IS NULL
    - 'Exact match on loc_of_incorporation' with SQL rule: "loc_of_incorporation_l" = "loc_of_incorporation_r"
    - 'Jaro-Winkler distance of loc_of_incorporation >= 0.9' with SQL rule: jaro_winkler_similarity("loc_of_incorporation_l", "loc_of_incorporation_r") >= 0.9
    - 'Jaro-Winkler distance of loc_of_incorporation >= 0.7' with SQL rule: jaro_winkler_similarity("loc_of_incorporation_l", "loc_of_incorporation_r") >= 0.7
    - 'All other comparisons' with SQL rule: ELSE



In [111]:
# Link-only model between SEC filers and Ex. 21 subsidiaries, keyed on record_id.
# Term-frequency adjustments are switched on for loc_of_incorporation because of
# the strong skew towards Delaware noted above.
settings = SettingsCreator(
    link_type="link_only",
    unique_id_column_name="record_id",
    blocking_rules_to_generate_predictions=[br],
    comparisons=[
        company_name_comparison,
        location_comparison.configure(term_frequency_adjustments=True),
    ],
    retain_intermediate_calculation_columns=True,
)

# The linker gets its own DuckDB backend, separate from the shared `db_api`.
linker = Linker(
    [sec_match_df, ex21_match_df],
    settings,
    db_api=DuckDBAPI(),
)

Estimate the probability that two random records match.

In [112]:
# High-precision rules used to estimate the prior probability that two random
# records match; `recall` is the assumed share of true matches they capture.
deterministic_rules = [
    # block_on takes each column name once and builds the l/r equality itself.
    # Listing the column twice only duplicated the same predicate.
    block_on("company_name_mphone"),
    "jaccard(r.company_name, l.company_name) >= .9 and l.loc_of_incorporation = r.loc_of_incorporation",
    "substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and jaccard(r.company_name, l.company_name) >= .8",
    # "substr(l.company_name_mphone,1,5) = substr(r.company_name_mphone,1,5) and l.loc_of_incorporation = r.loc_of_incorporation"
]

linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.85)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Probability two random records match is estimated to be  0.000689.
This means that amongst all possible pairwise record comparisons, one in 1,452.36 are expected to match.  With 1,365,709,548 total possible comparisons, we expect a total of around 940,336.47 matching pairs


In [113]:
# Estimate u probabilities from up to 1e7 randomly sampled record pairs.
# The seed is fixed so a fresh Restart & Run All reproduces the same estimates
# (Splink supports seeded sampling on the DuckDB backend).
linker.training.estimate_u_using_random_sampling(max_pairs=1e7, seed=42)

----- Estimating u probabilities using random sampling -----


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - company_name (no m values are trained).
    - loc_of_incorporation (no m values are trained).


In [114]:
# Estimate m probabilities with EM on pairs whose metaphone-encoded names match
# exactly. block_on takes the column name once; passing it twice produced the
# duplicated "(l.x = r.x) AND (l.x = r.x)" predicate seen in the training log.
training_blocking_rule = block_on("company_name_mphone")
# NOTE: the variable name is inherited from the Splink tutorial and is kept so
# any later references still resolve; the session blocks on company_name_mphone.
training_session_fname_sname = (
    linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."company_name_mphone" = r."company_name_mphone") AND (l."company_name_mphone" = r."company_name_mphone")

Parameter estimates will be made for the following comparison(s):
    - company_name
    - loc_of_incorporation

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Iteration 1: Largest change in params was -0.213 in the m_probability of loc_of_incorporation, level `Exact match on loc_of_incorporation`
Iteration 2: Largest change in params was 0.243 in the m_probability of loc_of_incorporation, level `All other comparisons`
Iteration 3: Largest change in params was 0.0314 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.88`
Iteration 4: Largest change in params was 0.0052 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`
Iteration 5: Larges

In [115]:
# Chart the trained match weights for each comparison level.
linker.visualisations.match_weights_chart()

In [116]:
# Chart the estimated m and u probabilities behind those weights.
linker.visualisations.m_u_parameters_chart()

In [None]:
# Persist the trained model, overwriting any existing file at this path.
# NOTE(review): this rebinds `settings`, which previously held the
# SettingsCreator, to whatever save_model_to_json returns.
settings = linker.misc.save_model_to_json("../sec_ex21_model_settings/2023_model.json", overwrite=True)

TODO: Log the model in MLflow.

## Make predictions

In [120]:
# Score the blocked candidate pairs, filtered by a 0.5 match-probability threshold.
df_predictions = linker.inference.predict(threshold_match_probability=0.5)

Blocking time: 0.37 seconds


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Predict time: 78.84 seconds


In [121]:
# Materialise the Splink predictions as a pandas DataFrame for inspection.
preds_df = df_predictions.as_pandas_dataframe()

In [123]:
# Ascending sort, so the weakest accepted matches appear first and the strongest last.
preds_df.sort_values("match_probability")

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,record_id_l,record_id_r,company_name_l,company_name_r,gamma_company_name,tf_company_name_l,tf_company_name_r,bf_company_name,bf_tf_adj_company_name,loc_of_incorporation_l,loc_of_incorporation_r,gamma_loc_of_incorporation,tf_loc_of_incorporation_l,tf_loc_of_incorporation_r,bf_loc_of_incorporation,bf_tf_adj_loc_of_incorporation,report_year_l,report_year_r,company_name_mphone_l,company_name_mphone_r
5374,0.008914,0.501545,__splink__input_table_0,__splink__input_table_1,6916,7681,manitowoc co incorporated,"manitowoc crane companies, llc mcg",1,0.000005,0.000005,12.534319,1.000000,wisconsin,wisconsin,3,0.004100,0.004100,2.32178,50.180785,2023,2023,MNTWK K INKRPRTT,MNTWK KRN KMPNS LK MKK
1452,0.008914,0.501545,__splink__input_table_0,__splink__input_table_1,3995,1003,"schneider national, incorporated","33.schneider logistics, incorporated",1,0.000005,0.000005,12.534319,1.000000,wisconsin,wisconsin,3,0.004100,0.004100,2.32178,50.180785,2023,2023,SXNTR NXNL INKRPRTT,SXNTR LJSTKS INKRPRTT
4185,0.008914,0.501545,__splink__input_table_0,__splink__input_table_1,485,6819,wisconsin electric power company,wisconsin energy capital corporation,1,0.000010,0.000005,12.534319,1.000000,wisconsin,wisconsin,3,0.004100,0.004100,2.32178,50.180785,2023,2023,WSKNSN ELKTRK PWR KMPN,WSKNSN ENRJ KPTL KRPRXN
3907,0.008914,0.501545,__splink__input_table_0,__splink__input_table_1,1836,1390,"orion energy systems, incorporated","wilson funeral home, incorporated",1,0.000005,0.000005,12.534319,1.000000,wisconsin,wisconsin,3,0.004100,0.004100,2.32178,50.180785,2023,2023,ORN ENRJ SSTMS INKRPRTT,WLSN FNRL HM INKRPRTT
1426,0.008914,0.501545,__splink__input_table_0,__splink__input_table_1,3995,1010,"schneider national, incorporated","40.schneider resources, incorporated",1,0.000005,0.000005,12.534319,1.000000,wisconsin,wisconsin,3,0.004100,0.004100,2.32178,50.180785,2023,2023,SXNTR NXNL INKRPRTT,SXNTR RSRSS INKRPRTT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4672,13.232266,0.999896,__splink__input_table_0,__splink__input_table_1,6568,4608,wesbanco incorporated,"wesbanco, incorporated",3,0.000005,0.000005,35295.437753,1.000000,west virginia,west virginia,3,0.001207,0.001207,2.32178,170.429672,2023,2023,WSBNK INKRPRTT,WSBNK INKRPRTT
1829,13.257062,0.999898,__splink__input_table_0,__splink__input_table_1,497,4974,berkshire hathaway energy company,berkshire hathaway energy company,4,0.000010,0.000010,695779.273116,0.053272,iowa,iowa,3,0.001246,0.001246,2.32178,165.103745,2023,2023,BRKXR H0W ENRJ KMPN,BRKXR H0W ENRJ KMPN
6458,13.550873,0.999917,__splink__input_table_0,__splink__input_table_1,3842,749,"shiftpixy, incorporated","shiftpixy labs, incorporated",3,0.000005,0.000005,35295.437753,1.000000,wyoming,wyoming,3,0.000968,0.000968,2.32178,212.547350,2023,2023,XFTPKS INKRPRTT,XFTPKS LBS INKRPRTT
1330,13.621474,0.999921,__splink__input_table_0,__splink__input_table_1,4088,476,"securetech innovations, incorporated","securetech innovations, incorporated",4,0.000010,0.000010,695779.273116,0.053272,wyoming,wyoming,3,0.000968,0.000968,2.32178,212.547350,2023,2023,SKRTX INFXNS INKRPRTT,SKRTX INFXNS INKRPRTT


In [238]:
# List the prediction columns, including the intermediate gamma / tf / bf columns.
preds_df.columns

Index(['match_weight', 'match_probability', 'source_dataset_l',
       'source_dataset_r', 'record_id_l', 'record_id_r', 'company_name_l',
       'company_name_r', 'gamma_company_name', 'tf_company_name_l',
       'tf_company_name_r', 'bf_company_name', 'bf_tf_adj_company_name',
       'loc_of_incorporation_l', 'loc_of_incorporation_r',
       'gamma_loc_of_incorporation', 'tf_loc_of_incorporation_l',
       'tf_loc_of_incorporation_r', 'bf_loc_of_incorporation',
       'bf_tf_adj_loc_of_incorporation', 'company_name_mphone_l',
       'company_name_mphone_r', 'report_year_l', 'report_year_r'],
      dtype='object')

In [249]:
# Eyeball a window of accepted matches with the fields needed to judge them.
# With predict(threshold_match_probability=0.5) the probability mask only
# matters if that threshold is lowered.
preds_df.loc[
    preds_df["match_probability"] >= 0.5,
    [
        "match_probability",
        "company_name_l",
        "company_name_r",
        "loc_of_incorporation_l",
        "loc_of_incorporation_r",
        "company_name_mphone_l",
        "company_name_mphone_r",
    ],
].iloc[150:200]

Unnamed: 0,match_probability,company_name_l,company_name_r,loc_of_incorporation_l,loc_of_incorporation_r,company_name_mphone_l,company_name_mphone_r
150,0.996128,santander drive auto receivables trust 2018-1,santander drive auto receivables trust,delaware,delaware,SNTNTR TRF AT RSFBLS TRST,SNTNTR TRF AT RSFBLS TRST
151,0.996128,santander drive auto receivables trust 2018-5,santander drive auto receivables trust,delaware,delaware,SNTNTR TRF AT RSFBLS TRST,SNTNTR TRF AT RSFBLS TRST
152,0.996128,santander drive auto receivables trust 2018-3,santander drive auto receivables trust,delaware,delaware,SNTNTR TRF AT RSFBLS TRST,SNTNTR TRF AT RSFBLS TRST
153,0.996128,santander drive auto receivables trust 2016-1,santander drive auto receivables trust,delaware,delaware,SNTNTR TRF AT RSFBLS TRST,SNTNTR TRF AT RSFBLS TRST
154,0.573277,constellation pharmaceuticals inc,"constellation connect, llc",delaware,delaware,KNSTLXN FRMSTKLS INK,KNSTLXN KNKT LK
162,0.959568,"consolidated communications holdings, inc.",consolidated communications of,delaware,illinois,KNSLTTT KMNKXNS HLTNKS INK,KNSLTTT KMNKXNS OF
163,0.959568,"consolidated communications holdings, inc.",consolidated communications of,delaware,missouri,KNSLTTT KMNKXNS HLTNKS INK,KNSLTTT KMNKXNS OF
164,0.959568,"consolidated communications holdings, inc.",consolidated communications of,delaware,maine,KNSLTTT KMNKXNS HLTNKS INK,KNSLTTT KMNKXNS OF
165,0.959568,"consolidated communications holdings, inc.",consolidated communications of,delaware,kansas,KNSLTTT KMNKXNS HLTNKS INK,KNSLTTT KMNKXNS OF
166,0.959568,"consolidated communications holdings, inc.",consolidated communications of,delaware,minnesota,KNSLTTT KMNKXNS HLTNKS INK,KNSLTTT KMNKXNS OF
