In [1]:
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.spark.spark_linker import SparkLinker
from splink.duckdb.duckdb_comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
    datediff_at_thresholds,
    jaccard_at_thresholds,
    jaro_winkler_at_thresholds,
)
from splink.spark.spark_comparison_template_library import (
    name_comparison,
)
from splink.spark.spark_comparison_library import (
    datediff_at_thresholds,
)

import pandas as pd

In [5]:
df1 = pd.read_excel('ENTER PATH HERE')
df2 = pd.read_excel('ENTER PATH HERE')

In [2]:
vlist = ['country','state','jobtitle','company_cleaned','startdate','enddate']
df = pd.read_csv('ENTER PATH HERE')[vlist]

In [3]:
df = df.loc[(df.country!='empty') & (df.state!='empty'),:].reset_index(drop=False)

In [None]:
#step1: specify a linkage model
settings = {
    "unique_id_column_name": "index",
    "link_type": "link_only", 
    "comparisons": [
        name_comparison("jobtitle"),
        name_comparison("company_cleaned"),
        datediff_at_thresholds("startdate"),
        datediff_at_thresholds("enddate"),
    ],
    "blocking_rules_to_generate_predictions": [
        "l.country = r.country" and "l.state = r.state",
    ],
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}
linker = SparkLinker(df, settings)

In [108]:
###estimate u parameter
linker.estimate_u_using_random_sampling(max_pairs=1000)

----- Estimating u probabilities using random sampling -----
u probability not trained for jobtitle - Exact match (comparison vector value: 3). This usually means the comparison level was never observed in the training data.
u probability not trained for jobtitle - Levenshtein <= 1 (comparison vector value: 2). This usually means the comparison level was never observed in the training data.
u probability not trained for jobtitle - Levenshtein <= 2 (comparison vector value: 1). This usually means the comparison level was never observed in the training data.
u probability not trained for company_cleaned - Exact match (comparison vector value: 3). This usually means the comparison level was never observed in the training data.
u probability not trained for company_cleaned - Levenshtein <= 1 (comparison vector value: 2). This usually means the comparison level was never observed in the training data.
u probability not trained for company_cleaned - Levenshtein <= 2 (comparison vector value:

In [94]:
# estimate m parameter
blocking_rule_for_training = "l.country = r.country and l.state = r.state"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.country = r.country and l.state = r.state

Parameter estimates will be made for the following comparison(s):
    - jobtitle
    - company_cleaned
    - startdate
    - enddate

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Iteration 1: Largest change in params was -0.459 in the m_probability of startdate, level `Exact match`
Iteration 2: Largest change in params was -0.361 in the m_probability of enddate, level `Exact match`
Iteration 3: Largest change in params was 0.183 in the m_probability of company_cleaned, level `All other comparisons`
Iteration 4: Largest change in params was -0.145 in the m_probability of jobtitle, level `Exact match`
Iteration 5: Largest change in params was 0.0943 in the m_probability of jobtitle, level `All other comparisons`
Iteration 6: Largest change in params was 0.0647 in the m_probabi

<EMTrainingSession, blocking on l.country = r.country and l.state = r.state, deactivating comparisons >

In [None]:
# predict linkage
df_predictions = linker.predict(threshold_match_probability=0.5)
df_predictions.as_pandas_dataframe(limit=50)

In [None]:
# samples
linker.estimate_u_using_random_sampling(max_pairs=1e6)

blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

blocking_rule_for_training = "l.dob = r.dob"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

pairwise_predictions = linker.predict()

clusters = linker.cluster_pairwise_predictions_at_threshold(pairwise_predictions, 0.95)
clusters.as_pandas_dataframe(limit=5)