In [39]:
from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl
from splink.duckdb.blocking_rule_library import block_on
from splink.datasets import splink_datasets
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import matplotlib as mpl



In [2]:
df = splink_datasets.fake_1000
df

Unnamed: 0,unique_id,first_name,surname,dob,city,email,cluster
0,0,Robert,Alan,1971-06-24,,robert255@smith.net,0
1,1,Robert,Allen,1971-05-24,,roberta25@smith.net,0
2,2,Rob,Allen,1971-06-24,London,roberta25@smith.net,0
3,3,Robert,Alen,1971-06-24,Lonon,,0
4,4,Grace,,1997-04-26,Hull,grace.kelly52@jones.com,1
...,...,...,...,...,...,...,...
995,995,Emma,Lloyd,2018-04-17,nLonon,emmalloyd@jimenez.com,249
996,996,Emma,Lloyd,2018-04-17,London,emmalloyd@jimenez.com,249
997,997,Oliver,Bird,2000-01-27,Reading,o.b@smith.net,250
998,998,Oliver,Bird,2000-02-27,,oliver.b@smith.net,250


In [3]:
settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": [
        block_on("first_name"),
        block_on("surname"),
    ],
    "comparisons": [
        ctl.name_comparison("first_name"),
        ctl.name_comparison("surname"),
        ctl.date_comparison("dob", cast_strings_to_date=True),
        cl.exact_match("city", term_frequency_adjustments=True),
        ctl.email_comparison("email", include_username_fuzzy_level=False),
    ],
}

In [4]:
linker = DuckDBLinker(df, settings)
linker.estimate_u_using_random_sampling(max_pairs=1e6)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - surname (no m values are trained).
    - dob (no m values are trained).
    - city (no m values are trained).
    - email (no m values are trained).


In [5]:
blocking_rule_for_training = block_on(["first_name", "surname"])
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, estimate_without_term_frequencies=True)

blocking_rule_for_training = block_on("substr(dob, 1, 4)")  # block on year
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, estimate_without_term_frequencies=True)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."first_name" = r."first_name") AND (l."surname" = r."surname")

Parameter estimates will be made for the following comparison(s):
    - dob
    - city
    - email

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
    - surname

Iteration 1: Largest change in params was -0.491 in the m_probability of dob, level `Exact match`
Iteration 2: Largest change in params was 0.119 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.0229 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.00494 in probability_two_random_records_match
Iteration 5: Largest change in params was 0.00114 in probability_two_random_records_match
Iteration 6: Largest change in params was 0.000272 in probability_two_random_records_match
Iteration 7: Largest change in params was

<EMTrainingSession, blocking on SUBSTR(l."dob", 1, 4) = SUBSTR(r."dob", 1, 4), deactivating comparisons dob>

In [41]:
def format_model_m_and_us():
  model  = linker.save_settings_to_json()
  m_and_us = []
  for variable in model['comparisons']:
    for level in variable['comparison_levels']:
      if 'm_probability' in level:
        m_and_us.append({'variable':variable['output_column_name'],'sql_condition':level['label_for_charts'],'m_probability':level['m_probability']}) 
      if 'u_probability' in level:
        m_and_us[-1]['u_probability'] = level['u_probability']
  m_and_us = pd.DataFrame(m_and_us)
  m_and_us = m_and_us[['variable','sql_condition','m_probability','u_probability']]
  m_and_us['1-u'] = 1-m_and_us['u_probability']
  return m_and_us

# COMMAND ----------

# display m and u parameters
# test_m_and_us = format_model_m_and_us('mps_enhancement_collab','firebreak_splink_models','xz_231016')
# test_m_and_us1 = test_m_and_us[test_m_and_us['sql_condition'] == 'levenshtein <= 2']
# test_m_and_us2 = test_m_and_us[test_m_and_us['sql_condition'] == 'Exact match']

# display(test_m_and_us)

# COMMAND ----------

def plot_m_and_u(df,ax,marker=''):
  groups = df.groupby('variable')
  i=0
  for variable, group in groups:
    ax.scatter(group['1-u'], group.m_probability, c = list(mpl.colors.TABLEAU_COLORS.keys())[i], label=variable,marker=marker)
    ax.set_xlabel('1 - u-probability')
    ax.set_ylabel('m-probability')
    i+=1  

In [32]:
model = linker.save_settings_to_json()
model.keys()

/tmp/ipykernel_4782/1310832714.py:1: SplinkDeprecated: This function is deprecated. Use save_model_to_json() instead.
  model = linker.save_settings_to_json()


dict_keys(['link_type', 'blocking_rules_to_generate_predictions', 'comparisons', 'sql_dialect', 'linker_uid', 'probability_two_random_records_match'])

In [42]:
m_and_us_df = format_model_m_and_us()
m_and_us_df

/tmp/ipykernel_4782/1360689405.py:2: SplinkDeprecated: This function is deprecated. Use save_model_to_json() instead.
  model  = linker.save_settings_to_json()


Unnamed: 0,variable,sql_condition,m_probability,u_probability,1-u
0,first_name,Exact match first_name,0.555474,0.005794,0.994206
1,first_name,Damerau_levenshtein <= 1,0.20102,0.002366,0.997634
2,first_name,Jaro_winkler_similarity >= 0.9,0.02967,0.001296,0.998704
3,first_name,Jaro_winkler_similarity >= 0.8,0.079039,0.005678,0.994322
4,first_name,All other comparisons,0.134797,0.984867,0.015133
5,surname,Exact match surname,0.493083,0.00489,0.99511
6,surname,Damerau_levenshtein <= 1,0.24539,0.002755,0.997245
7,surname,Jaro_winkler_similarity >= 0.9,0.054987,0.001009,0.998991
8,surname,Jaro_winkler_similarity >= 0.8,0.062944,0.003711,0.996289
9,surname,All other comparisons,0.143597,0.987635,0.012365


In [None]:
pairwise_predictions = linker.predict()

In [None]:
clusters = linker.cluster_pairwise_predictions_at_threshold(pairwise_predictions, 0.95)
clusters.as_pandas_dataframe(limit=5)