In [None]:
from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl
from splink.duckdb.blocking_rule_library import block_on
from splink.datasets import splink_datasets
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import matplotlib as mpl



In [None]:
# Load the `historical_50k` example dataset exposed by `splink_datasets`.
# `df` is the input table handed to the linker in a later cell.
df = splink_datasets.historical_50k
# Bare last expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Model specification passed to DuckDBLinker in the next cell.
settings = {
    "link_type": "dedupe_only",
    # Blocking rules used when generating predictions: one on first name,
    # one on surname.
    "blocking_rules_to_generate_predictions": [
        block_on("first_name"),
        block_on("surname"),
    ],
    # One comparison per column that is scored.
    "comparisons": [
        ctl.name_comparison("first_name"),
        ctl.name_comparison("surname"),
        ctl.date_comparison(
            "dob",
            cast_strings_to_date=True,
            invalid_dates_as_null=True,
        ),
        cl.jaro_winkler_at_thresholds(
            "birth_place",
            term_frequency_adjustments=True,
        ),
    ],
}

In [None]:
# Build the DuckDB-backed linker over `df` using the settings dict defined above.
linker = DuckDBLinker(df, settings)
# Estimate the u probabilities from randomly sampled record pairs, with
# `max_pairs` set to one million.
# NOTE(review): no seed is passed, so the estimates may differ between runs —
# confirm whether reproducibility matters for this notebook.
linker.estimate_u_using_random_sampling(max_pairs=1e6)

In [None]:
# EM training pass 1: block on first_name + surname.
blocking_rule_for_training = block_on(["first_name", "surname"])
linker.estimate_parameters_using_expectation_maximisation(
    blocking_rule_for_training,
    estimate_without_term_frequencies=True,
)

# EM training pass 2: block on the first four characters of dob.
# The variable is deliberately rebound; only this second rule remains in scope.
blocking_rule_for_training = block_on("substr(dob, 1, 4)")  # block on year
linker.estimate_parameters_using_expectation_maximisation(
    blocking_rule_for_training,
    estimate_without_term_frequencies=True,
)

In [None]:
def format_model_m_and_us(model=None):
  """Tabulate the m and u probabilities of every comparison level of a model.

  Args:
    model: Settings dictionary shaped like the result of
      ``linker.save_settings_to_json()``: a ``'comparisons'`` list whose
      entries carry ``'output_column_name'`` and ``'comparison_levels'``.
      When omitted, the settings are read from the notebook-level ``linker``.

  Returns:
    pandas.DataFrame with one row per comparison level that has an
    ``m_probability`` and/or a ``u_probability``, and the columns
    ``variable``, ``sql_condition``, ``m_probability``, ``u_probability``
    and ``1-u``. A probability missing from a level is reported as NaN.
    Note that ``sql_condition`` holds the level's ``label_for_charts`` text;
    the column name is kept so existing consumers of the table keep working.

  Raises:
    KeyError: if a comparison or a reported level lacks one of the keys
      named above.
  """
  if model is None:
    model = linker.save_settings_to_json()

  columns = ['variable', 'sql_condition', 'm_probability', 'u_probability']
  missing = float('nan')
  rows = []
  for comparison in model['comparisons']:
    for level in comparison['comparison_levels']:
      # Levels carrying neither probability are left out.
      if 'm_probability' not in level and 'u_probability' not in level:
        continue
      # Each level gets its own row, so a level that only has a u value can
      # never overwrite the u value of the level that came before it.
      rows.append({
        'variable': comparison['output_column_name'],
        'sql_condition': level['label_for_charts'],
        'm_probability': level.get('m_probability', missing),
        'u_probability': level.get('u_probability', missing),
      })

  # Passing `columns` keeps the schema stable even when `rows` is empty.
  m_and_us = pd.DataFrame(rows, columns=columns)
  m_and_us['1-u'] = 1 - m_and_us['u_probability']
  return m_and_us

# COMMAND ----------

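# Display the m and u parameters.
# NOTE: the commented-out example below is stale — it passes three string
# arguments that do not match the format_model_m_and_us definition above.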
# test_m_and_us = format_model_m_and_us('mps_enhancement_collab','firebreak_splink_models','xz_231016')
# test_m_and_us1 = test_m_and_us[test_m_and_us['sql_condition'] == 'levenshtein <= 2']
# test_m_and_us2 = test_m_and_us[test_m_and_us['sql_condition'] == 'Exact match']

# display(test_m_and_us)

# COMMAND ----------

def plot_m_and_u(df, ax, marker=''):
  """Scatter m-probability against 1 - u-probability, one colour per variable.

  Args:
    df: DataFrame with the columns ``variable``, ``1-u`` and
      ``m_probability``.
    ax: Axes-like object to draw on; its ``scatter``, ``set_xlabel`` and
      ``set_ylabel`` methods are used.
    marker: Marker forwarded to ``ax.scatter``. The default empty string is
      treated as "not specified" and falls back to Matplotlib's default
      scatter marker, because Matplotlib draws ``''`` as no marker at all and
      the points would otherwise be invisible.
  """
  palette = list(mpl.colors.TABLEAU_COLORS)
  # An empty-string marker means "nothing" to Matplotlib; None selects the default.
  scatter_marker = None if isinstance(marker, str) and marker == '' else marker

  # Axis labels do not depend on the group, so set them once.
  ax.set_xlabel('1 - u-probability')
  ax.set_ylabel('m-probability')

  for i, (variable, group) in enumerate(df.groupby('variable')):
    ax.scatter(
      group['1-u'],
      group['m_probability'],
      # Wrap around the palette instead of raising IndexError when there are
      # more variables than colours.
      c=palette[i % len(palette)],
      label=variable,
      marker=scatter_marker,
    )

In [None]:
# Render the linker's built-in chart of the model's m and u parameters as this
# cell's output.
linker.m_u_parameters_chart()

In [None]:
# Export the linker's current settings; the result is used as a mapping below.
model = linker.save_settings_to_json()
# Show the top-level keys of the exported settings.
model.keys()

In [None]:
# Tabulate the model's m/u probabilities with the helper defined earlier.
m_and_us_df = format_model_m_and_us()
# Persist the table to the working directory. With pandas' default arguments
# the DataFrame index is written as the first CSV column.
m_and_us_df.to_csv('first_test.csv')

In [None]:
# Run the linker's prediction step; the result is fed to clustering in the
# next cell.
pairwise_predictions = linker.predict()

In [None]:
# Cluster the pairwise predictions at a threshold of 0.95
# (presumably a match-probability cut-off — TODO confirm against the splink docs).
clusters = linker.cluster_pairwise_predictions_at_threshold(pairwise_predictions, 0.95)
# Preview the first 5 rows of the clustering result as a pandas DataFrame.
clusters.as_pandas_dataframe(limit=5)