# Setup

In [1]:
import duckdb
from splink import Linker, DuckDBAPI, SettingsCreator, block_on
import splink.comparison_library as cl
import splink.comparison_level_library as cll

In [2]:
# Open a DuckDB connection using the default arguments.
con = duckdb.connect()

# Read each perturbed parquet file and register it on the connection under
# the table name that is later passed to the Linker ('df_l' / 'df_r').
df_l = con.read_parquet("./perturbed_data_l.parquet")
con.register('df_l', df_l)

df_r = con.read_parquet("./perturbed_data_r.parquet")
con.register('df_r', df_r)

<_duckdb.DuckDBPyConnection at 0x2aa0b9b1c70>

In [3]:
# Levels making up the combined "geography" comparison, in the order given:
#   - null level: both `address` AND `sa4` are null
#   - array-intersect level on `address` (second argument 1)
#   - array-intersect level on `sa4` (second argument 1)
#   - catch-all else level
geography_levels = [
    cll.And(cll.NullLevel("address"), cll.NullLevel("sa4")),
    cll.ArrayIntersectLevel("address", 1),
    cll.ArrayIntersectLevel("sa4", 1),
    cll.ElseLevel(),
]

geography_comparison = cl.CustomComparison(
    output_column_name="geography",
    comparison_levels=geography_levels,
)

# Levels making up the "dob" comparison, in the order given:
#   - null level on `dob`
#   - exact match on `dob`
#   - Levenshtein level (second argument 1) on `dob` cast to a string
#   - absolute date difference with threshold 1, metric "month"
#   - absolute date difference with threshold 1, metric "year"
#   - catch-all else level
# `input_is_string=False` is passed to both date-difference levels.
dob_levels = [
    cll.NullLevel("dob"),
    cll.ExactMatchLevel("dob"),
    cll.LevenshteinLevel("cast(dob as string)", 1),
    cll.AbsoluteDateDifferenceLevel(
        "dob", input_is_string=False, threshold=1, metric="month"
    ),
    cll.AbsoluteDateDifferenceLevel(
        "dob", input_is_string=False, threshold=1, metric="year"
    ),
    cll.ElseLevel(),
]

dob_comparison = cl.CustomComparison(
    output_column_name="dob",
    comparison_levels=dob_levels,
)

# Comparisons used by the model: Jaro-Winkler (threshold list [0.9]) on the
# two name columns, exact match on sex, plus the two custom comparisons
# defined earlier in this cell.
model_comparisons = [
    cl.JaroWinklerAtThresholds("fname", [0.9]),
    cl.JaroWinklerAtThresholds("sname", [0.9]),
    cl.ExactMatch("sex"),
    dob_comparison,
    geography_comparison,
]

# Blocking rules used when generating predictions. The `address` rule asks
# for the `address` array column to be exploded.
prediction_blocking_rules = [
    block_on("fname", "sname"),
    block_on("dob"),
    block_on("address", arrays_to_explode=["address"]),
]

settings = SettingsCreator(
    link_type="link_and_dedupe",
    comparisons=model_comparisons,
    blocking_rules_to_generate_predictions=prediction_blocking_rules,
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
    # NOTE(review): the origin of the 92700 figure is not shown in this
    # notebook -- confirm where it comes from.
    probability_two_random_records_match=1 / 92700,
)

# Wrap the existing DuckDB connection so the linker works against the tables
# registered on it.
db_api = DuckDBAPI(con)

# Build the linker over the two registered tables; the aliases reuse the
# registered table names.
linker = Linker(
    input_table_or_tables=["df_l", "df_r"],
    input_table_aliases=["df_l", "df_r"],
    settings=settings,
    db_api=db_api,
)

def _run_em_session(blocking_rule):
    """Run one EM training session on `linker` for the given blocking rule.

    Every session in this notebook is run with
    `estimate_without_term_frequencies=True`; only the blocking rule varies.
    Returns whatever the linker's EM training call returns.
    """
    return linker.training.estimate_parameters_using_expectation_maximisation(
        blocking_rule=blocking_rule,
        estimate_without_term_frequencies=True,
    )


# Estimate u probabilities first, from random sampling capped at 1e7 pairs.
linker.training.estimate_u_using_random_sampling(max_pairs=1e7)

# Four EM sessions, run in this order, each blocking on a different rule.
ts1 = _run_em_session(block_on("dob"))
ts2 = _run_em_session(block_on("fname", "substring(sname, 1, 2)"))
ts3 = _run_em_session(block_on("fname", "address[1]"))
ts4 = _run_em_session(block_on("sname", "sa4[1]"))

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - fname (no m values are trained).
    - sname (no m values are trained).
    - sex (no m values are trained).
    - dob (no m values are trained).
    - geography (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l."dob" = r."dob"

Parameter estimates will be made for the following comparison(s):
    - fname
    - sname
    - sex
    - geography

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - dob

Iteration 1: Largest change in params was -0.349 in the m_probability of sname, level `Exact match on sname`
Iteration 2: Largest change in params was 0.0594 in the m_probability of sname, level `All other comparisons`
Iteration 3: Largest change in params was 0.00748 in t

In [4]:
# Chart comparing the parameter estimates held by the linker; the bare name
# on the last line makes the notebook display it.
param_estimates_chart = linker.visualisations.parameter_estimate_comparisons_chart()
param_estimates_chart

In [5]:
# Match weights chart for the trained model; the bare name on the last line
# makes the notebook display it.
match_weights = linker.visualisations.match_weights_chart()
match_weights

  return _tp.from_dict(dct, validate=validate)


In [6]:
# Write the trained model to ./model.json, overwriting any existing file,
# then display the value returned by the save call.
saved_model = linker.misc.save_model_to_json("./model.json", overwrite=True)
saved_model

{'link_type': 'link_and_dedupe',
 'probability_two_random_records_match': 1.0787486515641855e-05,
 'retain_matching_columns': True,
 'retain_intermediate_calculation_columns': True,
 'additional_columns_to_retain': [],
 'sql_dialect': 'duckdb',
 'linker_uid': 'bpymq0oe',
 'em_convergence': 0.0001,
 'max_iterations': 25,
 'bayes_factor_column_prefix': 'bf_',
 'term_frequency_adjustment_column_prefix': 'tf_',
 'comparison_vector_value_column_prefix': 'gamma_',
 'unique_id_column_name': 'unique_id',
 'source_dataset_column_name': 'source_dataset',
 'blocking_rules_to_generate_predictions': [{'blocking_rule': '(l."fname" = r."fname") AND (l."sname" = r."sname")',
   'sql_dialect': 'duckdb'},
  {'blocking_rule': 'l."dob" = r."dob"', 'sql_dialect': 'duckdb'},
  {'blocking_rule': 'l."address" = r."address"',
   'sql_dialect': 'duckdb',
   'arrays_to_explode': ['address']}],
 'comparisons': [{'output_column_name': 'fname',
   'comparison_levels': [{'sql_condition': '"fname_l" IS NULL OR "fname