# Setup

In [1]:
import duckdb
from splink import Linker, DuckDBAPI, SettingsCreator, block_on
import splink.comparison_library as cl
import splink.comparison_level_library as cll

In [2]:
# DuckDB connection opened with default arguments; the same connection is
# handed to Splink's DuckDBAPI when the Linker is built.
con = duckdb.connect()

# Read the two perturbed extracts through the connection.
df_l = con.read_parquet("./perturbed_data_l.parquet")
df_r = con.read_parquet("./perturbed_data_r.parquet")

# Expose both under the table names the Linker is given as inputs.
# register() hands back the connection, so the calls chain and the value
# displayed by this cell is still the connection object.
con.register("df_l", df_l).register("df_r", df_r)

<_duckdb.DuckDBPyConnection at 0x1809b401ef0>

In [3]:
# NOTE(review): the output saved under this cell reports missing columns
# `first_name` / `surname` and a blocking rule on them. Nothing in this cell
# references those columns, so that output looks like it came from different
# settings -- re-run from a fresh kernel before trusting it.

# --- Comparisons -------------------------------------------------------------

# Geography comparison over the array columns `address` and `sa4`.
# Levels are tried in order:
#   1. null level: both the `address` and the `sa4` null tests hold
#   2. the two `address` arrays share at least one element
#   3. the two `sa4` arrays share at least one element
#   4. everything else
geography_comparison = cl.CustomComparison(
    output_column_name="geography",
    comparison_levels=[
        cll.And(cll.NullLevel("address"), cll.NullLevel("sa4")),
        cll.ArrayIntersectLevel("address", 1),
        cll.ArrayIntersectLevel("sa4", 1),
        cll.ElseLevel(),
    ],
)

# Date-of-birth comparison, strictest level first: exact match, Levenshtein
# threshold 1 on the string form of `dob`, within one month, within one year.
# `input_is_string=False` is passed to the date-difference levels
# (presumably `dob` is stored as a date type -- TODO confirm).
dob_comparison = cl.CustomComparison(
    output_column_name="dob",
    comparison_levels=[
        cll.NullLevel("dob"),
        cll.ExactMatchLevel("dob"),
        cll.LevenshteinLevel("cast(dob as string)", 1),
        cll.AbsoluteDateDifferenceLevel("dob", input_is_string=False, threshold=1, metric="month"),
        cll.AbsoluteDateDifferenceLevel("dob", input_is_string=False, threshold=1, metric="year"),
        cll.ElseLevel(),
    ],
)

# --- Model settings ----------------------------------------------------------

settings = SettingsCreator(
    link_type="link_and_dedupe",
    comparisons=[
        cl.JaroWinklerAtThresholds("fname", [0.9]),
        cl.JaroWinklerAtThresholds("sname", [0.9]),
        cl.ExactMatch("sex"),
        dob_comparison,
        geography_comparison,
    ],
    blocking_rules_to_generate_predictions=[
        block_on("fname", "sname"),
        block_on("dob"),
        # `address` is an array column, so it is exploded before blocking.
        block_on("address", arrays_to_explode=["address"]),
    ],
    retain_intermediate_calculation_columns=True,
    retain_matching_columns=True,
    # NOTE(review): the 1/92700 prior is not derived anywhere in this cell --
    # confirm where it comes from.
    probability_two_random_records_match=1/92700,
)

# --- Linker ------------------------------------------------------------------

# Both inputs are referenced by the names registered on `con`.
linker = Linker(
    input_table_or_tables=["df_l", "df_r"],
    input_table_aliases=["df_l", "df_r"],
    settings=settings,
    db_api=DuckDBAPI(con),
)

# --- Training ----------------------------------------------------------------

# Fixed seed so the u estimates are drawn from the same random sample of
# record pairs each time the notebook is re-run from a fresh kernel.
U_SAMPLING_SEED = 42
linker.training.estimate_u_using_random_sampling(max_pairs=1e7, seed=U_SAMPLING_SEED)


def run_em_session(blocking_rule):
    """Run one EM training session on ``linker`` under ``blocking_rule``.

    Every session in this notebook uses the same options
    (``estimate_without_term_frequencies=True``); only the blocking rule
    varies. Returns whatever
    ``estimate_parameters_using_expectation_maximisation`` returns.
    """
    return linker.training.estimate_parameters_using_expectation_maximisation(
        blocking_rule=blocking_rule,
        estimate_without_term_frequencies=True,
    )


# One session per blocking rule, run in this order. Each result is bound
# separately so earlier sessions stay available if a later one fails.
ts1 = run_em_session(block_on("dob"))
ts2 = run_em_session(block_on("fname", "substring(sname, 1, 2)"))
ts3 = run_em_session(block_on("fname", "address[1]"))
ts4 = run_em_session(block_on("sname", "sa4[1]"))

SETTINGS VALIDATION: Errors were identified in your settings dictionary. 

Setting: `additional_columns_to_retain`

       - Missing column(s) from input dataframe(s): `"first_name"`, `"surname"`

Invalid Columns(s) in Blocking Rule(s)

    SQL: `(l."first_name" = r."first_name") AND (l."surname" = r."surname")`
       - Missing column(s) from input dataframe(s): `first_name`, `surname`

You may want to verify your settings dictionary has valid inputs in all fields before continuing.
----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - fname (no m values are trained).
    - sname (no m values are trained).
    - sex (no m values are trained).
    - dob (no m values are trained).
    - geography (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l."dob" = r."dob"

Parameter estimates wil

SplinkException: Error executing the following sql for table `__splink__df_comparison_vectors`(__splink__df_comparison_vectors_8910d563c):
CREATE TABLE __splink__df_comparison_vectors_8910d563c AS
WITH __splink__blocked_id_pairs AS (
  SELECT
    *
  FROM __splink__blocked_id_pairs_6bcc1973f
), __splink__df_concat_with_tf AS (
  SELECT
    *
  FROM __splink__df_concat_with_tf_b1aaafb29
), blocked_with_cols AS (
  SELECT
    "l"."source_dataset" AS "source_dataset_l",
    "r"."source_dataset" AS "source_dataset_r",
    "l"."unique_id" AS "unique_id_l",
    "r"."unique_id" AS "unique_id_r",
    "l"."fname" AS "fname_l",
    "r"."fname" AS "fname_r",
    "l"."sname" AS "sname_l",
    "r"."sname" AS "sname_r",
    "l"."sex" AS "sex_l",
    "r"."sex" AS "sex_r",
    "l"."address" AS "address_l",
    "r"."address" AS "address_r",
    "l"."sa4" AS "sa4_l",
    "r"."sa4" AS "sa4_r",
    "l"."dob" AS "dob_l",
    "r"."dob" AS "dob_r",
    "l"."first_name" AS "first_name_l",
    "r"."first_name" AS "first_name_r",
    "l"."surname" AS "surname_l",
    "r"."surname" AS "surname_r",
    b.match_key
  FROM __splink__df_concat_with_tf AS l
  INNER JOIN __splink__blocked_id_pairs AS b
    ON l."source_dataset" || '-__-' || l."unique_id" = b.join_key_l
  INNER JOIN __splink__df_concat_with_tf AS r
    ON r."source_dataset" || '-__-' || r."unique_id" = b.join_key_r
)
SELECT
  "source_dataset_l",
  "source_dataset_r",
  "unique_id_l",
  "unique_id_r",
  CASE
    WHEN "fname_l" IS NULL OR "fname_r" IS NULL
    THEN -1
    WHEN "fname_l" = "fname_r"
    THEN 2
    WHEN JARO_WINKLER_SIMILARITY("fname_l", "fname_r") >= 0.9
    THEN 1
    ELSE 0
  END AS gamma_fname,
  CASE
    WHEN "sname_l" IS NULL OR "sname_r" IS NULL
    THEN -1
    WHEN "sname_l" = "sname_r"
    THEN 2
    WHEN JARO_WINKLER_SIMILARITY("sname_l", "sname_r") >= 0.9
    THEN 1
    ELSE 0
  END AS gamma_sname,
  CASE
    WHEN "sex_l" IS NULL OR "sex_r" IS NULL
    THEN -1
    WHEN "sex_l" = "sex_r"
    THEN 1
    ELSE 0
  END AS gamma_sex,
  CASE
    WHEN (
      "address_l" IS NULL OR "address_r" IS NULL
    )
    AND (
      "sa4_l" IS NULL OR "sa4_r" IS NULL
    )
    THEN -1
    WHEN ARRAY_LENGTH(LIST_INTERSECT("address_l", "address_r")) >= 1
    THEN 2
    WHEN ARRAY_LENGTH(LIST_INTERSECT("sa4_l", "sa4_r")) >= 1
    THEN 1
    ELSE 0
  END AS gamma_geography,
  match_key
FROM blocked_with_cols

Error was: Binder Error: Values list "l" does not have a column named "first_name"

LINE 27: "l"."first_name" AS "first_name_l", 
         ^

In [None]:
# Build the parameter-estimate comparison chart for `linker`, then leave it as
# the cell's last expression so the notebook displays it.
parameter_estimates_chart = linker.visualisations.parameter_estimate_comparisons_chart()
parameter_estimates_chart

In [None]:
# Build the match-weights chart for `linker`, then leave it as the cell's last
# expression so the notebook displays it.
match_weights = linker.visualisations.match_weights_chart()
match_weights