In [None]:
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
# NOTE(review): pyspark does not appear to be used in the visible cells of this
# DuckDB-based notebook — confirm whether this import can be dropped.
import pyspark.sql.functions as F
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl
from splink.datasets import splink_datasets
from splink.duckdb.blocking_rule_library import block_on
from splink.duckdb.linker import DuckDBLinker

from utils import format_model_m_and_us, plot_m_and_u

In [None]:
# Number of records to keep and the seed used to draw them. The seed makes the
# sample reproducible: without it every Restart & Run All trains on a different
# subset of rows and the downstream parameters/outputs change between runs.
SAMPLE_SIZE = 40000
RANDOM_SEED = 42

# Load the historical_50k demo dataset and keep a fixed random subset of it.
df = splink_datasets.historical_50k.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)

In [None]:
# Columns that receive both a prediction blocking rule and a name comparison.
name_columns = ("first_name", "surname")

# Model specification handed to the linker: a dedupe-only job, one blocking
# rule per name column, and one comparison each for first name, surname,
# date of birth and birth place.
settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": [
        block_on(column) for column in name_columns
    ],
    "comparisons": [
        *(ctl.name_comparison(column) for column in name_columns),
        # dob is passed with string-to-date casting and invalid-dates-as-null enabled.
        ctl.date_comparison(
            "dob",
            cast_strings_to_date=True,
            invalid_dates_as_null=True,
        ),
        # birth_place is compared with Jaro-Winkler thresholds and has
        # term-frequency adjustments switched on.
        cl.jaro_winkler_at_thresholds(
            "birth_place",
            term_frequency_adjustments=True,
        ),
    ],
}

In [None]:
# Upper bound on the number of record pairs passed to the u estimation below.
max_pairs_for_u_estimation = 1e6

# Build the DuckDB-backed linker from the sampled data and the settings dict,
# then estimate the u parameters via random sampling.
linker = DuckDBLinker(df, settings)
linker.estimate_u_using_random_sampling(max_pairs=max_pairs_for_u_estimation)

In [None]:
# Keyword arguments shared by both expectation-maximisation passes below.
em_training_options = {"estimate_without_term_frequencies": True}

# Pass 1: train on pairs blocked on first name + surname.
blocking_rule_for_training = block_on(["first_name", "surname"])
linker.estimate_parameters_using_expectation_maximisation(
    blocking_rule_for_training,
    **em_training_options,
)

# Pass 2: train on pairs blocked on the first four characters of dob.
blocking_rule_for_training = block_on("substr(dob, 1, 4)")  # block on year
linker.estimate_parameters_using_expectation_maximisation(
    blocking_rule_for_training,
    **em_training_options,
)

In [None]:
# Display the linker's m/u parameters chart; as the last expression in the
# cell, the returned chart object is what the notebook renders.
linker.m_u_parameters_chart()

In [None]:
# Export the linker's current settings (no output path is given, so nothing is
# requested to be written to disk here) and keep the result as `model`.
# NOTE(review): newer Splink 3 releases provide save_model_to_json() — confirm
# whether save_settings_to_json() is deprecated in the version pinned here.
model = linker.save_settings_to_json()
# Show the top-level keys of the exported model as the cell output.
model.keys()

In [None]:
# Build the m/u table with the helper imported from the local `utils` module
# (presumably a pandas DataFrame, given the .to_csv call below — confirm).
# NOTE(review): `model`, created in the previous cell, is not passed to
# format_model_m_and_us() — confirm the helper really takes no arguments.
m_and_us_df = format_model_m_and_us()

# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails on this cell.
m_and_us_output_path = Path("outputs/less_data.csv")
m_and_us_output_path.parent.mkdir(parents=True, exist_ok=True)
m_and_us_df.to_csv(m_and_us_output_path)

In [None]:
# Score candidate record pairs with the trained model; the result is fed to
# the clustering step in the next cell.
pairwise_predictions = linker.predict()

In [None]:
# Threshold passed to the clustering call below.
cluster_threshold = 0.95

# Group the pairwise predictions into clusters at that threshold, then preview
# the first five rows as a pandas DataFrame.
clusters = linker.cluster_pairwise_predictions_at_threshold(
    pairwise_predictions,
    cluster_threshold,
)
clusters.as_pandas_dataframe(limit=5)