In [None]:
from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
from splink.duckdb.blocking_rule_library import block_on
from splink.datasets import splink_datasets
import pandas as pd
from utils import format_model_m_and_us


In [None]:
# Build two disjoint random samples of the historical dataset to link together.
# Fixed seeds make both samples identical on every Restart & Run All.
SAMPLE_SIZE = 10000
SEED = 42

full_df = splink_datasets.historical_50k

# First input table: a seeded random sample of the full dataset.
df1 = full_df.sample(SAMPLE_SIZE, random_state=SEED)

# Rows that were not drawn into df1 (an anti-join on the index labels).
# NOTE(review): assumes the index labels of historical_50k are unique - confirm.
df = full_df.drop(index=df1.index)

# Second input table: sampled only from the remaining rows, so df1 and df2
# never contain the same source row.
df2 = df.sample(SAMPLE_SIZE, random_state=SEED + 1)

In [None]:
def build_settings(comparisons):
    """Return a link-only settings dict that uses the given comparisons.

    Args:
        comparisons: list of comparison definitions built with the
            comparison library (`cl`).

    Returns:
        dict with the keys "link_type", "blocking_rules_to_generate_predictions"
        and "comparisons". The blocking rules are created afresh on every call,
        so the returned dicts do not share blocking-rule objects.
    """
    return {
        "link_type": "link_only",
        "blocking_rules_to_generate_predictions": [
            block_on("first_name"),
            block_on("surname"),
        ],
        "comparisons": comparisons,
    }


# Variant 1: exact-match comparisons on every column.
exact_match_settings = build_settings(
    [
        cl.exact_match("first_name"),
        cl.exact_match("surname"),
        cl.exact_match("dob"),
        cl.exact_match("birth_place", term_frequency_adjustments=True),
    ]
)

# Variant 2: Levenshtein comparisons with a single threshold of 2 on every column.
levenshtein_settings = build_settings(
    [
        cl.levenshtein_at_thresholds("first_name", [2]),
        cl.levenshtein_at_thresholds("surname", [2]),
        cl.levenshtein_at_thresholds("dob", [2]),
        cl.levenshtein_at_thresholds("birth_place", [2]),
    ]
)

# The cells that follow read `settings`. Selecting the variant explicitly here
# replaces a silent overwrite of one dict by the other; switch to
# `exact_match_settings` to train the exact-match model instead.
settings = levenshtein_settings

In [None]:
# Create the linker over the two sampled tables, then estimate the u
# probabilities from random sampling (max_pairs caps the number of sampled pairs).
input_tables = [df1, df2]
linker = DuckDBLinker(input_tables, settings)
linker.estimate_u_using_random_sampling(max_pairs=1e6)

In [None]:
# Options shared by both expectation-maximisation training sessions.
em_options = {"estimate_without_term_frequencies": True}

# Session 1: training pairs are blocked on first_name together with surname.
blocking_rule_for_training = block_on(["first_name", "surname"])
linker.estimate_parameters_using_expectation_maximisation(
    blocking_rule_for_training,
    **em_options,
)

# Session 2: training pairs are blocked on the first four characters of dob
# (the year).
blocking_rule_for_training = block_on("substr(dob, 1, 4)")
linker.estimate_parameters_using_expectation_maximisation(
    blocking_rule_for_training,
    **em_options,
)

In [None]:
# Render the linker's m/u parameters chart as this cell's output.
m_u_chart = linker.m_u_parameters_chart()
m_u_chart

In [None]:
# Export the trained model from the linker and keep it in `model`.
# `save_settings_to_json` is the deprecated Splink 3 alias of
# `save_model_to_json`; calling the replacement directly avoids the
# deprecation warning while returning the same object.
model = linker.save_model_to_json()

In [None]:
# Tabulate the linker's m and u values with the local `utils` helper.
m_and_us_df = format_model_m_and_us(linker)

# Optional export, left disabled:
#   m_and_us_df.to_csv('outputs/linking.csv')

# Show the table as the cell output.
m_and_us_df

In [None]:
# Run the linker's predict step and keep the result; the clustering step that
# follows takes `pairwise_predictions` as its input.
pairwise_predictions = linker.predict()

In [None]:
# Threshold handed to the clustering call.
# NOTE(review): presumably a match-probability cut-off - confirm against the Splink docs.
cluster_threshold = 0.95

clusters = linker.cluster_pairwise_predictions_at_threshold(
    pairwise_predictions,
    cluster_threshold,
)

# Preview only the first five rows rather than the whole result.
cluster_preview = clusters.as_pandas_dataframe(limit=5)
cluster_preview