In [1]:
# !pip install splink pyspark rapidfuzz

In [2]:
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

from splink.backends.spark import similarity_jar_location
from splink import SparkAPI

# Path to your custom JAR file
CUSTOM_JAR_PATH = r"C:\Users\AbhayPandey\Desktop\AP_SS\Note\scala-udf-similarity-0.1.1-shaded.jar"

conf = SparkConf()
conf.set("spark.driver.memory", "12g")
conf.set("spark.default.parallelism", "8")
conf.set("spark.sql.codegen.wholeStage", "false")
conf.set("spark.jars", f"{similarity_jar_location()},{CUSTOM_JAR_PATH}")

sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)
spark.sparkContext.setCheckpointDir("./tmp_checkpoints")


In [4]:
from pyspark.sql.types import StringType, DoubleType, ArrayType
# from pyspark.sql import callUDF


# Phonetic / normalization
spark.udf.registerJavaFunction("accent_remove", "uk.gov.moj.dash.linkage.AccentRemover", StringType())
spark.udf.registerJavaFunction("double_metaphone", "uk.gov.moj.dash.linkage.DoubleMetaphone", StringType())
spark.udf.registerJavaFunction("double_metaphone_alt", "uk.gov.moj.dash.linkage.DoubleMetaphoneAlt", StringType())

# Similarity
spark.udf.registerJavaFunction("cosine_distance", "uk.gov.moj.dash.linkage.CosineDistance", DoubleType())
spark.udf.registerJavaFunction("jaccard_similarity", "uk.gov.moj.dash.linkage.JaccardSimilarity", DoubleType())
spark.udf.registerJavaFunction("jaro_similarity", "uk.gov.moj.dash.linkage.JaroSimilarity", DoubleType())
spark.udf.registerJavaFunction("jaro_winkler_similarity", "uk.gov.moj.dash.linkage.JaroWinklerSimilarity", DoubleType())
spark.udf.registerJavaFunction("lev_damerau_distance", "uk.gov.moj.dash.linkage.LevDamerauDistance", DoubleType())

# Tokenisers
spark.udf.registerJavaFunction("qgram_tokeniser", "uk.gov.moj.dash.linkage.QgramTokeniser", StringType())
spark.udf.registerJavaFunction("q2gram_tokeniser", "uk.gov.moj.dash.linkage.Q2gramTokeniser", StringType())
spark.udf.registerJavaFunction("q3gram_tokeniser", "uk.gov.moj.dash.linkage.Q3gramTokeniser", StringType())
spark.udf.registerJavaFunction("q4gram_tokeniser", "uk.gov.moj.dash.linkage.Q4gramTokeniser", StringType())
spark.udf.registerJavaFunction("q5gram_tokeniser", "uk.gov.moj.dash.linkage.Q5gramTokeniser", StringType())
spark.udf.registerJavaFunction("q6gram_tokeniser", "uk.gov.moj.dash.linkage.Q6gramTokeniser", StringType())

# Array / explode helpers
spark.udf.registerJavaFunction("dual_array_explode", "uk.gov.moj.dash.linkage.DualArrayExplode", ArrayType(StringType()))
spark.udf.registerJavaFunction("latlong_explode", "uk.gov.moj.dash.linkage.latlongexplode", ArrayType(StringType()))

# Escaping
spark.udf.registerJavaFunction("sql_escape", "uk.gov.moj.dash.linkage.sqlEscape", StringType())


In [None]:
# spark.udf.registerJavaFunction("accent_remove", "uk.gov.moj.dash.linkage.AccentRemover", "string")
# spark.udf.registerJavaFunction("double_metaphone", "uk.gov.moj.dash.linkage.DoubleMetaphone", "string")
# spark.udf.registerJavaFunction("double_metaphone_alt", "uk.gov.moj.dash.linkage.DoubleMetaphoneAlt", "string")
# spark.udf.registerJavaFunction("cosine_distance", "uk.gov.moj.dash.linkage.CosineDistance", "double")
# spark.udf.registerJavaFunction("jaro", "uk.gov.moj.dash.linkage.JaroSimilarity", "double")
# spark.udf.registerJavaFunction("jaro_winkler", "uk.gov.moj.dash.linkage.JaroWinklerSimilarity", "double")
# spark.udf.registerJavaFunction("jaccard", "uk.gov.moj.dash.linkage.JaccardSimilarity", "double")
# spark.udf.registerJavaFunction("lev_damerau", "uk.gov.moj.dash.linkage.LevDamerauDistance", "double")
# spark.sql("SELECT accent_remove('José'), double_metaphone('Smith'), jaro('martha','marhta')").show()
# #


In [5]:
csv_path = "data.csv"   # replace with your dataset

df = spark.read.csv(csv_path, header=True, inferSchema=True)
df = df.withColumn("unique_id", monotonically_increasing_id())
df.show(5)

+--------------------+------------------+----------+---------+----------+-----------+--------+------+----------+--------------------+--------------------+-------------+-----+--------------+---------+
|           full_name|first_and_lastname|first_name|last_name|       dob|birth_place|     zip|gender|occupation|               email|               phone|      address| city|       country|unique_id|
+--------------------+------------------+----------+---------+----------+-----------+--------+------+----------+--------------------+--------------------+-------------+-----+--------------+---------+
|thomas clifford, ...|  thomas chudleigh|    thomas|chudleigh|1630-08-01|      devon|tq13 8df|  male|politician|thomaschudleigh24...|          6600795731|devon Streeet|DEVON|United Kingdom|        0|
| thomas of chudleigh|  thomas chudleigh|    thomas|chudleigh|1630-08-01|      devon|tq13 8df|  male|politician|thomaschudleigh51...|        +01-674-3942|    916 devon|DEVON|United Kingdom|        1|


In [6]:
import auto_blocking as ab

db_api = SparkAPI(spark_session=spark)

settings, roles, diagnostics, df_enhanced, deterministic_rules = ab.auto_generate_settings(df, db_api)


print("Roles:", roles)
print("Diagnostics:", diagnostics)


Roles: {'first_name': 'first_name', 'last_name': 'last_name', 'full_name': 'full_name', 'email': 'email', 'phone': 'phone', 'zip': 'zip', 'city': 'city', 'address': 'address', 'date': 'dob'}
Diagnostics: [{'rule': '<splink.internals.blocking_rule_library.ExactMatchRule object at 0x0000011974052190>', 'comparisons': 'error', 'kept': False, 'reason': 'count_comparisons_from_blocking_rule() takes 0 positional arguments but 4 were given'}, {'rule': '<splink.internals.blocking_rule_library.ExactMatchRule object at 0x0000011973FD3A90>', 'comparisons': 'error', 'kept': False, 'reason': 'count_comparisons_from_blocking_rule() takes 0 positional arguments but 4 were given'}, {'rule': '<splink.internals.blocking_rule_library.And object at 0x00000119740523D0>', 'comparisons': 'error', 'kept': False, 'reason': 'count_comparisons_from_blocking_rule() takes 0 positional arguments but 4 were given'}, {'rule': '<splink.internals.blocking_rule_library.And object at 0x0000011974051610>', 'comparisons': 

In [7]:
from splink import Linker

linker = Linker(df_enhanced, settings, db_api=db_api)

# Robust training loop using auto_blocking’s rules
try:
    linker.training.estimate_probability_two_random_records_match(
        deterministic_matching_rules=deterministic_rules,
        recall=0.95
    )
except:
    linker.training.estimate_probability_two_random_records_match(
        deterministic_matching_rules=deterministic_rules,
        recall=1.0
    )

linker.training.estimate_u_using_random_sampling(max_pairs=5e5)

# Train with EM on the first deterministic rule
if deterministic_rules:
    linker.training.estimate_parameters_using_expectation_maximisation(deterministic_rules[0])

Probability two random records match is estimated to be  4.21e-09.
This means that amongst all possible pairwise record comparisons, one in 237,562,828.25 are expected to match.  With 1,500,396,810 total possible comparisons, we expect a total of around 6.32 matching pairs
----- Estimating u probabilities using random sampling -----
u probability not trained for email_norm - Exact match on email_norm (comparison vector value: 4). This usually means the comparison level was never observed in the training data.
u probability not trained for email_norm - Exact match on username (comparison vector value: 3). This usually means the comparison level was never observed in the training data.

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name_norm (no m values are trained).
    - last_name_norm (no m values are trained).
    - city_norm (no m values are trained).
    - email_norm (some u values are not trained, no m val

In [8]:
results = linker.inference.predict(threshold_match_probability=0.9)
clusters = results.as_spark_dataframe()

clusters.show(10)

clusters.write.mode("overwrite").csv("splink_predictions.csv", header=True)


Blocking time: 0.00 seconds
Predict time: 8.95 seconds

You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'first_name_norm':
    m values not fully trained
Comparison: 'last_name_norm':
    m values not fully trained
Comparison: 'email_norm':
    m values not fully trained
Comparison: 'email_norm':
    u values not fully trained


+-----------------+------------------+-----------+-----------+-----------------+-----------------+---------------------+--------------------+--------------------+------------------+-------------------------+----------------+----------------+--------------------+--------------------+--------------------+------------------+------------------------+--------------------+--------------------+---------------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+----------------+--------------------+--------------------+-------------+--------------------+
|     match_weight| match_probability|unique_id_l|unique_id_r|first_name_norm_l|first_name_norm_r|gamma_first_name_norm|tf_first_name_norm_l|tf_first_name_norm_r|bf_first_name_norm|bf_tf_adj_first_name_norm|last_name_norm_l|last_name_norm_r|gamma_last_name_norm| tf_last_name_norm_l| tf_last_name_norm_r| bf_last_name_norm|bf_tf_adj_last_name_norm|         city_norm_l|       

In [10]:
preds = linker.inference.predict(threshold_match_probability=0.9).as_spark_dataframe()
preds.show(5)


Blocking time: 0.15 seconds
Predict time: 0.48 seconds

You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'first_name_norm':
    m values not fully trained
Comparison: 'last_name_norm':
    m values not fully trained
Comparison: 'email_norm':
    m values not fully trained
Comparison: 'email_norm':
    u values not fully trained


+-----------------+------------------+-----------+-----------+-----------------+-----------------+---------------------+--------------------+--------------------+------------------+-------------------------+----------------+----------------+--------------------+--------------------+--------------------+------------------+------------------------+--------------------+--------------------+---------------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+----------------+--------------------+--------------------+-------------+--------------------+
|     match_weight| match_probability|unique_id_l|unique_id_r|first_name_norm_l|first_name_norm_r|gamma_first_name_norm|tf_first_name_norm_l|tf_first_name_norm_r|bf_first_name_norm|bf_tf_adj_first_name_norm|last_name_norm_l|last_name_norm_r|gamma_last_name_norm| tf_last_name_norm_l| tf_last_name_norm_r| bf_last_name_norm|bf_tf_adj_last_name_norm|         city_norm_l|       

In [11]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

def create_cluster_report(df, preds, report_parquet="report.parquet", report_csv="report.csv"):
    # --- Step 1: Edges
    edges = preds.filter(F.col("match_probability") >= 0.9) \
                 .select(F.col("unique_id_l").alias("src"),
                         F.col("unique_id_r").alias("dst"))

    # --- Step 2: Components
    vertices = edges.select("src").union(edges.select("dst")).distinct() \
                   .withColumnRenamed("src", "id") \
                   .withColumn("component", F.col("id"))
    components = vertices
    changed = True
    while changed:
        updated = edges.join(components, edges.src == components.id, "inner") \
                       .select(edges.dst.alias("id"), components.component) \
                       .union(components.select("id", "component")) \
                       .groupBy("id").agg(F.min("component").alias("component"))
        changed = updated.join(components, ["id"], "left") \
                         .filter(updated.component != components.component).count() > 0
        components = updated

    # --- Step 3: Sequential cluster_id
    distinct_clusters = components.select("component").distinct() \
                                  .withColumn("cluster_id",
                                              F.row_number().over(Window.orderBy("component")))
    components = components.join(distinct_clusters, "component", "left")

    # --- Step 4: Join back with original df
    df_with_clusters = df.join(components, df.unique_id == components.id, "left") \
                         .drop("id", "component")

    # --- Step 5: Partition group
    window_spec = Window.partitionBy("cluster_id").orderBy("unique_id")
    df_with_clusters = df_with_clusters.withColumn(
        "partition_group", F.row_number().over(window_spec)
    )

    # --- Step 6: Save
    df_with_clusters.write.mode("overwrite").parquet(report_parquet)
    df_with_clusters.write.mode("overwrite").option("header", True).csv(report_csv)

    print(f"✅ Report saved to {report_parquet} and {report_csv}")
    return df_with_clusters

report_df = create_cluster_report(df, preds, "report.parquet", "report.csv")
report_df.show(5)


✅ Report saved to report.parquet and report.csv
+--------------------+------------------+----------+---------+----------+-----------+--------+------+----------+--------------------+--------------------+-------------+-----+--------------+---------+----------+---------------+
|           full_name|first_and_lastname|first_name|last_name|       dob|birth_place|     zip|gender|occupation|               email|               phone|      address| city|       country|unique_id|cluster_id|partition_group|
+--------------------+------------------+----------+---------+----------+-----------+--------+------+----------+--------------------+--------------------+-------------+-----+--------------+---------+----------+---------------+
|thomas clifford, ...|  thomas chudleigh|    thomas|chudleigh|1630-08-01|      devon|tq13 8df|  male|politician|thomaschudleigh24...|          6600795731|devon Streeet|DEVON|United Kingdom|        0|      NULL|              1|
| thomas of chudleigh|  thomas chudleigh|   

In [12]:
deduped = report_df.groupBy("cluster_id").agg(F.first("unique_id").alias("rep_id"))
deduped.write.mode("overwrite").option("header", True).csv("deduped.csv")

print("✅ Deduped representatives saved to deduped.csv")
deduped.show(5)


✅ Deduped representatives saved to deduped.csv
+----------+----------+
|cluster_id|    rep_id|
+----------+----------+
|      NULL|        26|
|         1|8589936936|
|         3|8589955787|
|         4|8589959691|
|         2|8589941305|
+----------+----------+



In [13]:
from rapidfuzz import fuzz

def check_new_record(new_record: dict, linker, threshold=0.9):
    single_df = spark.createDataFrame([new_record])
    preds = linker.inference.predict_from_dataframes(single_df, threshold_match_probability=threshold)
    pdf = preds.as_pandas_dataframe()
    if len(pdf) == 0:
        return "unique"
    score = max(pdf["match_probability"])
    if score > threshold:
        return "duplicate"
    return "potential_duplicate"

example_record = {"first_name": "John", "last_name": "Smith", "email": "john@example.com"}
print(check_new_record(example_record, linker))


AttributeError: 'LinkerInference' object has no attribute 'predict_from_dataframes'