In [1]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)

DATA_PATH = "dedup_data.csv"

df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id,id
0,mitchell,green,7.0,wallaby place,delmar,cleveland,2119,sa,19560409.0,1804974,id8413098058
1,harley,mccarthy,177.0,pridhamstreet,milton,marsden,3165,nsw,19080419.0,6089216,id0238210623
2,madeline,mason,54.0,hoseason street,lakefront retrmnt vlge,granville,4881,nsw,19081128.0,2185997,id3291377902
3,isabelle,,23.0,gundulu place,currin ga,utakarra,2193,wa,19921119.0,4314184,id2458107039
4,taylor,hathaway,7.0,yuranigh court,brentwood vlge,,4220,nsw,19991207.0,9144092,id3341618803


In [2]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   given_name     4844 non-null   object 
 1   surname        4921 non-null   object 
 2   street_number  4755 non-null   float64
 3   address_1      4846 non-null   object 
 4   address_2      4307 non-null   object 
 5   suburb         4915 non-null   object 
 6   postcode       5000 non-null   int64  
 7   state          4915 non-null   object 
 8   date_of_birth  4845 non-null   float64
 9   soc_sec_id     5000 non-null   int64  
 10  id             5000 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 429.8+ KB


In [3]:
df.isna().sum()


given_name       156
surname           79
street_number    245
address_1        154
address_2        693
suburb            85
postcode           0
state             85
date_of_birth    155
soc_sec_id         0
id                 0
dtype: int64

In [4]:
%pip install rapidfuzz networkx --quiet


Note: you may need to restart the kernel to use updated packages.


In [5]:
from rapidfuzz import fuzz
import networkx as nx
import re

TEXT_COLUMNS = [
    "given_name",
    "surname",
    "address_1",
    "address_2",
    "suburb",
    "state",
]

NUMERIC_COLUMNS = [
    "street_number",
    "postcode",
]

IDENTIFIER_COLUMNS = ["id"]
META_COLUMNS = IDENTIFIER_COLUMNS

def normalize_text(value: str) -> str:
    if pd.isna(value):
        return ""
    value = value.strip().lower()
    value = re.sub(r"[^a-z0-9\s]", "", value)
    value = re.sub(r"\s+", " ", value)
    return value

df_clean = df.copy()
for col in TEXT_COLUMNS:
    df_clean[col] = df_clean[col].map(normalize_text)

for col in NUMERIC_COLUMNS:
    df_clean[col] = df_clean[col].fillna(0).astype(str).str.strip()

df_clean["date_of_birth"] = pd.to_datetime(df_clean["date_of_birth"], errors="coerce")
df_clean["dob_str"] = df_clean["date_of_birth"].dt.strftime("%Y-%m-%d").fillna("")
df_clean["soc_sec_id"] = df_clean["soc_sec_id"].fillna("").astype(str).str.strip()

df_clean.head()


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id,id,dob_str
0,mitchell,green,7.0,wallaby place,delmar,cleveland,2119,sa,1970-01-01 00:00:00.019560409,1804974,id8413098058,1970-01-01
1,harley,mccarthy,177.0,pridhamstreet,milton,marsden,3165,nsw,1970-01-01 00:00:00.019080419,6089216,id0238210623,1970-01-01
2,madeline,mason,54.0,hoseason street,lakefront retrmnt vlge,granville,4881,nsw,1970-01-01 00:00:00.019081128,2185997,id3291377902,1970-01-01
3,isabelle,,23.0,gundulu place,currin ga,utakarra,2193,wa,1970-01-01 00:00:00.019921119,4314184,id2458107039,1970-01-01
4,taylor,hathaway,7.0,yuranigh court,brentwood vlge,,4220,nsw,1970-01-01 00:00:00.019991207,9144092,id3341618803,1970-01-01


In [6]:
from collections import defaultdict
from itertools import combinations


def build_blocks(data: pd.DataFrame, max_block_size: int = 150) -> dict:
    blocks: dict[str, list[int]] = defaultdict(list)
    for idx, row in data.iterrows():
        row_blocks = set()
        gid = row["id"]

        dob = row["dob_str"]
        given = row["given_name"]
        surname = row["surname"]
        postcode = row["postcode"]
        suburb = row["suburb"]
        state = row["state"]
        addr1 = row["address_1"]
        addr2 = row["address_2"]
        street = row["street_number"]
        ssn = row["soc_sec_id"]

        if ssn:
            row_blocks.add(f"ssn::{ssn}")

        if dob:
            row_blocks.add(f"dob::{dob}")
            if surname:
                row_blocks.add(f"dob_surname::{dob}::{surname[:4]}")
            if given:
                row_blocks.add(f"dob_given::{dob}::{given[:4]}")

        if postcode:
            row_blocks.add(f"postcode::{postcode}")
            if surname:
                row_blocks.add(f"surname_postcode::{surname[:5]}::{postcode}")

        if suburb:
            row_blocks.add(f"suburb::{suburb[:6]}")
            if street:
                row_blocks.add(f"suburb_street::{suburb[:6]}::{street}")

        if addr1:
            row_blocks.add(f"address1::{addr1[:8]}")

        if addr2:
            row_blocks.add(f"address2::{addr2[:6]}")

        if given and surname:
            row_blocks.add(f"name::{given[:4]}::{surname[:4]}")

        for key in row_blocks:
            blocks[key].append(gid)

    # prune oversized blocks
    return {k: v for k, v in blocks.items() if 1 < len(v) <= max_block_size}


blocks = build_blocks(df_clean)
len(blocks)


8510

In [7]:
def generate_candidate_pairs(blocks: dict[str, list[int]]) -> set[tuple[int, int]]:
    candidates: set[tuple[int, int]] = set()
    for records in blocks.values():
        if len(records) < 2:
            continue
        for a, b in combinations(sorted(records), 2):
            candidates.add((a, b))
    return candidates


candidate_pairs = generate_candidate_pairs(blocks)
len(candidate_pairs)


137643

In [8]:
record_lookup = df_clean.set_index("id")


def string_similarity(a: str, b: str) -> float:
    if not a and not b:
        return 1.0
    if not a or not b:
        return 0.0
    return fuzz.token_set_ratio(a, b) / 100.0


def short_string_similarity(a: str, b: str) -> float:
    if not a and not b:
        return 1.0
    if not a or not b:
        return 0.0
    return fuzz.partial_ratio(a, b) / 100.0


def exact_match_score(a: str, b: str) -> float:
    return float(a == b and a != "")


def dob_score(a: str, b: str) -> float:
    if not a and not b:
        return 0.0
    if not a or not b:
        return 0.0
    return float(a == b)


def compute_pair_features(id_a: int, id_b: int) -> dict:
    row_a = record_lookup.loc[id_a]
    row_b = record_lookup.loc[id_b]

    features = {
        "given_name": string_similarity(row_a["given_name"], row_b["given_name"]),
        "surname": string_similarity(row_a["surname"], row_b["surname"]),
        "address_1": string_similarity(row_a["address_1"], row_b["address_1"]),
        "address_2": string_similarity(row_a["address_2"], row_b["address_2"]),
        "suburb": string_similarity(row_a["suburb"], row_b["suburb"]),
        "state": short_string_similarity(row_a["state"], row_b["state"]),
        "street_number": exact_match_score(row_a["street_number"], row_b["street_number"]),
        "postcode": exact_match_score(row_a["postcode"], row_b["postcode"]),
        "dob": dob_score(row_a["dob_str"], row_b["dob_str"]),
        "soc_sec_id": exact_match_score(row_a["soc_sec_id"], row_b["soc_sec_id"]),
    }
    return features


def score_pair(features: dict) -> float:
    weights = {
        "given_name": 0.15,
        "surname": 0.2,
        "address_1": 0.15,
        "address_2": 0.05,
        "suburb": 0.1,
        "state": 0.05,
        "street_number": 0.05,
        "postcode": 0.1,
        "dob": 0.1,
        "soc_sec_id": 0.05,
    }
    total = 0.0
    weight_sum = 0.0
    for key, weight in weights.items():
        value = features[key]
        if np.isnan(value):
            continue
        total += weight * value
        weight_sum += weight
    return total / weight_sum if weight_sum else 0.0


# quick smoke-test on a sample of candidate pairs to inspect score distribution
sampled_pairs = list(candidate_pairs)[:10]
[score_pair(compute_pair_features(a, b)) for a, b in sampled_pairs]


[0.40274193548387105,
 0.33699567099567096,
 0.4081318681318682,
 0.30829248366013073,
 0.43836726998491704,
 0.3946832579185521,
 0.355669413919414,
 0.28687880746704275,
 0.4361111111111111,
 0.30448051948051946]

In [9]:
# Ensure each id maps to a single record in lookups
record_lookup = df_clean.groupby("id", as_index=True).first()


In [10]:
def score_candidates(pairs: set[tuple[int, int]]) -> pd.DataFrame:
    rows = []
    for a, b in pairs:
        features = compute_pair_features(a, b)
        score = score_pair(features)
        row = {"id_a": a, "id_b": b, "score": score}
        row.update({f"feat_{k}": v for k, v in features.items()})
        rows.append(row)
    return pd.DataFrame(rows)


pair_scores = score_candidates(candidate_pairs)
pair_scores.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
score,137643.0,0.414112,0.113741,0.074725,0.355831,0.398901,0.442007,1.0
feat_given_name,137643.0,0.520355,0.364013,0.0,0.2,0.375,1.0,1.0
feat_surname,137643.0,0.466399,0.3672,0.0,0.166667,0.307692,1.0,1.0
feat_address_1,137643.0,0.405192,0.232825,0.0,0.266667,0.344828,0.518519,1.0
feat_address_2,137643.0,0.28397,0.26915,0.0,0.111111,0.25,0.346154,1.0
feat_suburb,137643.0,0.358656,0.275615,0.0,0.190476,0.272727,0.384615,1.0
feat_state,137643.0,0.350286,0.426212,0.0,0.0,0.0,0.666667,1.0
feat_street_number,137643.0,0.049868,0.217673,0.0,0.0,0.0,0.0,1.0
feat_postcode,137643.0,0.117078,0.321515,0.0,0.0,0.0,0.0,1.0
feat_dob,137643.0,0.981859,0.133462,0.0,1.0,1.0,1.0,1.0


In [11]:
pair_scores["score"].quantile([0.5, 0.75, 0.9, 0.95, 0.98, 0.99])


0.50    0.398901
0.75    0.442007
0.90    0.488044
0.95    0.549212
0.98    0.877273
0.99    0.940000
Name: score, dtype: float64

In [12]:
pair_scores.sort_values("score", ascending=False).head(10)


Unnamed: 0,id_a,id_b,score,feat_given_name,feat_surname,feat_address_1,feat_address_2,feat_suburb,feat_state,feat_street_number,feat_postcode,feat_dob,feat_soc_sec_id
58721,id2771151286,id7564789037,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
77770,id1134996395,id7564789037,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
69615,id3858736066,id9891489195,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
36037,id0037695494,id6774912021,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
53511,id5135938584,id5849143190,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
44531,id5164018035,id7878381376,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
26112,id1007588121,id8319904244,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
13750,id8065413782,id8630422755,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
51240,id0501721039,id7214885021,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
87042,id0331654255,id5895123624,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
THRESHOLD = 0.82

matching_pairs = pair_scores[pair_scores["score"] >= THRESHOLD][["id_a", "id_b", "score"]]
matching_pairs.head()


Unnamed: 0,id_a,id_b,score
64,id1530352515,id1792861605,0.981667
173,id4831876888,id5050531950,0.988889
174,id0901159270,id6605247529,0.964706
252,id2873064750,id4685037158,0.888333
288,id0763232915,id2152618654,0.885714


In [14]:
len(matching_pairs)


3925

In [15]:
graph = nx.Graph()

graph.add_nodes_from(df_clean["id"].tolist())
graph.add_weighted_edges_from(matching_pairs.itertuples(index=False, name=None))

components = list(nx.connected_components(graph))
len(components)


2650

In [16]:
# Create prediction format directly: id, group_id
cluster_assignments = []
for cluster_idx, component in enumerate(components, start=0):
    for record_id in component:
        cluster_assignments.append({"id": record_id, "group_id": f"group_{cluster_idx}"})

clusters_df = pd.DataFrame(cluster_assignments)
clusters_df.head()


Unnamed: 0,id,group_id
0,id8413098058,group_0
1,id6520273146,group_1
2,id8143226895,group_1
3,id9846937444,group_1
4,id7377573832,group_1


In [18]:
# Create prediction dataframe in the required format
df_with_clusters = df.merge(clusters_df, on="id", how="left")

# Handle any records not in any cluster (shouldn't happen, but just in case)
# Assign them unique group_ids starting from the next available number
if df_with_clusters["group_id"].isna().any():
    next_group_num = len(components)
    for idx in df_with_clusters[df_with_clusters["group_id"].isna()].index:
        df_with_clusters.loc[idx, "group_id"] = f"group_{next_group_num}"
        next_group_num += 1

# Keep only id and group_id columns for prediction format
df_with_clusters = df_with_clusters[["id", "group_id"]]
df_with_clusters.head(10)


Unnamed: 0,id,group_id
0,id8413098058,group_0
1,id0238210623,group_1
2,id3291377902,group_2
3,id2458107039,group_3
4,id3341618803,group_4
5,id0914241922,group_5
6,id3477271049,group_6
7,id0006375933,group_7
8,id3782870424,group_8
9,id5536636731,group_9


In [19]:
cluster_sizes = df_with_clusters.groupby("group_id").size().reset_index(name="count").sort_values("count", ascending=False)
cluster_sizes.head(10)


Unnamed: 0,group_id,count
795,group_1713,6
2023,group_434,6
1080,group_1970,6
2062,group_47,6
2119,group_520,6
2537,group_898,6
414,group_1370,6
2015,group_427,6
1889,group_313,6
432,group_1387,6


In [20]:
multi_record_clusters = cluster_sizes[cluster_sizes["count"] > 1]
len(multi_record_clusters), multi_record_clusters.head()


(1068,
         group_id  count
 795   group_1713      6
 2023   group_434      6
 1080  group_1970      6
 2062    group_47      6
 2119   group_520      6)

In [22]:
example_cluster_id = multi_record_clusters.iloc[0]["group_id"]
df_with_clusters[df_with_clusters["group_id"] == example_cluster_id].sort_values("id")


Unnamed: 0,id,group_id
4982,id2276785158,group_1713
3360,id4104455227,group_1713
3502,id4256160746,group_1713
3921,id6158154643,group_1713
4204,id6860369956,group_1713
2502,id8842350966,group_1713


In [23]:
OUTPUT_PATH = "prediction.csv"
df_with_clusters.to_csv(OUTPUT_PATH, index=False)
OUTPUT_PATH


'prediction.csv'

## Notes on Deduplication Strategy

- **Preprocessing**: Normalized text fields (casefolding, punctuation stripping) and standardized numeric/date identifiers.
- **Blocking**: Generated multiple blocking keys (DOB, postcode, name fragments, SSN, suburb) to limit candidate pairs while capturing potential duplicates.
- **Scoring**: Used RapidFuzz similarity metrics and rule-based exact matches to compute a weighted similarity score per candidate pair.
- **Clustering**: Treated high-scoring pairs as edges in a graph and used connected components to assign cluster IDs representing deduplicated persons.
- **Output**: Saved enriched dataset with `cluster_id` assignments to `dedup_clusters.csv` for downstream use.
