In [9]:
import pandas as pd
import numpy as np
import string

# Main idea: family structure is represented with linked tables.
# The family table references a surname, and each address row references a family.
# The twin table references a family and holds the DOB, so persons sharing a twin row share both.
# The person table references a twin row and holds sex and first name.

def generate_data(n, n_family, n_address, seed=None):
    """Build a synthetic person-level dataset with family, twin and address structure.

    Lookup tables of random surnames and first names are generated, then
    families, twin groups and persons are drawn and joined together. Because
    addresses are left-joined on ``family_id``, a person appears once per
    address row recorded for their family, or once with missing
    ``address``/``sa4`` when the family has no address row.

    Parameters
    ----------
    n : int
        Number of persons. Also the number of twin rows and the size of each
        name lookup table.
    n_family : int
        Number of families.
    n_address : int
        Number of address rows; each is assigned to a random family.
    seed : int, numpy.random.Generator or None, optional
        Passed to ``numpy.random.default_rng``. ``None`` (the default) draws
        fresh entropy, so results differ between calls.

    Returns
    -------
    pandas.DataFrame
        Columns ``person_id``, ``family_id``, ``dob``, ``sex``, ``fname``,
        ``sname``, ``address`` and ``sa4``.
    """
    rng = np.random.default_rng(seed)

    rand_chars = np.array(list(string.ascii_uppercase), dtype="str")

    def random_names(min_len, max_len):
        # n random upper-case strings, each with a length drawn uniformly
        # from min_len..max_len (inclusive).
        return np.array([
            ''.join(rng.choice(rand_chars, size=rng.integers(min_len, max_len + 1)))
            for _ in range(n)
        ])

    # random strings between 6 and 10 in length
    sname_frame = pd.DataFrame({
        "sname_id": np.arange(n),
        "sname": random_names(6, 10),
    })

    # random strings between 4 and 7 in length
    fname_frame = pd.DataFrame({
        "fname_id": np.arange(n),
        "fname": random_names(4, 7),
    })

    address_frame = pd.DataFrame({
        "family_id": rng.integers(low=0, high=n_family, size=n_address),
        "address": rng.integers(low=0, high=n_family, size=n_address),
        "sa4": rng.integers(low=0, high=100, size=n_address),
    })

    # Nullable integers so the left join below can introduce missing values
    # without turning these columns into floats.
    address_frame["address"] = address_frame["address"].astype("Int64")
    address_frame["sa4"] = address_frame["sa4"].astype("Int64")

    # Geometric draws make low ids far more likely than high ones, so a few
    # names are shared by many records. The draw is unbounded, so wrap it into
    # the lookup table's id range; otherwise small n yields ids with no
    # matching name and the join produces missing names.
    family_frame = pd.DataFrame({
        "family_id": np.arange(n_family),
        "sname_id": rng.geometric(0.01, n_family).astype(int) % n,
    })

    # dob is drawn as a day offset from 1970-01-01 in [-10000, 15000).
    twin_frame = pd.DataFrame({
        "twin_id": np.arange(n),
        "family_id": rng.integers(low=0, high=n_family, size=n),
        "dob": pd.to_datetime(rng.integers(low=-10000, high=15000, size=n), unit="D"),
    })

    # With probability 0.03 a person reuses the previous person's twin_id and
    # therefore shares that row's family and date of birth.
    person_frame = pd.DataFrame({
        "person_id": np.arange(n),
        "twin_id": np.maximum(np.arange(n) - rng.choice([0, 1], size=n, p=[0.97, 0.03]), 0),
        "fname_id": rng.geometric(0.02, size=n).astype(int) % n,
        "sex": rng.choice(['1', '2'], size=n),
    })

    person_frame = pd.merge(person_frame, twin_frame, on=["twin_id"])
    person_frame = pd.merge(person_frame, family_frame, on=["family_id"])
    # Left joins from here on: keep every person even without a match.
    person_frame = pd.merge(person_frame, address_frame, on=["family_id"], how="left")
    person_frame = pd.merge(person_frame, fname_frame, on=["fname_id"], how="left")
    person_frame = pd.merge(person_frame, sname_frame, on=["sname_id"], how="left")

    person_frame = person_frame[["person_id", "family_id", "dob", "sex", "fname", "sname", "address", "sa4"]]

    return person_frame


def create_string_errors(series, rate, position=4, seed=None):
    """Return a copy of ``series`` with single-character errors in some strings.

    Each element is independently selected with probability ``rate``. For a
    selected string the character at ``position`` is replaced by a random
    upper-case letter or by the empty string (which deletes the character).
    Strings no longer than ``position`` get the random character appended
    instead. The random character may equal the original one, so the share of
    visibly changed values can be below ``rate``.

    Parameters
    ----------
    series : pandas.Series
        Series of strings. Missing values stay missing.
    rate : float
        Probability, in [0, 1], that an element is selected.
    position : int, optional
        Zero-based index of the character to corrupt.
    seed : int, numpy.random.Generator or None, optional
        Passed to ``numpy.random.default_rng``; ``None`` gives
        non-reproducible output.

    Returns
    -------
    pandas.Series
        New series with the same index; the input is not modified.
    """
    rng = np.random.default_rng(seed)
    result = series.copy()
    n = series.shape[0]
    # The trailing '' makes a deletion one of the possible "replacements".
    rand_chars = np.array(list(string.ascii_uppercase) + [''], dtype="str")
    error_flag = rng.choice([False, True], size=n, p=[1-rate, rate])
    replacement_chars = rng.choice(rand_chars, size=n)
    replacements = result.str.slice(stop=position) + replacement_chars + result.str.slice(start=position+1)
    # Assign by position with plain values so the result does not depend on
    # index labels being unique or on pandas' alignment rules.
    error_pos = np.flatnonzero(error_flag)
    if error_pos.size:
        result.iloc[error_pos] = replacements.to_numpy()[error_pos]
    return result


def swap_values(series, rate, seed=None):
    """Return a copy of ``series`` where some values are overwritten by other rows' values.

    ``int(rate * n)`` target positions and the same number of donor positions
    are drawn uniformly with replacement; each target receives its donor's
    original value. This is a one-way copy, not a pairwise exchange: the donor
    keeps its own value. Because positions can repeat or coincide, fewer than
    ``int(rate * n)`` values may actually change.

    Parameters
    ----------
    series : pandas.Series
        Values to perturb.
    rate : float
        Fraction of the series length used as the number of overwrites.
    seed : int, numpy.random.Generator or None, optional
        Passed to ``numpy.random.default_rng``; ``None`` gives
        non-reproducible output.

    Returns
    -------
    pandas.Series
        New series with the same index; the input is not modified.
    """
    rng = np.random.default_rng(seed)
    result = series.copy()
    n = series.shape[0]
    n_swaps = int(rate * n)
    if n_swaps == 0:
        # Nothing to do; this also avoids drawing positions from an empty series.
        return result
    swap_ids = rng.integers(n, size=n_swaps)
    donor_ids = rng.integers(n, size=n_swaps)
    # Strip the donor index with to_numpy() so the assignment is purely
    # positional and cannot be affected by label alignment.
    result.iloc[swap_ids] = series.iloc[donor_ids].to_numpy()
    return result


def create_date_typos(series, rate, seed=None):
    """Return a copy of a datetime series with random month and day typos.

    Dates are rendered as ``YYYYMMDD`` strings. ``int(rate / 2 * n)``
    positions (drawn with replacement) have their month digits overwritten
    with a random month, then the same number of independently drawn
    positions have their day digits overwritten with a random day from 01 to
    28. The strings are parsed back to datetimes; any combination that is not
    a valid date (for example 31 February after a month change) becomes
    ``NaT``. The year is never altered.

    Parameters
    ----------
    series : pandas.Series
        Datetime series (must support the ``.dt`` accessor).
    rate : float
        Overall error rate, split evenly between month and day errors.
    seed : int, numpy.random.Generator or None, optional
        Passed to ``numpy.random.default_rng``; ``None`` gives
        non-reproducible output.

    Returns
    -------
    pandas.Series
        Datetime series with the same index; the input is not modified.
    """
    rng = np.random.default_rng(seed)
    n = series.shape[0]
    n_errors = int(rate / 2 * n)
    # strftime returns a new series, so it can be edited freely.
    result_str = series.dt.strftime("%Y%m%d")

    month_sub = [f"{month:02d}" for month in range(1, 13)]
    # Days stop at 28 so that a day typo on its own never creates an invalid date.
    day_sub = [f"{day:02d}" for day in range(1, 29)]

    def overwrite_digits(start, stop, choices):
        # Replace characters [start, stop) at n_errors random positions with a
        # random entry of `choices`.
        if n_errors == 0:
            return
        error_ids = rng.integers(n, size=n_errors)
        replacement_chars = rng.choice(choices, size=n)
        replacements = (
            result_str.str.slice(stop=start)
            + replacement_chars
            + result_str.str.slice(start=stop)
        )
        # Positional assignment with plain values: no index alignment involved.
        result_str.iloc[error_ids] = replacements.iloc[error_ids].to_numpy()

    # month errors
    overwrite_digits(4, 6, month_sub)
    # day errors (applied on top of any month errors)
    overwrite_digits(6, 8, day_sub)

    result = pd.to_datetime(result_str, format="%Y%m%d", errors="coerce")
    return result


def create_missing_values(series, rate, seed=None):
    """Return a copy of ``series`` with a random subset of values set to missing.

    Exactly ``int(n * rate)`` distinct positions are chosen and set to
    ``pd.NA``.

    Parameters
    ----------
    series : pandas.Series
        Values to blank out; any index is accepted.
    rate : float
        Fraction of elements to set missing. Values above 1 make the
        without-replacement draw fail with ``ValueError``.
    seed : int, numpy.random.Generator or None, optional
        Passed to ``numpy.random.default_rng``; ``None`` gives
        non-reproducible output.

    Returns
    -------
    pandas.Series
        New series with the same index; the input is not modified.
    """
    rng = np.random.default_rng(seed)
    n = series.shape[0]
    n_missing = int(n * rate)
    result = series.copy()
    if n_missing == 0:
        return result
    # Sample positions (0..n-1), not index labels: the assignment below is
    # positional, so sampling labels only works for a default RangeIndex.
    missing_ids = rng.choice(n, size=n_missing, replace=False)
    result.iloc[missing_ids] = pd.NA
    return result


def perturb(df, drop_rate, dup_rate):
    """Create a noisy, partially duplicated version of a person-level dataset.

    Steps: collapse ``df`` to its distinct person-level rows, keep a random
    ``1 - drop_rate`` fraction of them, append a random ``dup_rate`` fraction
    of the kept rows as duplicates, inject typos and missing values, and
    finally attach each record's addresses and sa4 codes as lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``person_id``, ``family_id``, ``dob``, ``sex``,
        ``fname``, ``sname``, ``address`` and ``sa4``.
    drop_rate : float
        Fraction of distinct person-level rows to leave out.
    dup_rate : float
        Fraction of the kept rows to append again as duplicates.

    Returns
    -------
    pandas.DataFrame
        One row per (possibly duplicated) record with the person-level
        columns, a ``unique_id`` column, and list-valued ``address`` and
        ``sa4`` columns. A list column is missing for records with no
        non-missing value of that kind.
    """
    # Person-level attributes; drop_duplicates collapses the repeated rows
    # that exist when df lists several addresses per person.
    fixed_df = df[["person_id", "family_id", "dob", "sex", "fname", "sname"]].drop_duplicates()

    # Address-level attributes, re-attached by person_id further down.
    status_df = df[["person_id", "address", "sa4"]]

    # drop some records
    fixed_sample = fixed_df.sample(frac=1-drop_rate)

    # create duplicates
    dup_sample = pd.concat([
        fixed_sample,
        fixed_sample.sample(frac=dup_rate),
    ]).reset_index(drop=True)

    # so that duplicates of the same person_id have a different id
    dup_sample["unique_id"] = dup_sample.index

    # create errors in the dup_sample columns
    dup_sample["fname"] = create_string_errors(dup_sample["fname"], rate=0.1)
    dup_sample["sname"] = swap_values(dup_sample["sname"], rate=0.2)
    dup_sample["sname"] = create_string_errors(dup_sample["sname"], rate=0.1)
    dup_sample["dob"] = create_date_typos(dup_sample["dob"], rate=0.05)

    # introduce missingness
    dup_sample["fname"] = create_missing_values(dup_sample["fname"], rate=0.03)
    dup_sample["sname"] = create_missing_values(dup_sample["sname"], rate=0.01)
    dup_sample["dob"] = create_missing_values(dup_sample["dob"], rate=0.01)
    dup_sample["sex"] = create_missing_values(dup_sample["sex"], rate=0.03)

    # One row per (record, address) so address values can be blanked individually.
    status_dup_sample = pd.merge(dup_sample, status_df, on=["person_id"], how="left")

    # NOTE(review): only "address" is blanked here, not "sa4", so the two
    # lists built below can have different lengths for the same record.
    # Confirm whether that is intended.
    status_dup_sample["address"] = create_missing_values(status_dup_sample["address"], rate=0.3)

    # Collect the surviving values of each kind into one list per record.
    dup_addresses = (
        status_dup_sample.loc[status_dup_sample["address"].notna()]
        .groupby("unique_id")["address"]
        .agg(list)
    )

    dup_sa4s = (
        status_dup_sample.loc[status_dup_sample["sa4"].notna()]
        .groupby("unique_id")["sa4"]
        .agg(list)
    )

    # Left joins keep records for which no address or sa4 value survived.
    dup_aggregated = pd.merge(dup_sample, dup_addresses, on=["unique_id"], how="left")
    dup_aggregated = pd.merge(dup_aggregated, dup_sa4s, on=["unique_id"], how="left")

    return dup_aggregated


# Build the clean synthetic population once; both output files derive from it.
data = generate_data(n=100_000, n_family=40_000, n_address=80_000)

# Left file keeps every person; right file leaves out 10% of them.
# Both append 3% of their kept records as duplicates.
perturbed_data_l = perturb(data, drop_rate=0.0, dup_rate=0.03)
perturbed_data_r = perturb(data, drop_rate=0.1, dup_rate=0.03)

# Persist both perturbed datasets next to the notebook.
perturbed_data_l.to_parquet("./perturbed_data_l.parquet")
perturbed_data_r.to_parquet("./perturbed_data_r.parquet")

In [8]:
# Display the left-hand perturbed dataset.
# NOTE(review): this cell's execution count (8) is lower than that of the cell
# defining the data (9), so the output shown may come from an earlier run.
perturbed_data_l

Unnamed: 0,person_id,family_id,dob,sex,fname,sname,unique_id,address,sa4
0,76855,22936,1964-06-11,2,ETOA,ZXWEAJD,0,"[9614, 26433]","[91, 99, 54, 40]"
1,65525,33380,1997-10-02,1,VVPQ,PCCVGZQP,1,"[28525, 1226]","[99, 4, 23]"
2,28504,14685,1983-04-13,,EXJUFYE,QTYHEBTF,2,[4372],[41]
3,55743,16002,1975-03-21,2,WVIYN,ENAUZBMQJY,3,,
4,39801,10242,1973-07-23,1,EXJUFYE,XLUSPYKN,4,[34926],[34]
...,...,...,...,...,...,...,...,...,...
92695,89671,1849,1965-06-07,2,GGDZBF,RBWZXVHG,92695,"[36470, 18186, 19682]","[5, 1, 78, 49]"
92696,39505,5041,1953-08-15,2,EMVWKV,SYQIVQCWG,92696,,
92697,4426,8883,2001-07-23,2,DTHXNC,XMMXGUWZD,92697,"[16374, 37937]","[8, 57]"
92698,39064,31483,1985-03-14,1,VVPQI,MEBJMPQRF,92698,[28104],"[36, 38, 99]"
