In [None]:
import numpy as np
import pandas as pd
import surgeo
import matplotlib.pyplot as plt
import json

In [None]:
# Read the zip-to-zip distance matrix from its .npy file into memory.
distances = np.load(file='zip_to_zip_distances.npy')

In [None]:
# Load the zctas index: the sequence of ZCTA codes whose positions are used
# below to address rows/columns of `distances`.
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open('zctas.json', encoding='utf-8') as zctas_file:
    zctas = json.load(zctas_file)

In [None]:
def get_zip_in_range(zcta, zctas, distances, dist_min, dist_max):
    """Draw a random ZCTA whose distance from `zcta` lies in [dist_min, dist_max].

    Args:
        zcta: Starting code. It must be an element of `zctas`; otherwise
            `zctas.index` raises ValueError.
        zctas: Sequence of codes. Position i is used as row/column i of
            `distances`.
        distances: 2-D array indexed as [from_position, to_position].
        dist_min: Lower bound of the distance band (inclusive).
        dist_max: Upper bound of the distance band (inclusive).

    Returns:
        A code picked uniformly (via the global `np.random` state) among all
        codes inside the band, or `zcta` itself when the band is empty.

    NOTE(review): because both bounds are inclusive, a band starting at 0 can
    contain the starting code itself if its self-distance is 0 - confirm that
    this is intended.
    """
    row = distances[zctas.index(zcta), :]
    far_enough = row >= dist_min
    near_enough = row <= dist_max
    matches = np.flatnonzero(far_enough & near_enough)

    if matches.size > 0:
        return zctas[np.random.choice(matches)]

    # Empty band: hand back the input unchanged.
    return zcta

def get_zip_with_error(zcta, zctas, distances, err_distances, err_probs):
    """Replace `zcta` by a random code from a randomly chosen distance band.

    A band number is drawn with probabilities `err_probs`. Band k spans
    [err_distances[k-1], err_distances[k]], with band 0 starting at 0. The
    actual pick inside the band is delegated to `get_zip_in_range`.

    Args:
        zcta: Starting code.
        zctas: Sequence of codes aligned with `distances`.
        distances: Distance matrix passed through to `get_zip_in_range`.
        err_distances: Upper edge of each band, one entry per band.
        err_probs: Probability of each band; same length as `err_distances`.

    Returns:
        Whatever `get_zip_in_range` returns for the drawn band.
    """
    band = np.random.choice(np.arange(len(err_distances)), p=err_probs)
    upper = err_distances[band]
    # The first band has no predecessor, so it starts at distance 0.
    lower = err_distances[band - 1] if band != 0 else 0
    return get_zip_in_range(zcta, zctas, distances, lower, upper)

In [None]:
# Upper edges of the error-distance bands: 5, 10, ..., 100, then 100 once more.
# NOTE(review): the repeated 100 makes the last band [100, 100], which will
# almost never contain a candidate - confirm whether an open-ended ">100" band
# was intended.
dist = [*np.linspace(5, 100, 20), 100]

# Cumulative probability that the error falls at or below each edge in `dist`.
probs = [.59, .63, .68, .72, .76, .78, .81, .83, .85, .86, .87, .88, .89, .895, .9, .905, .91, .9125, .925, .93, 1]

# Per-band probabilities: the first cumulative value followed by successive differences.
p = [probs[0]] + [upper - lower for lower, upper in zip(probs, probs[1:])]

In [None]:
# Load the filtered voter file (6975377 obs per the original note) and store
# the ZCTA column as strings so it can be compared with the entries of `zctas`.
# NOTE(review): going through int drops leading zeros (e.g. "01001" -> "1001");
# confirm that the codes in zctas.json use the same unpadded form.
voters = pd.read_csv('data.csv').assign(
    zcta=lambda frame: frame['zcta'].astype(int).astype(str)
)

In [None]:
import numpy as np
import pandas as pd

def perturb_zipcodes_batch(voters, zctas, distances, dist, p, perturb_ratio=0.05, random_state=42,
                           max_stalled_rounds=1000):
    """Return a copy of `voters` with a `Swapped_zcta` column holding perturbed ZCTAs.

    `Swapped_zcta` starts as a copy of `zcta`. Voters are then sampled at
    random and their code is replaced via `get_zip_with_error` until exactly
    floor(perturb_ratio * len(voters)) rows carry a code different from the
    original. A draw that returns the original code counts as a failure, and
    further voters are sampled to make up the shortfall.

    Args:
        voters: DataFrame with a `zcta` column. It is not modified.
        zctas: Sequence of ZCTA codes aligned with `distances`.
        distances: Distance matrix forwarded to `get_zip_with_error`.
        dist: Upper edges of the error-distance bands.
        p: Probability of each band in `dist`.
        perturb_ratio: Share of rows to perturb, between 0 and 1.
        random_state: Seed applied to the global `np.random` state.
        max_stalled_rounds: Number of consecutive top-up rounds allowed to
            make no progress before giving up. Pass None to retry forever.

    Returns:
        The copied DataFrame including the `Swapped_zcta` column.

    Raises:
        ValueError: If `perturb_ratio` is outside [0, 1].
        RuntimeError: If the target count cannot be reached because
            `max_stalled_rounds` consecutive rounds changed nothing.
    """
    np.random.seed(random_state)

    if not 0 <= perturb_ratio <= 1:
        raise ValueError(f"perturb_ratio must be between 0 and 1, got {perturb_ratio!r}")

    n_total = len(voters)
    n_perturb = int(np.floor(perturb_ratio * n_total))

    voters = voters.copy()
    voters['Swapped_zcta'] = voters['zcta']

    successful_perturb = []

    def _try_perturb(candidate_idx):
        """Attempt one swap per index; record the indices whose code actually changed."""
        for idx in candidate_idx:
            original_zcta = voters.at[idx, 'zcta']
            swapped = get_zip_with_error(original_zcta, zctas, distances, dist, p)

            if swapped != original_zcta:
                voters.at[idx, 'Swapped_zcta'] = swapped
                successful_perturb.append(idx)

    # Step 1: randomly pre-select the requested share of voters and try them all.
    _try_perturb(np.random.choice(voters.index, n_perturb, replace=False))

    # Step 2: while the target is not met, top up from voters not yet perturbed.
    all_idx = None
    stalled_rounds = 0
    while len(successful_perturb) < n_perturb:
        if all_idx is None:
            # Built lazily and only once: the index never changes between rounds.
            all_idx = set(voters.index)

        needed = n_perturb - len(successful_perturb)
        available_idx = list(all_idx - set(successful_perturb))
        new_selected = np.random.choice(available_idx, needed, replace=False)

        before = len(successful_perturb)
        _try_perturb(new_selected)

        if len(successful_perturb) > before:
            stalled_rounds = 0
            continue

        # No swap succeeded this round; stop instead of spinning forever when
        # the remaining voters can never receive a different code.
        stalled_rounds += 1
        if max_stalled_rounds is not None and stalled_rounds >= max_stalled_rounds:
            raise RuntimeError(
                f"Could only perturb {len(successful_perturb)} of {n_perturb} voters: "
                f"{stalled_rounds} consecutive rounds produced no change"
            )

    return voters

In [None]:
# Perturb the ZCTA of 20% of the voters; np.random is seeded with 123 inside the call.
voters1 = perturb_zipcodes_batch(
    voters,
    zctas,
    distances,
    dist,
    p,
    perturb_ratio=0.2,
    random_state=123,
)

In [None]:
# Write the perturbed voter table to CSV, leaving the DataFrame index out of the file.
voters1.to_csv(path_or_buf="data_zp_20.csv", index=False)