In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from scipy.spatial import KDTree

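**TODO:** We need to think carefully about which distance metric to use. The nearest-neighbour queries below currently use Euclidean distance (`p=2`) on the features exactly as loaded; the `StandardScaler` step is commented out.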

## Metric 1: IMS

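Identical Match Share (IMS): assess the proportion of records in the synthetic and holdout data sets that are identical to a record in the training data set. A record counts as identical when the distance to its nearest training record is at most a small threshold $\delta$.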

Import the holdout data.

In [2]:
# Load the holdout split (val_dataset.csv) from the synthetic_holdouts folder.
holdout = pd.read_csv("Data/synthetic_datasets/synthetic_holdouts/val_dataset.csv")

In [3]:
# Cast age to a categorical dtype ahead of the one-hot encoding in the next cell.
holdout["age"] = holdout["age"].astype('category')

In [4]:
# One-hot encode age. drop_first=True omits the column of the first age category,
# so that category is represented by all indicator columns being zero.
age_dummies = pd.get_dummies(holdout['age'], drop_first=True)

In [5]:
# Use string labels for the indicator columns.
age_dummies.columns = list(map(str, age_dummies.columns))

In [6]:
# Remove the raw age column; its indicator columns are attached in the next cell.
holdout = holdout.drop(columns="age")

In [7]:
# Attach the age indicator columns to the right of the remaining holdout features.
holdout = pd.concat((holdout, age_dummies), axis="columns")

In [8]:
# Display the encoded holdout frame as a visual check of the resulting columns.
holdout

Unnamed: 0,latitude,longitude,sex,state,1,2,3,4,5,6,7,8,9
0,0.040702,-0.529636,0,0,0,0,0,1,0,0,0,0,0
1,0.686755,-0.572415,0,0,0,0,1,0,0,0,0,0,0
2,-1.982437,-0.170191,0,0,0,1,0,0,0,0,0,0,0
3,0.724957,-0.828070,1,0,0,1,0,0,0,0,0,0,0
4,0.642475,0.303828,1,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3351,0.775717,-1.004152,1,0,0,0,1,0,0,0,0,0,0
3352,0.039923,-0.591297,0,0,0,1,0,0,0,0,0,0,0
3353,0.034703,-1.312438,1,0,0,0,0,0,1,0,0,0,0
3354,-1.774746,-0.800768,1,0,0,0,0,1,0,0,0,0,0


Import the training data.

In [9]:
# Load the training split (train_dataset.csv) from the synthetic_holdouts folder.
training = pd.read_csv("Data/synthetic_datasets/synthetic_holdouts/train_dataset.csv")

In [10]:
# Cast age to a categorical dtype ahead of the one-hot encoding in the next cell.
training["age"] = training["age"].astype('category')

In [11]:
# One-hot encode age. drop_first=True omits the column of the first age category,
# so that category is represented by all indicator columns being zero.
# Note: this rebinds the `age_dummies` name used for the holdout data above.
age_dummies = pd.get_dummies(training['age'], drop_first=True)

In [12]:
# Use string labels for the indicator columns.
age_dummies.columns = list(map(str, age_dummies.columns))

In [13]:
# Remove the raw age column; its indicator columns are attached in the next cell.
training = training.drop(columns="age")

In [14]:
# Attach the age indicator columns to the right of the remaining training features.
training = pd.concat((training, age_dummies), axis="columns")

In [15]:
# Display the encoded training frame as a visual check of the resulting columns.
training

Unnamed: 0,latitude,longitude,sex,state,1,2,3,4,5,6,7,8,9
0,0.067996,-0.538542,0,0,0,0,0,0,0,1,0,0,0
1,0.811432,-0.566841,0,0,0,1,0,0,0,0,0,0,0
2,-0.111521,0.898761,1,0,1,0,0,0,0,0,0,0,0
3,-1.787171,-0.810709,1,0,0,0,1,0,0,0,0,0,0
4,-2.216792,0.125990,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3351,0.025973,-0.706528,0,0,0,0,0,1,0,0,0,0,0
3352,0.258134,-0.581117,1,0,0,0,0,0,1,0,0,0,0
3353,0.642475,0.303828,1,0,0,0,0,0,0,0,1,0,0
3354,-1.733209,-0.922612,0,0,0,0,0,1,0,0,0,0,0


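Pick an arbitrary $\delta$ value. A record is counted as an identical match when the distance to its nearest training record is at most $\delta$.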

In [16]:
# Distance threshold handed to the privacy-metric functions below as `delta`.
delta = 0.001

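Function to calculate the privacy metrics for given training, holdout, and synthetic data and a given $\delta$ value: the identical match share (IMS), the distance to closest record (DCR), the nearest-neighbour distance ratio (NNDR), and the share of synthetic records that lie closer to the training data than to the holdout data.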

In [17]:
def _encode_synthetic_like_train(train_data, synthetic_data):
    """One-hot encode ``age`` in ``synthetic_data`` and align the result to ``train_data``.

    Returns a new DataFrame whose columns are exactly ``train_data.columns`` in
    the same order; ``synthetic_data`` itself is left untouched.

    Raises:
        ValueError: if ``synthetic_data`` has a non-age column that ``train_data``
            lacks, or more than one age level without a matching training column.
    """
    age = synthetic_data["age"].astype("category")

    # Encode *every* observed level. With drop_first=True the dropped level is
    # whichever one is smallest in this particular dataset, which is the wrong
    # one whenever the level omitted from the training encoding is absent here.
    age_dummies = pd.get_dummies(age)
    age_dummies.columns = [str(level) for level in age_dummies.columns]

    features = synthetic_data.drop("age", axis=1)

    unexpected = [col for col in features.columns if col not in train_data.columns]
    if unexpected:
        raise ValueError(
            "synthetic_data has columns that train_data lacks: " + repr(unexpected)
        )

    # At most one level (the one a drop_first training encoding omitted) may
    # lack a training column; anything beyond that is a level the training
    # data never contained and cannot be represented in its feature space.
    unseen_levels = [col for col in age_dummies.columns if col not in train_data.columns]
    if len(unseen_levels) > 1:
        raise ValueError(
            "synthetic_data has age levels without a training column: " + repr(unseen_levels)
        )

    encoded = pd.concat([features, age_dummies], axis=1)

    # KDTree compares rows positionally, so the columns must match the training
    # frame in both name and order. reindex drops the omitted baseline level,
    # zero-fills training columns that are absent here, and fixes the order.
    return encoded.reindex(columns=train_data.columns, fill_value=0)


def _as_float_matrix(frame):
    """Return ``frame`` as a C-contiguous float64 array (indicator columns become 0.0/1.0)."""
    return np.ascontiguousarray(frame, dtype=np.float64)


def _nearest_to_farthest_ratio(dists):
    """Per-row ratio of the first to the last neighbour distance in ``dists``.

    A 0/0 ratio (the last returned neighbour is also at distance zero) is
    mapped to 1.0 instead of NaN so that percentiles stay well defined.
    """
    with np.errstate(divide="ignore", invalid="ignore"):
        ratios = dists[:, 0] / dists[:, -1]
    return np.nan_to_num(ratios, nan=1.0)


def privacy_metrics(train_data, holdout_data, synthetic_data, delta):
    """Compute distance-based privacy metrics of holdout and synthetic data against training data.

    All distances are Euclidean (``p=2``) and measured on the features as given;
    no scaling is applied here.

    Args:
        train_data: DataFrame of already encoded training features.
        holdout_data: DataFrame of already encoded holdout features; it is passed
            to the KD-tree as is, so its columns must line up with ``train_data``.
        synthetic_data: DataFrame with a raw ``age`` column. It is one-hot encoded
            and aligned to ``train_data.columns`` internally and is not modified.
        delta: distance threshold at or below which the nearest training record
            counts as an identical match.

    Returns:
        dict with the keys
        ``IMS_holdout`` / ``IMS_synthetic``: share of records whose nearest
            training record is within ``delta``;
        ``DCR_holdout`` / ``DCR_synthetic``: 5th percentile of the distance to
            the nearest training record;
        ``NNDR_holdout`` / ``NNDR_synthetic``: 5th percentile of the ratio of the
            nearest to the 5th-nearest training distance (0/0 counted as 1.0);
        ``Share Closer Train``: share of synthetic records strictly closer to the
            training data than to the holdout data, plus the share of ties
            weighted by the training fraction of the combined row count.

    Raises:
        ValueError: if the synthetic columns cannot be aligned with ``train_data``.
    """
    train_matrix = _as_float_matrix(train_data)
    holdout_matrix = _as_float_matrix(holdout_data)
    synthetic_matrix = _as_float_matrix(
        _encode_synthetic_like_train(train_data, synthetic_data)
    )

    training_tree = KDTree(train_matrix)
    holdout_tree = KDTree(holdout_matrix)

    # Five nearest training neighbours of every holdout / synthetic record.
    holdout_dists, _ = training_tree.query(x=holdout_matrix, k=5, p=2)
    synthetic_dists, _ = training_tree.query(x=synthetic_matrix, k=5, p=2)

    IMS_holdout = np.mean(holdout_dists[:, 0] <= delta)
    IMS_synthetic = np.mean(synthetic_dists[:, 0] <= delta)

    DCR_holdout = np.percentile(holdout_dists[:, 0], q=5)
    DCR_synthetic = np.percentile(synthetic_dists[:, 0], q=5)

    # Both ratio vectors get the same 0/0 handling; otherwise a synthetic record
    # that coincides with five training records would turn the percentile into NaN.
    NNDR_holdout = np.percentile(_nearest_to_farthest_ratio(holdout_dists), q=5)
    NNDR_synthetic = np.percentile(_nearest_to_farthest_ratio(synthetic_dists), q=5)

    # Share of synthetic records closer to training than to holdout. The nearest
    # training distance is already the first column of the k=5 query above.
    trn_dists = synthetic_dists[:, 0]
    hld_dists, _ = holdout_tree.query(x=synthetic_matrix, k=1, p=2)

    n_train = train_data.shape[0]
    n_holdout = holdout_data.shape[0]
    tie_weight = n_train / (n_train + n_holdout)
    closer_syn = np.mean(trn_dists < hld_dists) + tie_weight * np.mean(trn_dists == hld_dists)

    return ({"IMS_holdout": IMS_holdout, "IMS_synthetic": IMS_synthetic,
             "DCR_holdout": DCR_holdout, "DCR_synthetic": DCR_synthetic,
             "NNDR_holdout": NNDR_holdout, "NNDR_synthetic": NNDR_synthetic,
             "Share Closer Train": closer_syn})

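Calculate the privacy metrics (IMS, DCR, NNDR, and the share closer to training) for all synthetic data sets from MNL, CART, and MOSTLY.AI, averaged over the 20 synthetic data sets of each method.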

In [18]:
# Load the 20 synthetic data sets named synthetic_dataset_<i>.csv.
sXs_mnl = [
    pd.read_csv("Data/synthetic_datasets/synthetic_holdouts/synthetic_dataset_" + str(idx) + ".csv")
    for idx in range(20)
]

In [19]:
# Load the 20 synthetic data sets named cart_synthetic_dataset_<i>.csv.
sXs_cart = [
    pd.read_csv("Data/synthetic_datasets/synthetic_holdouts/cart_synthetic_dataset_" + str(idx) + ".csv")
    for idx in range(20)
]

In [20]:
# Load the 20 synthetic data sets named mostly_synthetic_train_<i>.csv.
sXs_mostly = [
    pd.read_csv("Data/synthetic_datasets/synthetic_holdouts/mostly_synthetic_train_" + str(idx) + ".csv")
    for idx in range(20)
]

In [21]:
def mean_privacy_diffs(training_data, holdout_data, synthetic_datasets, delta):
    """Average the synthetic-minus-holdout privacy gaps over several synthetic data sets.

    ``privacy_metrics`` is evaluated once per element of ``synthetic_datasets``
    with the same training data, holdout data and ``delta``.

    Returns:
        dict with the mean IMS, DCR and NNDR differences (synthetic value minus
        holdout value) and the mean "Share Closer Train" across the data sets.
    """
    ims_gaps = []
    dcr_gaps = []
    nndr_gaps = []
    closer_shares = []

    for synthetic in synthetic_datasets:
        metrics = privacy_metrics(training_data, holdout_data, synthetic, delta=delta)
        ims_gaps.append(metrics['IMS_synthetic'] - metrics['IMS_holdout'])
        dcr_gaps.append(metrics['DCR_synthetic'] - metrics['DCR_holdout'])
        nndr_gaps.append(metrics['NNDR_synthetic'] - metrics['NNDR_holdout'])
        closer_shares.append(metrics["Share Closer Train"])

    return {"IMS Diff (Synthetic - Holdout)": np.mean(ims_gaps),
            "DCR Diff (Synthetic - Holdout)": np.mean(dcr_gaps),
            "NNDR Diff (Synthetic - Holdout)": np.mean(nndr_gaps),
            "Share Closer Train": np.mean(closer_shares)}

In [22]:
# One list of synthetic data sets per method, in the order MNL, CART, MOSTLY.AI.
temp = [sXs_mnl, sXs_cart, sXs_mostly]

# One summary dict per method, in the same order as `temp`.
[
    mean_privacy_diffs(training, holdout, synthetic_sets, delta)
    for synthetic_sets in temp
]

  ratios_holdout = holdout_dists[:,0]/holdout_dists[:,-1]


[{'IMS Diff (Synthetic - Holdout)': -0.478888557806913,
  'DCR Diff (Synthetic - Holdout)': 0.0009968435336302319,
  'NNDR Diff (Synthetic - Holdout)': 0.023793366779018374,
  'Share Closer Train': 0.5210667461263407},
 {'IMS Diff (Synthetic - Holdout)': -0.46114421930870086,
  'DCR Diff (Synthetic - Holdout)': 0.0008343192662079984,
  'NNDR Diff (Synthetic - Holdout)': 0.02218945478815688,
  'Share Closer Train': 0.5399657330154948},
 {'IMS Diff (Synthetic - Holdout)': -0.39926996424314665,
  'DCR Diff (Synthetic - Holdout)': 8.740438287301148e-09,
  'NNDR Diff (Synthetic - Holdout)': 0.024656473517947985,
  'Share Closer Train': 0.5296930870083432}]