In [17]:
import pandas as pd
import numpy as np
import itertools

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import preprocessing

from sklearn.tree import DecisionTreeClassifier

from bayes_opt import BayesianOptimization

import matplotlib.pyplot as plt

from numpy.random import default_rng

from scipy.spatial import KDTree

import time

rng = default_rng()

In [18]:
def _cart_pmse(first_data, second_data):
    """Return the CART pMSE for distinguishing ``first_data`` from ``second_data``.

    The two inputs are stacked (rows of ``first_data`` labelled 0, rows of
    ``second_data`` labelled 1), standardized, and a decision tree is fit to
    predict the label. The result is the mean squared difference between the
    tree's in-sample predicted probability of label 1 and the share of rows
    labelled 1.
    """
    n_first = first_data.shape[0]
    n_second = second_data.shape[0]
    n_total = n_first + n_second

    # share of stacked rows that carry label 1
    c = n_second/n_total

    stacked = pd.concat([first_data, second_data], axis=0).reset_index(drop=True)
    stacked = preprocessing.StandardScaler().fit_transform(stacked)

    labels = np.repeat([0, 1], repeats=[n_first, n_second])

    cart = DecisionTreeClassifier(min_samples_split=20,
                                  min_samples_leaf=int(np.round(20/3)),
                                  ccp_alpha=0.0001,
                                  max_depth=30)
    cart.fit(X=stacked, y=labels)

    # probabilities are evaluated on the same rows the tree was fit on
    probs = cart.predict_proba(stacked)

    return 1/n_total * np.sum((probs[:, 1] - c)**2)


def cart_pmse_ratio(original_data, synthetic_data_sets):
    """Return one CART pMSE ratio per synthetic data set.

    The denominator is the mean CART pMSE over every unordered pair of
    synthetic data sets; the numerator for each synthetic data set is its
    CART pMSE against ``original_data``.

    Parameters
    ----------
    original_data : object accepted by ``pd.concat`` with a ``.shape``
        Presumably a DataFrame with the same columns as the synthetic data
        sets -- confirm against callers.
    synthetic_data_sets : sequence
        At least two data sets of the same kind as ``original_data``.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(synthetic_data_sets)``.

    Raises
    ------
    ValueError
        If fewer than two synthetic data sets are supplied, because no pair
        exists to estimate the denominator (the mean of an empty list would
        otherwise turn every ratio into NaN).
    """
    n_sets = len(synthetic_data_sets)

    if n_sets < 2:
        raise ValueError("cart_pmse_ratio needs at least two synthetic data sets "
                         "to estimate the null pMSE.")

    ### pMSE for every pair of synthetic data sets (the null reference)
    null_pmse_list = [_cart_pmse(synthetic_data_sets[a], synthetic_data_sets[b])
                      for a, b in itertools.combinations(range(n_sets), 2)]

    ### pMSE for the original data against each synthetic data set
    pmse_list = [_cart_pmse(original_data, s) for s in synthetic_data_sets]

    return np.array(pmse_list)/np.mean(null_pmse_list)

In [19]:
def pmse_ratio(original_data, synthetic_data):
    """Return the logistic-regression pMSE of two data sets divided by ``e_pMSE``.

    The data sets are stacked (``original_data`` rows labelled 0,
    ``synthetic_data`` rows labelled 1), expanded with all pairwise
    interaction terms, standardized, and an unpenalized logistic regression
    is fit to predict the label. The pMSE is the mean squared difference
    between the in-sample predicted probability of label 1 and the share of
    rows labelled 1.

    Parameters
    ----------
    original_data, synthetic_data : objects accepted by ``pd.concat`` with a ``.shape``
        Presumably numeric DataFrames with identical columns -- confirm
        against callers.

    Returns
    -------
    float
        ``pMSE / e_pMSE``.
    """
    N_synth = synthetic_data.shape[0]
    N_orig = original_data.shape[0]

    # combine original and synthetic datasets
    full_X = pd.concat([original_data, synthetic_data], axis=0).reset_index(drop=True)

    # add pairwise interaction terms (interaction_only=True, so no squared terms)
    poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)

    full_X = poly.fit_transform(full_X)

    # scale the combined dataset
    full_X = preprocessing.StandardScaler().fit_transform(full_X)

    # share of stacked rows that carry label 1
    c = N_synth/(N_synth+N_orig)

    y = np.repeat([0, 1], repeats=[N_orig, N_synth])

    # C=np.inf switches the penalty off. The former spelling penalty='none'
    # is no longer accepted by current scikit-learn releases, whereas an
    # infinite C gives the same unpenalized fit on old and new versions.
    pMSE_model = LogisticRegression(C=np.inf, max_iter=1000).fit(full_X, y)

    probs = pMSE_model.predict_proba(full_X)

    pMSE = 1/(N_synth+N_orig) * np.sum((probs[:,1] - c)**2)

    # reference value used to normalize the pMSE; full_X.shape[1] is the
    # number of predictors after the interaction expansion
    # NOTE(review): the leading factor of 2 is not derived in this notebook -- confirm against the reference being followed
    e_pMSE = 2*(full_X.shape[1])*(1-c)**2 * c/(N_synth+N_orig)

    return pMSE/e_pMSE

In [20]:
def privacy_metrics(train_data, holdout_data, synthetic_data, delta):
    """Compute nearest-neighbor based metrics of holdout and synthetic rows against training rows.

    All distances are Euclidean (``p=2``) and are measured to the rows of
    ``train_data`` unless stated otherwise.

    Parameters
    ----------
    train_data, holdout_data, synthetic_data : array-like of shape (n_rows, n_columns)
        ``train_data`` and ``holdout_data`` must expose ``.shape``.
        ``train_data`` should hold at least 5 rows, since 5 neighbors are
        requested.
    delta : float
        Distance threshold for the ``IMS_*`` entries.

    Returns
    -------
    dict
        ``IMS_holdout`` / ``IMS_synthetic``: share of rows whose nearest
        training row lies within ``delta``.
        ``DCR_holdout`` / ``DCR_synthetic``: 5th percentile of the distance
        to the nearest training row.
        ``NNDR_holdout`` / ``NNDR_synthetic``: 5th percentile of the ratio of
        the nearest to the 5th-nearest training distance.
        ``Share Closer Train``: share of synthetic rows strictly closer to a
        training row than to any holdout row, with ties credited in
        proportion to the training set's share of the train + holdout rows.
    """
    training_tree = KDTree(train_data)

    # distances to the 5 nearest training rows, nearest first
    holdout_dists, _ = training_tree.query(x=holdout_data, k=5, p=2)
    synthetic_dists, _ = training_tree.query(x=synthetic_data, k=5, p=2)

    holdout_nearest = holdout_dists[:, 0]
    synthetic_nearest = synthetic_dists[:, 0]

    IMS_holdout = np.mean(holdout_nearest <= delta)
    IMS_synthetic = np.mean(synthetic_nearest <= delta)

    DCR_holdout = np.percentile(holdout_nearest, q=5)
    DCR_synthetic = np.percentile(synthetic_nearest, q=5)

    # nearest distance relative to the farthest of the 5 retrieved neighbors
    NNDR_synthetic = np.percentile(synthetic_nearest/synthetic_dists[:, -1], q=5)
    NNDR_holdout = np.percentile(holdout_nearest/holdout_dists[:, -1], q=5)

    ### share calculation

    # distance from each synthetic row to its nearest holdout row
    holdout_tree = KDTree(holdout_data)
    hld_dists, _ = holdout_tree.query(x=synthetic_data, k=1, p=2)

    # the nearest-training distances are already available from the k=5
    # query above, so the training tree is not queried a second time
    trn_dists = synthetic_nearest

    n_train = train_data.shape[0]
    n_holdout = holdout_data.shape[0]
    train_share = n_train/(n_train + n_holdout)

    closer_syn = np.mean(trn_dists < hld_dists) + train_share * np.mean(trn_dists == hld_dists)

    return ({"IMS_holdout": IMS_holdout, "IMS_synthetic": IMS_synthetic,
             "DCR_holdout": DCR_holdout, "DCR_synthetic": DCR_synthetic,
             "NNDR_holdout": NNDR_holdout, "NNDR_synthetic": NNDR_synthetic,
             "Share Closer Train": closer_syn})

# Simulation 1: Privacy Metrics When Data Come From The Same Sampling Distribution

The simulation is structured the same way as in Snoke et al. (2018).

* 1000 simulations
* For each simulation
    * generate 10 data sets, each of size $N = 5000$
        * each data set is multivariate normal with $\mu = 0$ and $\sigma^2 = 1$, with all covariances equal to a common value taken from 0, 0.1, ..., 0.9.

In [21]:
def privacy_simulation(sample_size, covariance_values, nreps):
    """Run the privacy/utility metrics on repeated draws from one distribution.

    For every value in ``covariance_values`` a 10-dimensional normal
    distribution with zero mean, unit variances and that common covariance is
    used. Each of the ``nreps`` replications draws one training set, 10
    holdout sets and 10 synthetic sets of ``sample_size`` rows from it and
    stores the resulting metrics.

    Returns a dict keyed by ``str(covariance value)``; each entry is the list
    of per-replication metric dicts.
    """
    n_features = 10
    n_copies = 10
    mean_vector = np.zeros(n_features)

    def draw_sample(cov_matrix):
        # one data set of sample_size rows from the shared module-level generator
        return pd.DataFrame(rng.multivariate_normal(mean=mean_vector, cov=cov_matrix, size=sample_size))

    results_dict = {}

    for cov_value in covariance_values:

        # common covariance everywhere, unit variance on the diagonal
        cov_matrix = np.full((n_features, n_features), cov_value, dtype=float)
        np.fill_diagonal(cov_matrix, 1.0)

        replications = []
        results_dict[str(cov_value)] = replications

        for _ in range(nreps):

            training = draw_sample(cov_matrix)
            holdout = [draw_sample(cov_matrix) for _ in range(n_copies)]
            synthetic = [draw_sample(cov_matrix) for _ in range(n_copies)]

            metrics = privacy_metrics(training, holdout[0], synthetic[0], delta=1)

            # dict values are evaluated top to bottom, matching the key order
            metrics.update({
                'pmse_th': pmse_ratio(training, holdout[0]),
                'pmse_ts': pmse_ratio(training, synthetic[0]),
                'pmse_hs': pmse_ratio(holdout[0], synthetic[0]),
                'cart_pmse_th': cart_pmse_ratio(training, holdout),
                'cart_pmse_ts': cart_pmse_ratio(training, synthetic),
                'cart_pmse_hs': cart_pmse_ratio(holdout[0], synthetic),
            })

            replications.append(metrics)

    return results_dict

In [22]:
# Simulation 1 settings
reps = 100            # replications per covariance value
sample_size = 5000    # rows in every generated data set
cov_vals = np.linspace(0.0, 0.9, 10)  # ten evenly spaced covariance values from 0.0 to 0.9

In [23]:
# Run Simulation 1 and report how long it took, in minutes.
# tic/toc stay in the kernel namespace so the elapsed time can be printed again later.
tic = time.perf_counter()
privacy_results = privacy_simulation(sample_size=sample_size, covariance_values=cov_vals, nreps=reps)
toc = time.perf_counter()
print(f"Completed in {(toc - tic)/60:0.4f} minutes")

Completed in 515.9475 minutes


In [10]:
# NOTE(review): repeats the print from the timing cell; this cell's execution count and
# recorded output differ from that cell's, so it looks left over from an earlier run.
print(f"Completed in {(toc - tic)/60:0.4f} minutes")

Completed in 5.2154 minutes


In [24]:
print("Ratio of Mean Holdout IMS to Mean Synthetic IMS for Each Covariance Value")
# average each metric over the replications of a covariance value, then divide element-wise
(
    np.array([np.mean([rep['IMS_holdout'] for rep in privacy_results[str(cov)]]) for cov in cov_vals])
    / [np.mean([rep['IMS_synthetic'] for rep in privacy_results[str(cov)]]) for cov in cov_vals]
)

Ratio of Mean Holdout IMS to Mean Synthetic IMS for Each Covariance Value


array([1.00842498, 1.0002244 , 1.00879434, 0.99847384, 0.99611855,
       0.99706426, 0.99968814, 1.00174039, 1.00019728, 0.99968218])

In [25]:
print("Ratio of Mean Holdout DCR to Mean Synthetic DCR for Each Covariance Value")
# average each metric over the replications of a covariance value, then divide element-wise
(
    np.array([np.mean([rep['DCR_holdout'] for rep in privacy_results[str(cov)]]) for cov in cov_vals])
    / np.array([np.mean([rep['DCR_synthetic'] for rep in privacy_results[str(cov)]]) for cov in cov_vals])
)

Ratio of Mean Holdout DCR to Mean Synthetic DCR for Each Covariance Value


array([0.99812798, 1.00019294, 0.99901501, 1.00062875, 1.00066774,
       1.00060701, 0.99898692, 0.99849151, 0.99863952, 1.00131152])

In [26]:
# The label names NNDR on both sides: the cell divides mean NNDR_holdout by mean NNDR_synthetic.
print("Ratio of Mean Holdout NNDR to Mean Synthetic NNDR for Each Covariance Value")
np.array([np.mean([x['NNDR_holdout'] for x in privacy_results[str(i)]]) for i in cov_vals])/np.array([np.mean([x['NNDR_synthetic'] for x in privacy_results[str(i)]]) for i in cov_vals])

Ratio of Mean Holdout NNDR to Mean Synthetic DCR for Each Covariance Value


array([0.99866955, 1.00103444, 0.99861143, 0.99926913, 1.00109817,
       0.99959567, 0.99914713, 0.99860656, 0.99927854, 1.00118629])

In [27]:
print("Mean pMSE Ratio for Training vs. Holdout")
# one mean per covariance value, taken over its replications
[
    np.mean([rep['pmse_th'] for rep in privacy_results[str(cov)]])
    for cov in cov_vals
]

Mean pMSE Ratio for Training vs. Holdout


[1.021794951340699,
 0.9611643718916993,
 1.0020765906925302,
 0.9753561627020999,
 1.0250464073469683,
 1.001985670646797,
 1.0274314530830706,
 1.0143000820138628,
 1.0286703490698246,
 1.0088222923133487]

In [28]:
print("Mean pMSE Ratio for Training vs. Synthetic")
# one mean per covariance value, taken over its replications
[
    np.mean([rep['pmse_ts'] for rep in privacy_results[str(cov)]])
    for cov in cov_vals
]

Mean pMSE Ratio for Training vs. Synthetic


[0.9755757746995183,
 0.9742822937106407,
 1.0122815334095276,
 0.9946327227077195,
 1.0196226761542337,
 1.0044254635234113,
 0.9935973556040235,
 1.0037880074701937,
 0.992979016150962,
 0.9928120047801562]

In [29]:
print("Mean pMSE Ratio for Holdout vs. Synthetic")
# one mean per covariance value, taken over its replications
[
    np.mean([rep['pmse_hs'] for rep in privacy_results[str(cov)]])
    for cov in cov_vals
]

Mean pMSE Ratio for Holdout vs. Synthetic


[0.9669063778458379,
 1.0058026172031715,
 0.9783201986052351,
 1.0037883243117387,
 1.0088287594068903,
 0.9974754412246799,
 1.002802581781001,
 0.9602034251685626,
 1.0028266280425011,
 1.0000617806184948]

In [30]:
print("Mean CART pMSE Ratio for Training vs. Holdout")
# each replication stores an array of ratios; the mean pools all of them per covariance value
[
    np.mean([rep['cart_pmse_th'] for rep in privacy_results[str(cov)]])
    for cov in cov_vals
]

Mean CART pMSE Ratio for Training vs. Holdout


[0.9971905223517445,
 1.0005184458465375,
 1.007218424915615,
 1.0067930184103588,
 0.9975248182937658,
 1.0039396679180812,
 0.9992660199578481,
 0.9970495256828041,
 0.9941106167069336,
 1.0003347134865082]

In [31]:
print("Mean CART pMSE Ratio for Training vs. Synthetic")
# each replication stores an array of ratios; the mean pools all of them per covariance value
[
    np.mean([rep['cart_pmse_ts'] for rep in privacy_results[str(cov)]])
    for cov in cov_vals
]

Mean CART pMSE Ratio for Training vs. Synthetic


[1.0002019840638685,
 0.9971987520016865,
 1.0029586484866901,
 1.000903979174189,
 0.9991779188067469,
 1.0032604367054823,
 1.0038372089513192,
 1.0007655154507895,
 0.9829518038550807,
 0.9962852377475268]

In [32]:
print("Mean CART pMSE Ratio for Holdout vs. Synthetic")
# each replication stores an array of ratios; the mean pools all of them per covariance value
[
    np.mean([rep['cart_pmse_hs'] for rep in privacy_results[str(cov)]])
    for cov in cov_vals
]

Mean CART pMSE Ratio for Holdout vs. Synthetic


[0.9935539722683945,
 0.9940674448846606,
 0.9995756981138811,
 0.9979176368546354,
 1.011564346312473,
 1.0128866240759833,
 1.0096695339950932,
 0.9919765014969996,
 1.0000368666105877,
 1.0022959563714158]

***

### Simulation 2: Test Whether Bayesian Optimization Can Recover the Correct Parameter

Steps:

* sample data from normal distribution - call it train
* use Bayesian optimization to choose mean and covariance parameters, using the negative mean squared deviation of the pMSE ratios from 1 as the objective to maximize
* check whether bayesian optimization found the correct parameters, and what the resulting pmse ratio is

The first simulation has mean $(0, 0)$ and diagonal covariance matrix with variances of $(1, 1)$.

In [None]:
def synthesis_model(train_data, number_synthetic_datasets, m1_, m2_, v1_, v2_):
    """Draw synthetic data sets from a 2-D normal and score each against ``train_data``.

    The normal distribution has mean ``(m1_, m2_)`` and a diagonal covariance
    matrix with variances ``(v1_, v2_)``. ``number_synthetic_datasets`` data
    sets are drawn, each with as many rows as ``train_data``.

    Returns a list with one ``pmse_ratio(train_data, synthetic)`` value per
    synthetic data set.
    """
    # every synthetic data set matches the training data in row count
    n_rows = train_data.shape[0]

    mean_vec = np.array([m1_, m2_])
    # variances on the diagonal, zero covariance elsewhere
    covariance_mat = np.diag(np.array([v1_, v2_], dtype=float))

    # draw all synthetic data sets first, then score them
    synthetic_sets = []
    for _ in range(number_synthetic_datasets):
        draws = rng.multivariate_normal(mean=mean_vec, cov=covariance_mat, size=n_rows)
        synthetic_sets.append(pd.DataFrame(draws))

    ###### Calculate pMSE ratios ######
    return [pmse_ratio(train_data, synthetic) for synthetic in synthetic_sets]

In [None]:
def optimize_synthesis(train_data, number_synthetic_datasets):
    """Search for the synthesis parameters whose pMSE ratios are closest to 1.

    Bayesian optimization maximizes the negative mean squared deviation of the
    pMSE ratios from 1, where the ratios come from ``synthesis_model`` run
    with ``number_synthetic_datasets`` synthetic data sets per evaluation.

    Returns ``(optimizer.max, optimizer)``.
    """
    # search ranges for the two means and the two variances
    search_bounds = {
        "m1_": (-1, 1),
        "m2_": (-1, 1),
        "v1_": (0.9, 1.1),
        "v2_": (0.9, 1.1)
    }

    def evaluate_model(m1_, m2_, v1_, v2_):
        ratios = synthesis_model(train_data=train_data,
                                 number_synthetic_datasets=number_synthetic_datasets,
                                 m1_=m1_,
                                 m2_=m2_,
                                 v1_=v1_,
                                 v2_=v2_)

        squared_gaps = [(1 - ratio)**2 for ratio in ratios]

        # negated because the optimizer maximizes its target
        return -np.mean(squared_gaps)

    optimizer = BayesianOptimization(f=evaluate_model, pbounds=search_bounds)

    # NOTE(review): whether maximize() accepts acq/xi depends on the installed bayes_opt release -- confirm
    optimizer.maximize(acq='ei', xi=1e-2, n_iter=50)

    best = optimizer.max
    print("Final Result: ", best)
    return best, optimizer

In [None]:
# Simulation 2 settings
M = 20  # passed as number_synthetic_datasets to optimize_synthesis

# true generating distribution: zero mean, identity covariance
m = np.zeros(2)
diag = np.ones(2)
covmat = np.diag(diag)

In [None]:
# training sample for Simulation 2, drawn from the distribution defined by m and covmat
sample = pd.DataFrame(
    rng.multivariate_normal(mean=m, cov=covmat, size=sample_size)
)

In [None]:
# five independent optimization runs on the same training sample
optimization_results = [
    optimize_synthesis(train_data=sample, number_synthetic_datasets=M)
    for _ in range(5)
]

In [None]:
# running minimum of the negated target (the mean squared deviation from 1) for each run
run_targets = [
    np.minimum.accumulate(-optimizer.space.target)
    for _, optimizer in optimization_results
]

In [None]:
# One line plus markers per optimization run, drawn in run order.
for run_trace in run_targets:
    plt.plot(run_trace)
    plt.scatter(np.arange(len(run_trace)), run_trace, s=6)
plt.xlabel("Evaluation")
plt.ylabel("Lowest mean squared deviation of pMSE ratio from 1 so far")
plt.show()

In [None]:
# best result (target and params) from the run with the highest target; the first such run wins ties
best_params = optimization_results[
    np.argmax([best['target'] for best, _ in optimization_results])
][0]

In [None]:
# display the best target and parameters found across the optimization runs
best_params

In [None]:
# re-evaluate the selected parameters on 100 fresh synthetic data sets;
# best_params['params'] holds exactly the m1_, m2_, v1_, v2_ entries
ratios = synthesis_model(train_data=sample,
                         number_synthetic_datasets=100,
                         **best_params['params'])

In [None]:
# Disabled alternative: the same evaluation at the parameters used to generate `sample`
# (zero means, unit variances). Uncommenting it overwrites `ratios`.
# ratios = synthesis_model(train_data=sample,
#                          number_synthetic_datasets=100,
#                          m1_=0.0,
#                          m2_=0.0,
#                          v1_=1.0,
#                          v2_=1.0)

In [None]:
# average pMSE ratio over the synthetic data sets scored above
np.mean(ratios)