# Code for Assessing Privacy Risk Using the Attack from Ponte et al. (2024)

https://github.com/GilianPonte/whereswaldoIJRM

In [64]:
import pandas as pd
import numpy as np
import csv
import os
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
%matplotlib inline

# Add the parent directory to path
sys.path.append('..')

# Then import
from helper_functions import *

Import a subset of the Criteo dataset.

## Note

If you are synthesizing variables out of the order in which they appear in the data, you need to re-order the initial training data to match that order.

## End Note

In [65]:
train_data = pd.read_csv("../../Data/Criteo/cleaned_criteo_small.gz",
                         compression='gzip', 
                         sep='\,',
                         header=0,
                         engine='python')

Drop duplicates and reset index.

In [66]:
train_data = pd.DataFrame.drop_duplicates(train_data)
# reset dataframe index
train_data.reset_index(drop=True, inplace=True)
train_data

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,treatment,conversion,visit,exposure
0,2.954807,2.368327,2.178833,1.480282,2.330251,20.349663,0.000205,2.100938,48.158740,2.579463,1.667778,0.84478,1,0,0,0
1,3.100481,2.308533,2.105887,107.757316,2.330251,61.279994,0.275765,1.575636,53.083067,2.579463,1.667778,0.84478,1,0,0,0
2,2.945821,2.308533,2.139045,9.914112,2.330251,61.279994,0.003786,1.575636,46.910867,3.486706,1.667778,0.84478,0,0,0,0
3,3.177230,2.308533,2.105887,107.757316,2.330251,61.279994,0.089715,1.575636,53.083067,2.579463,1.667778,0.84478,0,0,0,0
4,3.274234,2.308533,2.105887,107.757316,2.330251,61.279994,0.000003,1.575636,53.083067,2.579463,1.667778,0.84478,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99904,3.037003,2.308533,2.184994,107.757316,2.330251,61.279994,0.275765,1.575636,47.003884,2.579463,1.667778,0.84478,1,0,0,0
99905,3.286020,2.308533,2.105887,107.757316,2.330251,61.279994,0.275765,1.575636,53.083067,2.579463,1.667778,0.84478,1,0,0,0
99906,3.285632,2.308533,2.105887,107.757316,2.330251,61.279994,0.275765,1.575636,53.083067,2.579463,1.667778,0.84478,1,0,0,0
99907,2.534995,2.308533,2.108605,107.757316,2.330251,61.279994,1.342378,1.575636,43.484412,3.572938,1.667778,0.84478,1,0,0,0


## Privacy Attack Simulation

The steps are as follows:

- Split the data three ways:
    - Marketer training data
    - Adversary training data
    - Outside data
- Train Marketer and Adversary synthesis models
- Compute predictions for distribution membership for outside data and compute empirical epsilon
- Repeat the above steps many times (100 iterations in Ponte et al. 2024)

Function to split into train and test sets while ensuring that the training data has an even number of rows.

In [67]:
def train_test_split_even(X, train_size, random_state=None):
    # Split the data normally
    X_train, X_test = train_test_split(
        X, train_size=train_size, random_state=random_state
    )
    
    # If train set has odd number of rows
    if len(X_train) % 2 != 0:
        # Move the last row from train to test
        X_test = pd.concat([X_test, X_train[-1:]], axis=0)
        X_train = X_train[:-1]
    
    return X_train, X_test

Parameters for synthesis models.

In [68]:
# synthesis steps
# written as a list of tuples (features, model)
synthesis_steps = [
    (["f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11"], 'gmm'),
    ('treatment', 'multinomial'),
    ('exposure', 'multinomial'),
    ('visit', 'multinomial'),
    ('conversion', 'multinomial')
]

In [69]:
# parameter bounds
param_bounds = {
    'gmm': {
        'num_components': (10, 50.99),
    },
    'multinomial': {
        'treatment': {'C': (0.001, 3)},
        'exposure': {'C': (0.001, 3)},
        'visit': {'C': (0.001, 3)},
        'conversion': {'C': (0.001, 3)}
    }
}

In [70]:
number_synthetic_datasets = 10
num_iter_optimization = 5
num_init_optimization = 2
random_states = [1006, 428]
poly_degree_mnl = 2
poly_degree_pmse = 2
interaction_only = True
gmm_n_init = 3
covariance_type = "diag"

#### Set Everything Up in a Loop

In [71]:
num_obs = [3000, 30000]
num_simulations = 2
epsilons = {}
seed = 42

Order training data variables based on synthesis order.

In [72]:
var_names = []
for item in synthesis_steps:
    name = item[0]
    if type(name) == list:
        for i in name:
            var_names.append(i)
    else:
        var_names.append(name)


In [73]:
train_data = train_data[var_names]
train_data

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,treatment,exposure,visit,conversion
0,2.954807,2.368327,2.178833,1.480282,2.330251,20.349663,0.000205,2.100938,48.158740,2.579463,1.667778,0.84478,1,0,0,0
1,3.100481,2.308533,2.105887,107.757316,2.330251,61.279994,0.275765,1.575636,53.083067,2.579463,1.667778,0.84478,1,0,0,0
2,2.945821,2.308533,2.139045,9.914112,2.330251,61.279994,0.003786,1.575636,46.910867,3.486706,1.667778,0.84478,0,0,0,0
3,3.177230,2.308533,2.105887,107.757316,2.330251,61.279994,0.089715,1.575636,53.083067,2.579463,1.667778,0.84478,0,0,0,0
4,3.274234,2.308533,2.105887,107.757316,2.330251,61.279994,0.000003,1.575636,53.083067,2.579463,1.667778,0.84478,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99904,3.037003,2.308533,2.184994,107.757316,2.330251,61.279994,0.275765,1.575636,47.003884,2.579463,1.667778,0.84478,1,0,0,0
99905,3.286020,2.308533,2.105887,107.757316,2.330251,61.279994,0.275765,1.575636,53.083067,2.579463,1.667778,0.84478,1,0,0,0
99906,3.285632,2.308533,2.105887,107.757316,2.330251,61.279994,0.275765,1.575636,53.083067,2.579463,1.667778,0.84478,1,0,0,0
99907,2.534995,2.308533,2.108605,107.757316,2.330251,61.279994,1.342378,1.575636,43.484412,3.572938,1.667778,0.84478,1,0,0,0


In [74]:
for n in num_obs:

    current_data_sample, _ = train_test_split(train_data, train_size = n, stratify = train_data['conversion'], random_state=seed)

    epsilons[n] = []

    for i in range(num_simulations):

        # split data
        internal_data, external_data = train_test_split_even(current_data_sample, train_size=0.67, random_state=seed+i)
        marketer_train, adversary_train = train_test_split(internal_data, train_size=0.5)

        # marketer synthesis
        marketer_optimization_results = [optimize_models(train_data=marketer_train, 
                                                         number_synthetic_datasets=number_synthetic_datasets,
                                                         synthesis_steps=synthesis_steps,
                                                         param_bounds=param_bounds,
                                                         poly_degree_mnl=poly_degree_mnl,
                                                         poly_degree_pmse=poly_degree_pmse,
                                                         interaction_only=interaction_only,
                                                         covariance_type=covariance_type,
                                                         gmm_n_init=gmm_n_init,
                                                         random_state=r,
                                                         num_iter_optimization=num_iter_optimization,
                                                         num_init_optimization=num_init_optimization) for r in random_states]

        # adversary synthesis
        adversary_optimization_results = [optimize_models(train_data=adversary_train, 
                                                          number_synthetic_datasets=number_synthetic_datasets,
                                                          synthesis_steps=synthesis_steps,
                                                          param_bounds=param_bounds,
                                                          poly_degree_mnl=poly_degree_mnl,
                                                          poly_degree_pmse=poly_degree_pmse,
                                                          interaction_only=interaction_only,
                                                          covariance_type=covariance_type,
                                                          gmm_n_init=gmm_n_init,
                                                          random_state=r,
                                                          num_iter_optimization=num_iter_optimization,
                                                          num_init_optimization=num_init_optimization) for r in random_states]

        # store best params
        best_marketer_params = marketer_optimization_results[np.argmin([x['best_score'] for x in marketer_optimization_results])]['best_params']
        best_adversary_params = adversary_optimization_results[np.argmin([x['best_score'] for x in adversary_optimization_results])]['best_params']

        # train and generate with best params
        marketer_ratios, marketer_sXs = perform_synthesis(train_data=marketer_train,
                                                          number_synthetic_datasets=2,
                                                          poly_degree_mnl=poly_degree_mnl,
                                                          poly_degree_pmse=poly_degree_pmse,
                                                          interaction_only=interaction_only,
                                                          covariance_type=covariance_type,
                                                          gmm_n_init=gmm_n_init,
                                                          synthesis_steps=synthesis_steps,
                                                          param_values=best_marketer_params)

        adversary_ratios, adversary_sXs = perform_synthesis(train_data=adversary_train,
                                                            number_synthetic_datasets=2,
                                                            poly_degree_mnl=poly_degree_mnl,
                                                            poly_degree_pmse=poly_degree_pmse,
                                                            interaction_only=interaction_only,
                                                            covariance_type=covariance_type,
                                                            gmm_n_init=gmm_n_init,
                                                            synthesis_steps=synthesis_steps,
                                                            param_values=best_adversary_params)

        marketer_synthetic = marketer_sXs[0]
        adversary_synthetic = adversary_sXs[0]

        ### below code borrowed from Ponte et al. (2024)
        # to prevent a naive model
        N = len(marketer_train)/10

        # step 1, 2 from paper
        params = {"bandwidth": np.logspace(-1, 1, 20)} # vary the bandwith
        grid_marketer = GridSearchCV(KernelDensity(), params, n_jobs = -1) # cross validate for bandwiths
        grid_marketer.fit(marketer_train) # estimate pdf from train data.
        marketer_kde = grid_marketer.best_estimator_ # get best estimator

        grid_adversary = GridSearchCV(KernelDensity(), params, n_jobs = -1) # cross validate (CV)
        grid_adversary.fit(adversary_train) # estimate pdf from adversary data
        kde_adversary = grid_adversary.best_estimator_ # get best estimator from CV

        density_marketer = marketer_kde.score_samples(marketer_train) # score train examples from train on pdf_train
        density_adversary = kde_adversary.score_samples(adversary_train) # score train examples from train on pdf_adversary
        TPR = sum(density_marketer > density_adversary)/len(density_marketer) # calculate TPR

        density_marketer_new = marketer_kde.score_samples(external_data) # score eval_outside examples on train density
        density_adversary_new = kde_adversary.score_samples(external_data) # score eval_outside examples on adversary density
        FPR = sum(density_marketer_new > density_adversary_new)/len(density_marketer_new) # calculate FPR
        TNR = 1 - FPR
        FNR = 1 - TPR
        print(TPR)
        print(FPR)
        print(TNR)
        print(FNR)
        epsilons[n].append(max(math.log((1 - (1/N) - FPR)/FNR), math.log((1 - (1/N) - FNR)/FPR))) # append resulting epsilon to epsilons

    
    

|   iter    |  target   |    x0     |    x1     |    x2     |    x3     |    x4     |
-------------------------------------------------------------------------------------




| [39m1        [39m | [39m-0.461810[39m | [39m40.460790[39m | [39m0.6800465[39m | [39m1.9279589[39m | [39m2.6471840[39m | [39m2.2223556[39m |




| [35m2        [39m | [35m-0.416391[39m | [35m39.564593[39m | [35m0.8867481[39m | [35m1.5770236[39m | [35m0.5221198[39m | [35m2.4360656[39m |




| [39m3        [39m | [39m-0.664780[39m | [39m18.280412[39m | [39m2.2337832[39m | [39m2.1843859[39m | [39m2.4964602[39m | [39m0.0150037[39m |




| [39m4        [39m | [39m-3.298231[39m | [39m36.168546[39m | [39m3.0      [39m | [39m0.001    [39m | [39m0.001    [39m | [39m3.0      [39m |




| [39m5        [39m | [39m-0.416961[39m | [39m40.882834[39m | [39m0.0861443[39m | [39m2.1851386[39m | [39m0.8359251[39m | [39m2.2135232[39m |




| [39m6        [39m | [39m-0.417994[39m | [39m41.450407[39m | [39m2.4949743[39m | [39m2.8953588[39m | [39m0.8006439[39m | [39m2.5048494[39m |




| [39m7        [39m | [39m-3.094496[39m | [39m41.569941[39m | [39m1.6702406[39m | [39m1.1999937[39m | [39m0.6806913[39m | [39m0.001    [39m |
|   iter    |  target   |    x0     |    x1     |    x2     |    x3     |    x4     |
-------------------------------------------------------------------------------------




| [39m1        [39m | [39m-0.434675[39m | [39m43.547203[39m | [39m2.6233683[39m | [39m1.2559684[39m | [39m1.3362145[39m | [39m0.3865150[39m |




| [35m2        [39m | [35m-0.361587[39m | [35m14.301984[39m | [35m1.9387171[39m | [35m2.8864276[39m | [35m0.8729716[39m | [35m0.2011064[39m |




| [39m3        [39m | [39m-0.371820[39m | [39m13.950533[39m | [39m1.2166881[39m | [39m2.8086649[39m | [39m1.2480261[39m | [39m0.3766943[39m |




| [39m4        [39m | [39m-6.844688[39m | [39m16.883936[39m | [39m3.0      [39m | [39m3.0      [39m | [39m0.001    [39m | [39m0.001    [39m |




| [35m5        [39m | [35m-0.306220[39m | [35m12.893120[39m | [35m2.2569126[39m | [35m2.9212475[39m | [35m0.9504924[39m | [35m0.0798258[39m |




| [39m6        [39m | [39m-0.378320[39m | [39m13.520999[39m | [39m2.5296670[39m | [39m2.0820140[39m | [39m2.5448549[39m | [39m0.0325252[39m |




| [39m7        [39m | [39m-0.395038[39m | [39m13.211647[39m | [39m2.5803746[39m | [39m2.8456334[39m | [39m1.7438144[39m | [39m2.0325069[39m |
|   iter    |  target   |    x0     |    x1     |    x2     |    x3     |    x4     |
-------------------------------------------------------------------------------------




| [39m1        [39m | [39m-0.527244[39m | [39m40.460790[39m | [39m0.6800465[39m | [39m1.9279589[39m | [39m2.6471840[39m | [39m2.2223556[39m |




| [35m2        [39m | [35m-0.500731[39m | [35m39.564593[39m | [35m0.8867481[39m | [35m1.5770236[39m | [35m0.5221198[39m | [35m2.4360656[39m |




| [39m3        [39m | [39m-1.431779[39m | [39m18.280412[39m | [39m2.2337832[39m | [39m2.1843859[39m | [39m2.4964602[39m | [39m0.0150037[39m |




| [39m4        [39m | [39m-7.992303[39m | [39m49.340460[39m | [39m3.0      [39m | [39m0.001    [39m | [39m0.001    [39m | [39m3.0      [39m |




| [35m5        [39m | [35m-0.247066[39m | [35m36.150448[39m | [35m0.001    [39m | [35m2.6215126[39m | [35m3.0      [39m | [35m1.9222983[39m |




| [39m6        [39m | [39m-9.222095[39m | [39m36.430552[39m | [39m3.0      [39m | [39m0.001    [39m | [39m3.0      [39m | [39m0.001    [39m |




| [39m7        [39m | [39m-0.254677[39m | [39m38.270420[39m | [39m0.001    [39m | [39m2.6196930[39m | [39m2.1492751[39m | [39m2.5242594[39m |
|   iter    |  target   |    x0     |    x1     |    x2     |    x3     |    x4     |
-------------------------------------------------------------------------------------




| [39m1        [39m | [39m-0.543440[39m | [39m43.547203[39m | [39m2.6233683[39m | [39m1.2559684[39m | [39m1.3362145[39m | [39m0.3865150[39m |




| [35m2        [39m | [35m-0.324128[39m | [35m14.301984[39m | [35m1.9387171[39m | [35m2.8864276[39m | [35m0.8729716[39m | [35m0.2011064[39m |




| [39m3        [39m | [39m-0.358730[39m | [39m13.659154[39m | [39m1.6473253[39m | [39m2.9530245[39m | [39m0.5092848[39m | [39m0.3474625[39m |




| [39m4        [39m | [39m-8.208214[39m | [39m15.988636[39m | [39m2.7032482[39m | [39m2.7116799[39m | [39m1.8271544[39m | [39m0.001    [39m |




| [39m5        [39m | [39m-8.857220[39m | [39m13.371688[39m | [39m2.1100751[39m | [39m2.5232050[39m | [39m1.1943961[39m | [39m0.001    [39m |




| [35m6        [39m | [35m-0.314753[39m | [35m14.011160[39m | [35m1.7869248[39m | [35m2.9320281[39m | [35m0.6798983[39m | [35m0.2811850[39m |




| [35m7        [39m | [35m-0.295112[39m | [35m14.240482[39m | [35m1.5324770[39m | [35m3.0      [39m | [35m0.2973299[39m | [35m0.4724777[39m |




1.0
0.9717171717171718
0.028282828282828243
0.0
|   iter    |  target   |    x0     |    x1     |    x2     |    x3     |    x4     |
-------------------------------------------------------------------------------------


  epsilons[n].append(max(math.log((1 - (1/N) - FPR)/FNR), math.log((1 - (1/N) - FNR)/FPR))) # append resulting epsilon to epsilons


| [39m1        [39m | [39m-0.489067[39m | [39m40.460790[39m | [39m0.6800465[39m | [39m1.9279589[39m | [39m2.6471840[39m | [39m2.2223556[39m |




| [35m2        [39m | [35m-0.447320[39m | [35m39.564593[39m | [35m0.8867481[39m | [35m1.5770236[39m | [35m0.5221198[39m | [35m2.4360656[39m |




| [35m3        [39m | [35m-0.107506[39m | [35m18.280412[39m | [35m2.2337832[39m | [35m2.1843859[39m | [35m2.4964602[39m | [35m0.0150037[39m |




| [39m4        [39m | [39m-4.833452[39m | [39m12.496881[39m | [39m2.8864831[39m | [39m1.6226023[39m | [39m0.001    [39m | [39m0.0015321[39m |




| [35m5        [39m | [35m-0.023438[39m | [35m20.750070[39m | [35m1.9552015[39m | [35m2.4239386[39m | [35m3.0      [39m | [35m0.0210531[39m |




| [39m6        [39m | [39m-0.187706[39m | [39m20.387098[39m | [39m2.9595762[39m | [39m0.0458898[39m | [39m3.0      [39m | [39m2.6832280[39m |




| [35m7        [39m | [35m-0.015142[39m | [35m19.761863[39m | [35m0.001    [39m | [35m3.0      [39m | [35m3.0      [39m | [35m3.0      [39m |
|   iter    |  target   |    x0     |    x1     |    x2     |    x3     |    x4     |
-------------------------------------------------------------------------------------




| [39m1        [39m | [39m-0.477617[39m | [39m43.547203[39m | [39m2.6233683[39m | [39m1.2559684[39m | [39m1.3362145[39m | [39m0.3865150[39m |




| [35m2        [39m | [35m-0.403814[39m | [35m14.301984[39m | [35m1.9387171[39m | [35m2.8864276[39m | [35m0.8729716[39m | [35m0.2011064[39m |




| [39m3        [39m | [39m-0.415934[39m | [39m14.576631[39m | [39m2.4015930[39m | [39m2.7445892[39m | [39m1.2150419[39m | [39m0.8277403[39m |




| [39m4        [39m | [39m-5.251675[39m | [39m12.998812[39m | [39m0.001    [39m | [39m3.0      [39m | [39m0.001    [39m | [39m0.001    [39m |




| [39m5        [39m | [39m-5.621304[39m | [39m15.053697[39m | [39m2.8120203[39m | [39m3.0      [39m | [39m0.6659973[39m | [39m0.001    [39m |




| [39m6        [39m | [39m-0.411882[39m | [39m14.414030[39m | [39m2.1426417[39m | [39m2.8093288[39m | [39m1.0563456[39m | [39m0.5301617[39m |




| [39m7        [39m | [39m-0.419429[39m | [39m14.146041[39m | [39m1.9174392[39m | [39m2.6716584[39m | [39m1.3803220[39m | [39m0.9373500[39m |
|   iter    |  target   |    x0     |    x1     |    x2     |    x3     |    x4     |
-------------------------------------------------------------------------------------




| [39m1        [39m | [39m-0.496943[39m | [39m40.460790[39m | [39m0.6800465[39m | [39m1.9279589[39m | [39m2.6471840[39m | [39m2.2223556[39m |




| [35m2        [39m | [35m-0.466565[39m | [35m39.564593[39m | [35m0.8867481[39m | [35m1.5770236[39m | [35m0.5221198[39m | [35m2.4360656[39m |




| [39m3        [39m | [39m-2.422162[39m | [39m18.280412[39m | [39m2.2337832[39m | [39m2.1843859[39m | [39m2.4964602[39m | [39m0.0150037[39m |




| [39m4        [39m | [39m-8.177820[39m | [39m49.664804[39m | [39m3.0      [39m | [39m0.001    [39m | [39m0.001    [39m | [39m3.0      [39m |




| [35m5        [39m | [35m-0.108427[39m | [35m36.384931[39m | [35m0.001    [39m | [35m2.5633439[39m | [35m2.9379915[39m | [35m1.9777314[39m |




| [39m6        [39m | [39m-10.40217[39m | [39m36.339380[39m | [39m3.0      [39m | [39m0.001    [39m | [39m3.0      [39m | [39m0.001    [39m |




| [35m7        [39m | [35m-0.107044[39m | [35m38.319010[39m | [35m0.001    [39m | [35m2.5831314[39m | [35m2.1455698[39m | [35m2.5301709[39m |
|   iter    |  target   |    x0     |    x1     |    x2     |    x3     |    x4     |
-------------------------------------------------------------------------------------




| [39m1        [39m | [39m-0.472405[39m | [39m43.547203[39m | [39m2.6233683[39m | [39m1.2559684[39m | [39m1.3362145[39m | [39m0.3865150[39m |




| [35m2        [39m | [35m-0.407071[39m | [35m14.301984[39m | [35m1.9387171[39m | [35m2.8864276[39m | [35m0.8729716[39m | [35m0.2011064[39m |




| [39m3        [39m | [39m-0.416717[39m | [39m14.508662[39m | [39m2.3239814[39m | [39m2.2189669[39m | [39m1.3261403[39m | [39m0.1102159[39m |




| [39m4        [39m | [39m-7.581450[39m | [39m12.529369[39m | [39m0.001    [39m | [39m3.0      [39m | [39m0.001    [39m | [39m0.9993679[39m |




| [39m5        [39m | [39m-9.851913[39m | [39m15.380592[39m | [39m2.7687468[39m | [39m3.0      [39m | [39m0.6335389[39m | [39m0.001    [39m |




| [39m6        [39m | [39m-0.409699[39m | [39m14.320369[39m | [39m2.0741595[39m | [39m2.5188655[39m | [39m1.1364714[39m | [39m0.1695087[39m |




| [39m7        [39m | [39m-0.416634[39m | [39m14.048792[39m | [39m2.0893390[39m | [39m1.8893428[39m | [39m1.6912911[39m | [39m0.1487642[39m |




0.0
0.07777777777777778
0.9222222222222223
1.0


ValueError: math domain error

In [23]:
np.max(epsilons)

0.34385109043336326