# Code for Assessing Privacy Risk Using the Attack from Ponte et al. (2024)

https://github.com/GilianPonte/whereswaldoIJRM

In [1]:
import pandas as pd
import numpy as np
import csv
import os
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
%matplotlib inline

# Add the parent directory to path
sys.path.append('..')

# Then import
from helper_functions import *

Import the oversampled subset of the Criteo dataset.

In [2]:
train_data = pd.read_csv("../../Data/Criteo/cleaned_criteo_os.gz",
                         compression='gzip', 
                         sep='\,',
                         header=0,
                         engine='python')

Drop duplicates and reset index.

In [3]:
train_data = pd.DataFrame.drop_duplicates(train_data)
# reset dataframe index
train_data.reset_index(drop=True, inplace=True)
train_data

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,treatment,conversion,visit,exposure
0,12.616365,10.059654,8.928801,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,0,0,0,0
1,12.616365,10.059654,9.038744,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,0,0,0,0
2,12.616365,10.059654,8.322806,4.679882,10.280525,4.115453,0.294443,4.833815,3.906514,25.240993,5.300375,-0.168679,1,0,0,0
3,25.385197,10.059654,8.214383,4.679882,10.280525,4.115453,-3.993764,4.833815,3.971858,13.190056,5.300375,-0.168679,1,0,0,0
4,22.293259,10.059654,8.214383,4.679882,10.280525,4.115453,-3.993764,4.833815,3.971858,13.190056,5.300375,-0.168679,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447157,13.680284,10.059654,8.325934,-0.600592,11.029584,1.128518,-13.045950,10.885556,3.758296,44.784329,5.844038,-0.267350,1,1,1,1
447158,14.251906,13.579750,8.303577,-2.272900,12.594889,-4.636110,-19.328059,5.621479,3.755250,42.018683,6.141586,-0.168679,1,1,1,1
447159,20.711370,10.059654,8.290111,4.679882,10.280525,4.115453,-6.359690,4.833815,3.813849,26.606156,5.300375,-0.168679,1,1,1,1
447160,23.767207,10.059654,8.283185,4.679882,10.280525,4.115453,-3.282109,4.833815,3.767224,46.714867,5.300375,-0.168679,1,1,1,0


After removing duplicates, we still have approximately 10% observations with conversion = 1.

In [4]:
conversion_counts = np.unique(train_data.conversion, return_counts = True)
conversion_counts/np.sum(conversion_counts)

array([[0.00000000e+00, 2.23632098e-06],
       [9.08814012e-01, 9.11837518e-02]])

## Privacy Attack Simulation

The steps are as follows:

- Split the data three ways:
    - Marketer training data
    - Adversary training data
    - Outside data
- Train Marketer and Adversary synthesis models
- Compute predictions for distribution membership for outside data and compute empirical epsilon
- Repeat the above steps many times (100 iterations in Ponte et al. 2024)

Function to split into train and test sets while ensuring that the training data has an even number of rows.

In [5]:
def train_test_split_even(X, train_size, strat_var, random_state=None):
    
    # Split the data normally
    # stratify based on strat var, if it exists
    if strat_var:
        X_train, X_test = train_test_split(
            X, train_size=train_size, stratify=X[strat_var], random_state=random_state)
    else:
        X_train, X_test = train_test_split(
            X, train_size=train_size, random_state=random_state)
    
    # If train set has odd number of rows
    if len(X_train) % 2 != 0:
        # Move the last row from train to test
        X_test = pd.concat([X_test, X_train[-1:]], axis=0)
        X_train = X_train[:-1]
    
    return X_train, X_test

Function to evaluate utility of synthetic data based on the mean-absolute percentage error (MAPE), mean-absolute error, and mean-squared error between regression coefficients estimated on the real and synthetic data.

In [None]:
# define utility
def utility(real_data, protected_data):
    
    # import error metrics
    from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

    # estimate logistic regression coefficients for real_data
    logit_params_original = logit_params(X = real_data.drop('conversion', axis=1), y = real_data['conversion'])

    # estimate logistic regression coefficients for protected_data
    logit_params_protected = logit_params(X = protected_data.drop('conversion', axis=1), y = protected_data['conversion'])

    # compute error metrics
    MAPE = mean_absolute_percentage_error(logit_params_original, logit_params_protected)*100
    MAE = mean_absolute_error(logit_params_original, logit_params_protected)
    MSE = mean_squared_error(logit_params_original, logit_params_protected)
    return MAPE, MAE, MSE

Parameters for synthesis models.

In [7]:
# define some default bounds for leaf values. These will be appropriately filled in during the loop below

param_bounds = {
    'tree': {
        'f0': {  # Applies to all tree-synthesized variables
            'min_samples_leaf': [10, 1000]  # [min, max] bounds
        },
        'f1': {  # Applies to all tree-synthesized variables
            'min_samples_leaf': [5, 5]  # [min, max] bounds
        },
        'f2': {  # Applies to all tree-synthesized variables
            'min_samples_leaf': [5, 5]  # [min, max] bounds
        },
        'f3': {  # Applies to all tree-synthesized variables
            'min_samples_leaf': [10, 1000]  # [min, max] bounds
        },
        'f4': {  # Applies to all tree-synthesized variables
            'min_samples_leaf': [5, 5]  # [min, max] bounds
        },
        'f5': {  # Applies to all tree-synthesized variables
            'min_samples_leaf': [10, 1000]  # [min, max] bounds
        },
        'f6': {  # Applies to all tree-synthesized variables
            'min_samples_leaf': [10, 1000]  # [min, max] bounds
        },
        'f7': {  # Applies to all tree-synthesized variables
            'min_samples_leaf': [5, 5]  # [min, max] bounds
        },
        'f8': {  # Applies to all tree-synthesized variables
            'min_samples_leaf': [4063, 10000]  # [min, max] bounds
        },
        'f9': {  # Applies to all tree-synthesized variables
            'min_samples_leaf': [5, 5]  # [min, max] bounds
        },
        'f10': {  # Applies to all tree-synthesized variables
            'min_samples_leaf': [5, 5]  # [min, max] bounds
        },
        'f11': {  # Applies to all tree-synthesized variables
            'min_samples_leaf': [10, 1000]  # [min, max] bounds
        }
    }
}

In [8]:
number_synthetic_datasets = 10
num_iter_optimization = 25
num_init_optimization = 5
random_states = [1006, 428]

#### Set Everything Up in a Loop

In [9]:
# num_obs = [300, 3000, 30000]
# using 3X the desired training data size, which gets split into thirds (marketer_train, adversary_train, external_data)
num_obs = [300, 400]
# num_simulations = 100
num_simulations = 5
epsilons = {}
seed = 42

## Note

If you are synthesizing variables out of the order in which they appear in the data, you need to re-order the initial training data to match that order. This is done in the loop below already.

## End Note

Parallelized Version of attack (will run simulations in parallel to speed up processing). Still loops over values of N.

In [10]:
from joblib import Parallel, delayed

def process_single_simulation(n, i, current_data_sample, strat_var, seed, random_states, **params):
    # Unpack all parameters from params
    (number_synthetic_datasets, param_bounds, num_iter_optimization, num_init_optimization) = params.values()
    
    # Split data into training data and external data (which won't be included in marketer or adversary training data)
    internal_data, external_data = train_test_split_even(
        current_data_sample, train_size=2/3, strat_var=strat_var, random_state=seed+i)

    # define training sets for the marketer and the adversary
    marketer_train, adversary_train = train_test_split(internal_data, train_size=0.5, stratify=internal_data[strat_var])

    #### Define synthesis inputs for marketer ####

    # define order of synthesis and the bounds of synthesis for the marketer
    marketer_cols, marketer_steps, marketer_bounds = define_synthesis_steps(marketer_train, param_bounds)

    # reorder the columns in the training data to match the synthesis order
    marketer_train = marketer_train[marketer_cols]

    # define the target variable for the user model
    # we use the same target variable as stratification variable
    # define the exogenous variables for the user model
    marketer_exog_variables = list(marketer_train.drop(strat_var, axis=1).columns)

    # parameter values from the training data
    marketer_target_params = logit_params(X = marketer_train[marketer_exog_variables], y = marketer_train[strat_var])

    #### Define synthesis inputs for adversary ####
    
    # define order of synthesis and the bounds of synthesis for the marketer
    adversary_cols, adversary_steps, adversary_bounds = define_synthesis_steps(adversary_train, param_bounds)

    # reorder the columns in the training data to match the synthesis order
    adversary_train = adversary_train[adversary_cols]

    # define the exogenous variables for the user model
    adversary_exog_variables = list(adversary_train.drop(strat_var, axis=1).columns)

    # parameter values from the training data
    adversary_target_params = logit_params(X = adversary_train[adversary_exog_variables], y = adversary_train[strat_var])

    N = len(marketer_train)/10
    
    def optimize_models_wrapper(data_to_synthesize, steps_to_follow, bounds_to_use, params_to_target, x_variables, random_states):
        return [
            optimize_models_with_param_target(train_data=data_to_synthesize,
                                              number_synthetic_datasets=number_synthetic_datasets,
                                              synthesis_steps=steps_to_follow,
                                              param_bounds=bounds_to_use,
                                              random_state=r,
                                              target_params=params_to_target,
                                              target_variable=strat_var,
                                              exog_variables=x_variables,
                                              n_iter=num_iter_optimization,
                                              n_init=num_init_optimization) for r in random_states
        ]
    
    # Parallelize model optimization
    marketer_results = optimize_models_wrapper(marketer_train, marketer_steps, marketer_bounds, marketer_target_params, marketer_exog_variables, random_states)
    adversary_results = optimize_models_wrapper(adversary_train, adversary_steps, adversary_bounds, adversary_target_params, adversary_exog_variables, random_states)
    
    # store best params
    best_marketer_params = marketer_results[np.argmin([x['best_score'] for x in marketer_results])]['best_params']
    best_adversary_params = adversary_results[np.argmin([x['best_score'] for x in adversary_results])]['best_params']
    
    # Rest of the function remains the same...
    # train and generate with best params
    _, marketer_sXs = perform_synthesis_with_param_target(
        train_data=marketer_train,
        number_synthetic_datasets=2,
        synthesis_steps=marketer_steps,
        target_params=marketer_target_params,
        target_variable=strat_var,
        exog_variables=marketer_exog_variables,
        param_values=best_marketer_params)

    _, adversary_sXs = perform_synthesis_with_param_target(
        train_data=adversary_train,
        number_synthetic_datasets=2,
        synthesis_steps=adversary_steps,
        target_params=adversary_target_params,
        target_variable=strat_var,
        exog_variables=adversary_exog_variables,
        param_values=best_adversary_params)

    marketer_synthetic = marketer_sXs[0]
    adversary_synthetic = adversary_sXs[0]

    # for consistent evaluation below, ensure that column orderings are the same in all data sets
    # we haven't touched the external data yet, so we know it preserves the original column order
    marketer_train = marketer_train[external_data.columns]
    adversary_train = adversary_train[external_data.columns]
    marketer_synthetic = marketer_synthetic[external_data.columns]
    adversary_synthetic = adversary_synthetic[external_data.columns]

    # evaluate utility of logistic regression coefficients
    # the columns have been consistently reordered so coefficient orders will match
    marketer_mape, marketer_mae, marketer_mse = utility(marketer_train, marketer_synthetic)
    adversary_mape, adversary_mae, adversary_mse = utility(adversary_train, adversary_synthetic)

    # average utility measures
    MAPE = (marketer_mape + adversary_mape)/2
    MAE = (marketer_mae + adversary_mae)/2
    MSE = (marketer_mse + adversary_mse)/2

    ### below code borrowed from Ponte et al. (2024)

    # step 1, 2 from paper
    bw_params = {"bandwidth": np.logspace(-1, 1, 20)} # vary the bandwith
    grid_marketer = GridSearchCV(KernelDensity(), bw_params, n_jobs = 1) # cross validate for bandwiths
    grid_marketer.fit(marketer_synthetic) # estimate pdf from train data.
    marketer_kde = grid_marketer.best_estimator_ # get best estimator

    grid_adversary = GridSearchCV(KernelDensity(), bw_params, n_jobs = 1) # cross validate (CV)
    grid_adversary.fit(adversary_synthetic) # estimate pdf from adversary data
    adversary_kde = grid_adversary.best_estimator_ # get best estimator from CV

    density_marketer = marketer_kde.score_samples(marketer_train) # score train examples from train on pdf_train
    density_adversary = adversary_kde.score_samples(marketer_train) # score train examples from train on pdf_adversary
    TPR = sum(density_marketer > density_adversary)/len(density_marketer) # calculate TPR

    density_marketer_new = marketer_kde.score_samples(external_data) # score eval_outside examples on train density
    density_adversary_new = adversary_kde.score_samples(external_data) # score eval_outside examples on adversary density
    FPR = sum(density_marketer_new > density_adversary_new)/len(density_marketer_new) # calculate FPR
    TNR = 1 - FPR
    FNR = 1 - TPR
    
    risk_vals = [(1 - (1/N) - FPR)/FNR, (1 - (1/N) - FNR)/FPR]
    
    return {'epsilon': math.log(risk_vals[np.argmax(risk_vals)]), 'MAPE': MAPE, 'MAE': MAE, 'MSE': MSE}

In [11]:
# Main loop
def process_n(n, strat_var):

    # create current_data_sample by splitting the original data, stratified by 'conversion'
    current_data_sample, _ = train_test_split(
        train_data, 
        train_size=n, 
        stratify=train_data[strat_var], 
        random_state=seed
    )

    # Prepare parameters
    params = {
        'number_synthetic_datasets': number_synthetic_datasets,
        'param_bounds': param_bounds,
        'num_iter_optimization': num_iter_optimization,
        'num_init_optimization': num_init_optimization
    }
    
    # Process simulations in parallel
    # using -5 to leave a few cores free to use computer while code is running
    results = Parallel(n_jobs=-5, verbose=10)(
        delayed(process_single_simulation)(
            n, i, current_data_sample, strat_var, seed, random_states, **params
        ) for i in range(num_simulations)
    )
    
    return n, results

Run attack simulation. Save 100 empirical epsilon results for each value of n.

In [12]:
results = [process_n(n, strat_var='conversion') for n in num_obs]

[Parallel(n_jobs=-5)]: Using backend LokyBackend with 28 concurrent workers.
[Parallel(n_jobs=-5)]: Done   2 out of   5 | elapsed:   47.0s remaining:  1.2min
[Parallel(n_jobs=-5)]: Done   3 out of   5 | elapsed:  1.0min remaining:   40.8s
[Parallel(n_jobs=-5)]: Done   5 out of   5 | elapsed:  1.0min remaining:    0.0s


UnboundLocalError: cannot access local variable 'logit_params' where it is not associated with a value

In [31]:
# Convert results to the original format
epsilons = {n: results[i][1] for i, n in enumerate(num_obs)}

In [32]:
epsilon_results = pd.DataFrame.from_dict(epsilons)
epsilon_results.to_csv('empirical_epsilon_results.csv', index=False)

In [33]:
epsilon_results.max(axis=0)

900      0.252496
9000     0.129666
90000    0.061733
dtype: float64