# Code for Assessing Privacy Risk Using the Attack from Ponte et al. (2024)

https://github.com/GilianPonte/whereswaldoIJRM

In [None]:
import pandas as pd
import numpy as np
import csv
import os
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
%matplotlib inline

# Add the parent directory to path
sys.path.append('..')

# Then import
from helper_functions import *

Import a subset of the Criteo dataset.

In [None]:
# Load the gzip-compressed Criteo subset; the first row holds the column names.
# The separator is written as a raw string: '\,' in a normal string literal is
# an invalid escape sequence. The pattern is unchanged, so pandas still treats
# it as a regular expression, which is why the python engine is requested.
train_data = pd.read_csv(
    "../../Data/Criteo/cleaned_criteo_small.gz",
    compression='gzip',
    sep=r'\,',
    header=0,
    engine='python',
)

Drop duplicates and reset index.

In [None]:
# Remove repeated rows, then renumber the index from zero (old index discarded).
train_data = train_data.drop_duplicates()
train_data = train_data.reset_index(drop=True)
train_data

## Privacy Attack Simulation

The steps are as follows:

- Split the data three ways:
    - Marketer training data
    - Adversary training data
    - Outside data
- Train Marketer and Adversary synthesis models
- Compute predictions for distribution membership for outside data and compute empirical epsilon
- Repeat the above steps many times (100 iterations in Ponte et al. 2024)

Function to split into train and test sets while ensuring that the training data has an even number of rows.

In [None]:
def train_test_split_even(X, train_size, random_state=None):
    """Split X into train/test parts, keeping the train part even-sized.

    X is split with ``train_test_split`` using the given ``train_size`` and
    ``random_state``. When the train part has an odd number of rows, its last
    row is appended to the test part instead.

    Returns the pair ``(X_train, X_test)``.
    """
    kept, held_out = train_test_split(
        X, train_size=train_size, random_state=random_state
    )

    # Nothing to adjust when the row count is already even.
    if len(kept) % 2 == 0:
        return kept, held_out

    # Hand the final training row over to the test part.
    surplus_row = kept[-1:]
    held_out = pd.concat([held_out, surplus_row], axis=0)
    return kept[:-1], held_out

Parameters for synthesis models.

In [None]:
# Ordered synthesis plan: a list of (features, model) tuples.
# The twelve columns f0..f11 are synthesized together by the 'gmm' step;
# each remaining column gets its own 'multinomial' step.
synthesis_steps = [([f"f{k}" for k in range(12)], 'gmm')] + [
    (column, 'multinomial')
    for column in ('treatment', 'exposure', 'visit', 'conversion')
]

In [None]:
# Search bounds per model type, given as (lower, upper) tuples.
# Every multinomial target shares the same bounds for 'C'.
param_bounds = {
    'gmm': {'num_components': (10, 50.99)},
    'multinomial': {
        target: {'C': (0.001, 3)}
        for target in ('treatment', 'exposure', 'visit', 'conversion')
    },
}

In [None]:
# --- Hyper-parameter search settings (passed to optimize_models) ---
# One optimization run is made per entry in random_states.
random_states = [1006, 428]
num_init_optimization, num_iter_optimization = 5, 25
number_synthetic_datasets = 10

# --- Model settings (passed to both optimize_models and perform_synthesis) ---
poly_degree_mnl = poly_degree_pmse = 2
interaction_only = True
covariance_type = "diag"
gmm_n_init = 3

#### Set Everything Up in a Loop

In [None]:
# Base seed for the data splits.
seed = 42
# Sample sizes to evaluate: 300, 3000 and 30000 rows.
num_obs = [300 * 10 ** power for power in range(3)]
# Number of attack simulations run for each sample size.
num_simulations = 100
# Filled later with one entry of simulation results per sample size.
epsilons = dict()

## Note

If you are synthesizing variables out of the order in which they appear in the data, you need to re-order the initial training data to match that order.

## End Note

Order training data variables based on synthesis order.

In [None]:
# Flatten the synthesis plan into one ordered list of column names.
# The first element of each step is either a single column name or a list of
# column names; lists are expanded in place so the order follows the plan.
var_names = []
for step in synthesis_steps:
    features = step[0]
    if isinstance(features, list):
        var_names.extend(features)
    else:
        var_names.append(features)


In [None]:
# Keep only the synthesized columns, arranged in synthesis order.
train_data = train_data.loc[:, var_names]
train_data

Parallelized version of the attack: the simulations for a given sample size run in parallel to speed up processing, while the sample sizes in `num_obs` are still processed one after another.

In [None]:
from joblib import Parallel, delayed

def process_single_simulation(n, i, current_data_sample, seed, random_states, **params):
    """Run one attack simulation and return its empirical epsilon.

    The sample is split into internal data (67%, trimmed to an even number of
    rows) and external data, and the internal data is halved into marketer and
    adversary training sets. For each half, synthesis hyper-parameters are
    tuned with ``optimize_models`` and synthetic data is generated with
    ``perform_synthesis``. A kernel density estimate is fitted to each
    synthetic data set, and the two densities are compared on the marketer
    training rows (giving TPR) and on the external rows (giving FPR).

    Parameters
    ----------
    n
        Not used inside this function; kept so existing callers keep working.
    i : int
        Simulation index. ``seed + i`` seeds both data splits.
    current_data_sample
        Data to split for this simulation.
    seed : int
        Base seed.
    random_states
        Seeds for the tuning runs; ``optimize_models`` is called once per
        entry and the run with the lowest ``'best_score'`` supplies the
        parameters used for synthesis.
    **params
        Settings, looked up by name: ``number_synthetic_datasets``,
        ``synthesis_steps``, ``param_bounds``, ``poly_degree_mnl``,
        ``poly_degree_pmse``, ``interaction_only``, ``covariance_type``,
        ``gmm_n_init``, ``num_iter_optimization``, ``num_init_optimization``.

    Returns
    -------
    float
        Natural log of the larger of the two risk ratios.

    Raises
    ------
    KeyError
        If a required setting is missing from ``params``.
    ValueError
        Raised by ``math.log`` when the larger risk ratio is zero or negative.
    """
    # Imported here because this file has no top-level `import math`.
    import math

    # Read settings by key so the result does not depend on keyword order.
    tuning_settings = {
        key: params[key]
        for key in (
            'number_synthetic_datasets',
            'param_bounds',
            'num_iter_optimization',
            'num_init_optimization',
        )
    }
    shared_settings = {
        key: params[key]
        for key in (
            'synthesis_steps',
            'poly_degree_mnl',
            'poly_degree_pmse',
            'interaction_only',
            'covariance_type',
            'gmm_n_init',
        )
    }

    # Split data. Both splits are seeded so a simulation can be reproduced.
    internal_data, external_data = train_test_split_even(
        current_data_sample, train_size=0.67, random_state=seed + i
    )
    marketer_train, adversary_train = train_test_split(
        internal_data, train_size=0.5, random_state=seed + i
    )

    # to prevent a naive model (Ponte et al. 2024)
    N = len(marketer_train) / 10

    def tune(data_to_synthesize):
        """Tune once per seed and return the parameters of the best run."""
        runs = [
            optimize_models(
                train_data=data_to_synthesize,
                random_state=r,
                **tuning_settings,
                **shared_settings,
            )
            for r in random_states
        ]
        best_run = runs[np.argmin([run['best_score'] for run in runs])]
        return best_run['best_params']

    def synthesize(data_to_synthesize, best_params):
        """Generate synthetic data with the tuned parameters; keep the first set."""
        _, synthetic_sets = perform_synthesis(
            train_data=data_to_synthesize,
            number_synthetic_datasets=2,
            param_values=best_params,
            **shared_settings,
        )
        return synthetic_sets[0]

    # Candidate bandwidths for the kernel density estimates.
    bw_params = {"bandwidth": np.logspace(-1, 1, 20)}

    def fit_kde(synthetic_data):
        """Cross-validate the bandwidth and return the best density estimator."""
        search = GridSearchCV(KernelDensity(), bw_params, n_jobs=1)
        search.fit(synthetic_data)
        return search.best_estimator_

    # Tune both models first, then synthesize (same order as before).
    best_marketer_params = tune(marketer_train)
    best_adversary_params = tune(adversary_train)

    marketer_synthetic = synthesize(marketer_train, best_marketer_params)
    adversary_synthetic = synthesize(adversary_train, best_adversary_params)

    ### below code borrowed from Ponte et al. (2024)

    # step 1, 2 from paper: estimate a density for each synthetic data set
    marketer_kde = fit_kde(marketer_synthetic)
    adversary_kde = fit_kde(adversary_synthetic)

    # Marketer training rows: share scored higher by the marketer density.
    density_marketer = marketer_kde.score_samples(marketer_train)
    density_adversary = adversary_kde.score_samples(marketer_train)
    TPR = sum(density_marketer > density_adversary) / len(density_marketer)

    # External rows: share scored higher by the marketer density.
    density_marketer_new = marketer_kde.score_samples(external_data)
    density_adversary_new = adversary_kde.score_samples(external_data)
    FPR = sum(density_marketer_new > density_adversary_new) / len(density_marketer_new)
    FNR = 1 - TPR

    # NOTE(review): an FNR or FPR of exactly 0 is not handled here, as in the
    # original attack code - confirm whether that case needs special treatment.
    risk_vals = [(1 - (1/N) - FPR)/FNR, (1 - (1/N) - FNR)/FPR]

    return math.log(risk_vals[np.argmax(risk_vals)])

In [None]:
# Main loop
def process_n(n):
    """Run every attack simulation for one sample size.

    Draws a sample of ``n`` rows from ``train_data``, stratified on the
    'conversion' column, and dispatches ``num_simulations`` calls of
    ``process_single_simulation`` on that sample through joblib.

    Returns the pair ``(n, results)``, where ``results`` holds one entry per
    simulation.
    """
    # Only the first part of the split is used; the remainder is discarded.
    current_data_sample = train_test_split(
        train_data,
        train_size=n,
        stratify=train_data['conversion'],
        random_state=seed,
    )[0]

    # Settings forwarded to each simulation as keyword arguments.
    params = dict(
        number_synthetic_datasets=number_synthetic_datasets,
        synthesis_steps=synthesis_steps,
        param_bounds=param_bounds,
        poly_degree_mnl=poly_degree_mnl,
        poly_degree_pmse=poly_degree_pmse,
        interaction_only=interaction_only,
        covariance_type=covariance_type,
        gmm_n_init=gmm_n_init,
        num_iter_optimization=num_iter_optimization,
        num_init_optimization=num_init_optimization,
    )

    # One task per simulation index, run on all available cores.
    runner = Parallel(n_jobs=-1, verbose=10)
    tasks = (
        delayed(process_single_simulation)(
            n, i, current_data_sample, seed, random_states, **params
        )
        for i in range(num_simulations)
    )
    simulation_results = runner(tasks)

    return n, simulation_results

Run attack simulation. Save 100 empirical epsilon results for each value of n.

In [None]:
# One (n, simulation results) pair per sample size, in num_obs order.
results = list(map(process_n, num_obs))

In [None]:
# Convert results to the original format: sample size -> list of simulation results
epsilons = {n: outcome[1] for n, outcome in zip(num_obs, results)}

In [None]:
# One column per sample size, one row per simulation; written to disk as CSV.
epsilon_results = pd.DataFrame(epsilons)
epsilon_results.to_csv('empirical_epsilon_results.csv')

In [None]:
# NOTE: sequential (non-parallel) version of the attack loop, kept commented out for reference.
# import warnings

# with warnings.catch_warnings():
#     warnings.simplefilter('ignore')

#     for n in num_obs:

#         current_data_sample, _ = train_test_split(train_data, train_size = n, stratify = train_data['conversion'], random_state=seed)

#         epsilons[n] = []

#         for i in range(num_simulations):

#             # split data
#             internal_data, external_data = train_test_split_even(current_data_sample, train_size=0.67, random_state=seed+i)
#             marketer_train, adversary_train = train_test_split(internal_data, train_size=0.5)

#             # marketer synthesis
#             marketer_optimization_results = [optimize_models(train_data=marketer_train, 
#                                                              number_synthetic_datasets=number_synthetic_datasets,
#                                                              synthesis_steps=synthesis_steps,
#                                                              param_bounds=param_bounds,
#                                                              poly_degree_mnl=poly_degree_mnl,
#                                                              poly_degree_pmse=poly_degree_pmse,
#                                                              interaction_only=interaction_only,
#                                                              covariance_type=covariance_type,
#                                                              gmm_n_init=gmm_n_init,
#                                                              random_state=r,
#                                                              num_iter_optimization=num_iter_optimization,
#                                                              num_init_optimization=num_init_optimization) for r in random_states]

#             # adversary synthesis
#             adversary_optimization_results = [optimize_models(train_data=adversary_train, 
#                                                               number_synthetic_datasets=number_synthetic_datasets,
#                                                               synthesis_steps=synthesis_steps,
#                                                               param_bounds=param_bounds,
#                                                               poly_degree_mnl=poly_degree_mnl,
#                                                               poly_degree_pmse=poly_degree_pmse,
#                                                               interaction_only=interaction_only,
#                                                               covariance_type=covariance_type,
#                                                               gmm_n_init=gmm_n_init,
#                                                               random_state=r,
#                                                               num_iter_optimization=num_iter_optimization,
#                                                               num_init_optimization=num_init_optimization) for r in random_states]

#             # store best params
#             best_marketer_params = marketer_optimization_results[np.argmin([x['best_score'] for x in marketer_optimization_results])]['best_params']
#             best_adversary_params = adversary_optimization_results[np.argmin([x['best_score'] for x in adversary_optimization_results])]['best_params']

#             # train and generate with best params
#             marketer_ratios, marketer_sXs = perform_synthesis(train_data=marketer_train,
#                                                               number_synthetic_datasets=2,
#                                                               poly_degree_mnl=poly_degree_mnl,
#                                                               poly_degree_pmse=poly_degree_pmse,
#                                                               interaction_only=interaction_only,
#                                                               covariance_type=covariance_type,
#                                                               gmm_n_init=gmm_n_init,
#                                                               synthesis_steps=synthesis_steps,
#                                                               param_values=best_marketer_params)

#             adversary_ratios, adversary_sXs = perform_synthesis(train_data=adversary_train,
#                                                                 number_synthetic_datasets=2,
#                                                                 poly_degree_mnl=poly_degree_mnl,
#                                                                 poly_degree_pmse=poly_degree_pmse,
#                                                                 interaction_only=interaction_only,
#                                                                 covariance_type=covariance_type,
#                                                                 gmm_n_init=gmm_n_init,
#                                                                 synthesis_steps=synthesis_steps,
#                                                                 param_values=best_adversary_params)

#             marketer_synthetic = marketer_sXs[0]
#             adversary_synthetic = adversary_sXs[0]

#             ### below code borrowed from Ponte et al. (2024)
#             # to prevent a naive model
#             N = len(marketer_train)/10

#             # step 1, 2 from paper
#             params = {"bandwidth": np.logspace(-1, 1, 20)} # vary the bandwith
#             grid_marketer = GridSearchCV(KernelDensity(), params, n_jobs = -1) # cross validate for bandwiths
#             grid_marketer.fit(marketer_synthetic) # estimate pdf from train data.
#             marketer_kde = grid_marketer.best_estimator_ # get best estimator

#             grid_adversary = GridSearchCV(KernelDensity(), params, n_jobs = -1) # cross validate (CV)
#             grid_adversary.fit(adversary_synthetic) # estimate pdf from adversary data
#             adversary_kde = grid_adversary.best_estimator_ # get best estimator from CV

#             density_marketer = marketer_kde.score_samples(marketer_train) # score train examples from train on pdf_train
#             density_adversary = adversary_kde.score_samples(marketer_train) # score train examples from train on pdf_adversary
#             TPR = sum(density_marketer > density_adversary)/len(density_marketer) # calculate TPR

#             density_marketer_new = marketer_kde.score_samples(external_data) # score eval_outside examples on train density
#             density_adversary_new = adversary_kde.score_samples(external_data) # score eval_outside examples on adversary density
#             FPR = sum(density_marketer_new > density_adversary_new)/len(density_marketer_new) # calculate FPR
#             TNR = 1 - FPR
#             FNR = 1 - TPR
  
#             risk_vals = [(1 - (1/N) - FPR)/FNR, (1 - (1/N) - FNR)/FPR]
#             epsilons[n].append(math.log(risk_vals[np.argmax(risk_vals)])) # append resulting epsilon to epsilons

    
    