### Assess empirical epsilon for Anand & Lee (2022) Method

In [1]:
import math
import numpy as np
import statistics
from sklearn import metrics
from __future__ import print_function, division
from functools import partial
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import argparse
import keras
from tensorflow.keras import backend as K
from sklearn.linear_model import LinearRegression
import sys
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam
import pandas as pd
import io
from keras.models import load_model
import time
from scipy.stats import pearsonr
from keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply, GaussianNoise
from keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D
from keras.layers import MaxPooling2D, LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D, Conv1D
from keras.models import Sequential, Model
from keras import losses
import keras.backend as K
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
import os
from sklearn.model_selection import train_test_split
import random

ImportError: Traceback (most recent call last):
  File "C:\Users\cdbale\AppData\Roaming\Python\Python311\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

In [None]:


# Set global seeds so that Python, NumPy and TensorFlow draw repeatable numbers.
seed=1
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so assigning it here
# does not change hashing in this process; it is only inherited by child
# processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(seed)
# For working on GPUs from "TensorFlow Determinism".
# This is an on/off switch, not a seed: it must stay "1" even if `seed` changes
# (the previous str(seed) only worked because seed happened to be 1).
os.environ["TF_DETERMINISTIC_OPS"] = "1"
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)
# Quick visual check that the seed took effect (also consumes one draw).
print(random.random())

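# # define utility -- NOTE(review): disabled here, yet utility() is still called in the evaluation loop further down; it must be defined before that loop runs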
# def utility(real_data, protected_data):
#   from sklearn.linear_model import LinearRegression
#   from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
#   reg = LinearRegression()
#   reg.fit(np.array(real_data)[:,1:9],np.array(real_data)[:,0])
#   reg_protect = LinearRegression()
#   reg_protect.fit(np.array(protected_data)[:,1:9],np.array(protected_data)[:,0])
#   MAPD = mean_absolute_percentage_error(reg.coef_, reg_protect.coef_)*100
#   MAE = mean_absolute_error(reg.coef_, reg_protect.coef_)
#   MSE = mean_squared_error(reg.coef_, reg_protect.coef_)
#   return MAPD, MAE, MSE

"""# Anand and Lee (2022)"""

## Settings reported as optimal in the paper. They are only used to print the
## resulting number of epochs; every name is reassigned further down.
batch_size = 128
iterations = 100000 + 1
N = samples = 1262423

epochs = iterations/(N/batch_size)
print(epochs)

#### Keeping these parameters as in Ponte et al.

## N*3 is used as the reference for the desired sample size.
## (the original note lists the sizes 100, 3000, 10000 -- TODO confirm which
## quantity these refer to)

# Smaller experiment: the intent is to train for the same number of epochs
# as above, so iterations and batch size are scaled down together with N.
N, batch_size, iterations = 10000, 100, 1000
samples = 3 * N
epochs = iterations/(N/batch_size)
print(epochs)

class GAN():
    """Small fully-connected GAN for a single numeric column.

    The generator maps ``latent_dim`` (= 1) noise values to one tanh-bounded
    output; the discriminator maps one value to a sigmoid score. ``combined``
    stacks generator and (frozen) discriminator and is used to update the
    generator.

    After ``train`` has run, the per-iteration histories are available as
    ``self.generator_losses`` and ``self.discriminator_acc``.
    """

    def __init__(self, privacy):
        """Build and compile the discriminator, the generator and the stack.

        Args:
            privacy: if truthy, the discriminator is rebuilt and compiled with
                a differentially private Adam optimizer instead of plain Adam.
        """
        self.img_rows = 1
        self.img_cols = 1
        self.img_shape = (self.img_cols,)
        self.latent_dim = 1

        # NOTE(review): this single optimizer instance is also passed to
        # `self.combined` below. Some Keras versions reject an optimizer that
        # is reused for a different set of variables -- confirm against the
        # installed version before changing it, since separate optimizers
        # would alter the training numerics.
        optimizer = keras.optimizers.Adam()
        self.discriminator = self.build_discriminator()
        self.discriminator.compile(loss='binary_crossentropy',
                                   optimizer=optimizer,
                                   metrics=['accuracy'])
        if privacy:
            print("using differential privacy")
            # Build and compile the discriminator again; this replaces the one
            # created above. The first build is kept on purpose so the order
            # in which layers are initialised stays as it was.
            # NOTE(review): DPKerasAdamOptimizer, l2_norm_clip,
            # noise_multiplier, num_microbatches and lr are not defined in the
            # visible part of this file; they must exist at module level
            # before privacy=True can work.
            # NOTE(review): the loss uses from_logits=True while the
            # discriminator ends in a sigmoid -- confirm this is intended.
            self.discriminator = self.build_discriminator()
            self.discriminator.compile(optimizer=DPKerasAdamOptimizer(
                l2_norm_clip=l2_norm_clip,
                noise_multiplier=noise_multiplier,
                num_microbatches=num_microbatches,
                learning_rate=lr),
                loss= tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.losses.Reduction.NONE), metrics=['accuracy'])

        # Build the generator
        self.generator = self.build_generator()

        # The generator takes noise as input and generates imgs
        z = Input(shape=(self.latent_dim,))
        img = self.generator(z)

        # For the combined model we will only train the generator
        self.discriminator.trainable = False

        # The discriminator takes generated images as input and determines validity
        valid = self.discriminator(img)

        # The combined model  (stacked generator and discriminator)
        # Trains the generator to fool the discriminator
        self.combined = Model(z, valid)
        self.combined.compile(loss='binary_crossentropy', optimizer= optimizer)

    def build_generator(self):
        """Return the generator: noise of width latent_dim -> one tanh output."""
        model = Sequential()
        model.add(Dense(self.latent_dim, input_dim=self.latent_dim))
        model.add(LeakyReLU(alpha=0.2))
        #model.add(BatchNormalization())
        model.add(Dense(1024, input_shape=self.img_shape))
        model.add(LeakyReLU(alpha=0.2))
        #model.add(BatchNormalization())
        model.add(Dense(self.latent_dim))
        model.add(Activation("tanh"))

        #model.summary()

        noise = Input(shape=(self.latent_dim,))
        img = model(noise)
        return Model(noise, img)

    def build_discriminator(self):
        """Return the discriminator: one input value -> sigmoid validity score."""
        model = Sequential()

        model.add(Dense(1024, input_shape=self.img_shape))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(1, activation='sigmoid'))

        #model.summary()

        img = Input(shape=self.img_shape)
        validity = model(img)

        return Model(img, validity)

    def train(self, data, iterations, batch_size, model_name, generator_losses=None, discriminator_acc=None, correlations=None, accuracy=None, MAPD_col=None, MSE_col=None, MAE_col=None):
        """Alternate discriminator and generator updates, then save the generator.

        Args:
            data: array indexed by row (``data[idx]``) holding the real samples.
            iterations: number of training steps (one batch each).
            batch_size: rows drawn (with replacement) per step.
            model_name: path handed to ``self.generator.save``.
            generator_losses: optional earlier generator losses to extend.
            discriminator_acc: optional earlier discriminator accuracies (in
                percent) to extend.
            correlations, accuracy, MAPD_col, MSE_col, MAE_col: accepted for
                compatibility with existing callers; not used.

        Returns:
            None. The histories are stored on the instance as
            ``self.generator_losses`` and ``self.discriminator_acc``; sequences
            passed in by the caller are copied, never modified.
        """
        # Adversarial ground truths
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))
        #fake += 0.05 * np.random.random(fake.shape)
        #valid += 0.05 * np.random.random(valid.shape)

        # Work on private copies: plain lists grow in O(1) per step and the
        # caller's (or a shared default) object is never touched.
        loss_history = [] if generator_losses is None else np.ravel(generator_losses).tolist()
        acc_history = [] if discriminator_acc is None else np.ravel(discriminator_acc).tolist()

        for _ in range(iterations):

            # ---------------------
            #  Train Discriminator
            # ---------------------

            # Select a random batch of images
            idx = np.random.randint(0, data.shape[0], batch_size)
            imgs = data[idx]

            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))

            # Generate a batch of new images
            gen_imgs = self.generator.predict(noise, verbose = False)

            # Train the discriminator
            d_loss_real = self.discriminator.train_on_batch(imgs, valid)
            d_loss_fake = self.discriminator.train_on_batch(gen_imgs, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # ---------------------
            #  Train Generator
            # ---------------------
            # Train the generator (to have the discriminator label samples as valid)

            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            g_loss = self.combined.train_on_batch(noise, valid)

            # collect losses (index 1 of d_loss is the accuracy metric)
            acc_history.append(100*d_loss[1])
            loss_history.append(g_loss)

        # Keep the histories reachable; previously they were dropped on return.
        self.generator_losses = np.asarray(loss_history, dtype=float)
        self.discriminator_acc = np.asarray(acc_history, dtype=float)

        self.generator.save(model_name)

# Empirical-epsilon experiment: 100 repetitions. Each repetition trains one GAN
# on the training split and one on the adversary split, fits a kernel density
# estimate to each protected data set, and turns the resulting membership
# inference error rates into an empirical epsilon.
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)
epsilons = np.array([])
MAPD_col = np.array([])
MAE_col = np.array([])
MSE_col = np.array([])

# The raw data is identical in every repetition: load and de-duplicate it once.
churn_full = pd.read_csv('data.csv', sep = ',', na_values=['(NA)']).fillna(0)
churn_full = churn_full.drop_duplicates()

# Bandwidth grid shared by both kernel density searches.
params = {"bandwidth": np.logspace(-1, 1, 20)}

for run in range(0, 100):
    # Seed with the repetition number; the splits and noise draws below rely
    # on the global NumPy state, so the order of the calls matters.
    random.seed(run)
    np.random.seed(run)
    tf.random.set_seed(run)
    churn, evaluation_outside_training = train_test_split(churn_full, train_size = int(samples*2/3), test_size = int(30000), stratify = churn_full['Churn'])
    train_original, adversary_training = train_test_split(churn, train_size = int(samples*1/3), stratify= churn['Churn'])
    N = len(train_original)/10

    train_outcome = train_original[['Tenure']]
    train_covariates = train_original.drop('Tenure', axis=1)

    adversary_training_outcome = adversary_training[['Tenure']]
    adversary_training_covariates = adversary_training.drop('Tenure', axis=1)

    # Scale Tenure to [-1, 1], the range of the generator's tanh output.
    scaler0 = MinMaxScaler(feature_range= (-1, 1))
    scaler0 = scaler0.fit(train_outcome)
    train_outcome = scaler0.transform(train_outcome)
    train_outcome = pd.DataFrame(train_outcome)

    print("start train set training")
    gan_train = GAN(privacy = False)
    gan_train.train(data = np.array(train_outcome), iterations=iterations, batch_size=batch_size, model_name = "train_anand.h5")

    # Generate a batch of new customers
    generator = load_model('train_anand.h5', compile = True)
    noise = np.random.normal(0, 1, (len(train_outcome), 1))
    gen_imgs = generator.predict(noise, verbose = False)
    gen_imgs = scaler0.inverse_transform(gen_imgs)
    gen_imgs = gen_imgs.reshape(len(train_outcome), 1)
    train_GAN = pd.DataFrame(gen_imgs)

    # adversary has access to the model and samples another adversary_sample
    print("start adversary set training")

    scaler1 = MinMaxScaler(feature_range= (-1, 1))
    scaler1 = scaler1.fit(adversary_training_outcome)
    adversary_training_outcome = scaler1.transform(adversary_training_outcome)
    adversary_training_outcome = pd.DataFrame(adversary_training_outcome)

    gan_adv = GAN(privacy = False)
    gan_adv.train(data = np.array(adversary_training_outcome), iterations=iterations, batch_size=batch_size, model_name = "adversary_anand.h5")

    generator = load_model('adversary_anand.h5', compile = True)

    noise = np.random.normal(0, 1, (len(adversary_training_outcome), 1))
    # Generate a batch of new images
    gen_imgs = generator.predict(noise, verbose = False)
    gen_imgs = scaler1.inverse_transform(gen_imgs)
    gen_imgs = gen_imgs.reshape(len(adversary_training_outcome), 1)
    adversary_training_GAN = pd.DataFrame(gen_imgs)

    # combine one protected variable with other
    train = pd.concat([train_covariates.reset_index(drop = True), train_GAN], axis=1)
    adversary = pd.concat([adversary_training_covariates.reset_index(drop = True), adversary_training_GAN], axis=1)

    # step 1, 2: fit one density estimate per protected data set
    train.rename(columns = {0:'Tenure'}, inplace = True)
    adversary.rename(columns = {0:'Tenure'}, inplace = True)
    grid_train = GridSearchCV(KernelDensity(), params, n_jobs = -1)
    grid_train.fit(train)
    kde_train = grid_train.best_estimator_

    grid = GridSearchCV(KernelDensity(), params, n_jobs = -1)
    grid.fit(adversary)
    kde_adversary = grid.best_estimator_
    evaluation_outside_training = evaluation_outside_training[['Churn','Sex', 'Age', 'Contact', 'Household_size', 'Social_class', 'Income', 'Ethnicity', 'Tenure']]

    # step 3: share of training rows scored higher by the training density
    density_train = kde_train.score_samples(train) # f1
    density_adversary = kde_adversary.score_samples(train) # f2
    TPR = np.mean(density_train > density_adversary) # all training!

    # step 4: the same comparison on rows that were never used for training
    density_train_new = kde_train.score_samples(evaluation_outside_training) # f1
    density_adversary_new = kde_adversary.score_samples(evaluation_outside_training) # f2
    FPR = np.mean(density_train_new > density_adversary_new) # random!
    TNR = 1 - FPR
    FNR = 1 - TPR
    print("FPR is " + str(FPR))
    print("FNR is " + str(FNR))
    print("TPR is " + str(TPR))
    print("TNR is " + str(TNR))
    try:
        epsilon = max(math.log((1 - (1/N) - FPR)/FNR), math.log((1 - (1/N) - FNR)/FPR))
    except (ValueError, ZeroDivisionError):
        # math.log rejects non-positive arguments; fall back to the first
        # term only, as before. Other exceptions are no longer swallowed.
        epsilon = math.log((1 - (1/N) - FPR)/FNR)
    epsilons = np.append(epsilons, epsilon)
    print("empirical epsilon = " + str(epsilon))

    # utility
    # NOTE(review): no `utility` function is defined in the visible part of
    # this file (the only definition is commented out near the top), so these
    # calls raise NameError unless it is supplied elsewhere -- confirm the
    # definition and its expected inputs before starting a long run.
    MAPD_train, MAE_train, MSE_train = utility(real_data = train, protected_data = train_GAN)
    MAPD_adv, MAE_adv, MSE_adv = utility(real_data = train, protected_data = adversary_training_GAN)
    MAPD_col = np.append(MAPD_col, ((MAPD_train+MAPD_adv)/2))
    MAE_col = np.append(MAE_col, ((MAE_train+MAE_adv)/2))
    MSE_col = np.append(MSE_col, ((MSE_train+MSE_adv)/2))
    print("MAPD train = " + str(MAPD_train))
    print("MAPD adversary = " + str(MAPD_adv))

np.savetxt("epsilons_anand_30000.csv", epsilons, delimiter=",")
np.savetxt("MAPD_anand_30000.csv", MAPD_col, delimiter=",")
np.savetxt("MAE_anand_30000.csv", MAE_col, delimiter=",")
np.savetxt("MSE_anand_30000.csv", MSE_col, delimiter=",")