# Synthesize Criteo data using the DP-GAN approach of Ponte et al.

Note that we use the following python modules:

- tensorflow==2.15.0
- keras==2.15.0
- tensorflow-estimator==2.15.0
- tensorflow-privacy==0.9.0
- numpy==1.26.4
- pandas==2.2.2
- scikit-learn==1.4.2
- scipy==1.11.4
- absl-py==1.4.0

The following statements can be used to install the required Python modules:

```bash
pip install tensorflow==2.15.0
pip install keras==2.15.0
pip install tensorflow-estimator==2.15.0
pip install tensorflow-privacy==0.9.0
pip install numpy==1.26.4
pip install pandas==2.2.2
pip install scikit-learn==1.4.2
pip install scipy==1.11.4
pip install absl-py==1.4.0
```

Perform a quick check that `tensorflow`, `keras` and `tensorflow_privacy` are installed and importable. Also check versions of `NumPy`, `Pandas`, and `Scikit-learn`.

In [1]:
# Environment sanity check: import the core stack, report the installed
# versions, and make sure the DP optimizer can be constructed.
import tensorflow as tf, keras, numpy as np, pandas as pd, sklearn
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasAdamOptimizer

# Expected versions (matching the pinned installs):
#   TF 2.15.0, Keras 2.15.0, NumPy 1.26.4, Pandas 2.2.2, Sklearn 1.4.2
for label, module in (("TF", tf),
                      ("Keras", keras),
                      ("NumPy", np),
                      ("Pandas", pd),
                      ("Sklearn", sklearn)):
    print(label + ":", module.__version__)

# Instantiating the optimizer verifies that tensorflow-privacy is importable
# and usable; the instance itself is discarded.
_ = DPKerasAdamOptimizer(
    l2_norm_clip=1.0,
    noise_multiplier=0.5,
    num_microbatches=1,
    learning_rate=1e-3,
)
print("DP optimizer OK")





TF: 2.15.0
Keras: 2.15.0
NumPy: 1.26.4
Pandas: 2.2.2
Sklearn: 1.4.2
DP optimizer OK


Import required packages.

In [2]:
import math
import numpy as np
import statistics
from sklearn import metrics
from functools import partial
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import argparse
import keras
from tensorflow.keras import backend as K
from sklearn.linear_model import LinearRegression
import sys
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam
import pandas as pd
import io
from keras.models import load_model
import time
from scipy.stats import pearsonr
from keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply, GaussianNoise
from keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D
from keras.layers import MaxPooling2D, LeakyReLU
from keras.layers import UpSampling2D, Conv2D, Conv1D
from keras.models import Sequential, Model
from keras import losses
import keras.backend as K
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
import os
from sklearn.model_selection import train_test_split
import random
from keras.models import load_model
from absl import app
from absl import flags
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasSGDOptimizer, DPKerasAdamOptimizer
from tensorflow_privacy.privacy.analysis.compute_dp_sgd_privacy_lib import compute_dp_sgd_privacy
from sklearn.preprocessing import MinMaxScaler

Import Criteo data (small version is for testing, results are based on 'full' version).

In [3]:
# Select which cleaned Criteo file to load: "small" is for quick testing,
# "full" is the data set the reported results are based on.
data_set = "full"
criteo_files = {
    "small": "../../Data/Criteo/cleaned_criteo_small.gz",
    "full": "../../Data/Criteo/cleaned_criteo.gz",
}

# The separator is a regular expression handled by the python engine; it is
# written as a raw string because '\,' is an invalid escape sequence in a
# normal string literal (the resulting pattern is unchanged).
train_data = pd.read_csv(criteo_files[data_set],
                         compression='gzip',
                         sep=r'\,',
                         header=0,
                         engine='python')

View confidential data to synthesize.

In [4]:
# Display the loaded DataFrame (rendered by the notebook as the cell output).
train_data

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,treatment,conversion,visit,exposure
0,12.616365,10.059654,8.976429,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
1,12.616365,10.059654,9.002689,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
2,12.616365,10.059654,8.964775,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
3,12.616365,10.059654,9.002801,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
4,12.616365,10.059654,9.037999,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13979587,26.297764,10.059654,9.006250,4.679882,10.280525,4.115453,-3.282109,4.833815,3.839578,13.190056,5.300375,-0.168679,1,0,0,0
13979588,12.642207,10.679513,8.214383,-1.700105,10.280525,3.013064,-13.955150,6.269026,3.971858,13.190056,5.300375,-0.168679,1,0,0,1
13979589,12.976557,10.059654,8.381868,0.842442,11.029584,4.115453,-8.281971,4.833815,3.779212,23.570168,6.169187,-0.168679,1,0,1,0
13979590,24.805064,10.059654,8.214383,4.679882,10.280525,4.115453,-1.288207,4.833815,3.971858,13.190056,5.300375,-0.168679,1,0,0,0


Define a class for estimating a differentially private GAN.

In [5]:
class GAN():
    """GAN for 16-column tabular data with an optional differentially private discriminator.

    The generator maps a 16-dimensional latent vector to a 16-dimensional
    row with values in (-1, 1) (tanh output); the discriminator maps a row
    to a single sigmoid score. Variable names use "img" for a data row.

    When ``privacy`` is truthy the discriminator is compiled with
    ``DPKerasAdamOptimizer``. In that case ``noise_multiplier`` and
    ``num_microbatches`` are read from module-level globals, so both must be
    defined before the class is instantiated.
    """

    def __init__(self, privacy):
        """Build and compile the discriminator, the generator and the stacked model.

        Args:
            privacy: If truthy, train the discriminator with the
                differentially private Adam optimizer; otherwise use plain
                Adam.
        """
        self.img_rows = 1
        self.img_cols = 16
        self.img_shape = (self.img_cols,)
        self.latent_dim = 16
        lr = 0.001

        # NOTE(review): this non-private discriminator is built even when
        # privacy is enabled and then replaced below. It is kept because
        # building it may consume random state, so removing it could change
        # seeded results -- confirm before simplifying.
        self.discriminator = self.build_discriminator()
        self.discriminator.compile(loss='binary_crossentropy',
                                   optimizer=keras.optimizers.Adam(),
                                   metrics=['accuracy'])
        if privacy:
            # Rebuild and compile the discriminator with the DP optimizer.
            # noise_multiplier and num_microbatches come from the global
            # environment; the clipping norm is fixed at 4 here.
            # NOTE(review): the discriminator ends in a sigmoid while this
            # loss is configured with from_logits=True; Keras warns about it
            # during training. Left unchanged to keep results comparable --
            # confirm this is intended.
            self.discriminator = self.build_discriminator()
            self.discriminator.compile(
                optimizer=DPKerasAdamOptimizer(
                    l2_norm_clip=4,
                    noise_multiplier=noise_multiplier,
                    num_microbatches=num_microbatches,
                    learning_rate=lr),
                loss=tf.keras.losses.BinaryCrossentropy(
                    from_logits=True, reduction=tf.losses.Reduction.NONE),
                metrics=['accuracy'])

        # Build the generator
        self.generator = self.build_generator()

        # The generator takes noise as input and generates rows
        z = Input(shape=(self.latent_dim,))
        img = self.generator(z)

        # For the combined model we will only train the generator
        self.discriminator.trainable = False

        # The discriminator scores the generated rows
        valid = self.discriminator(img)

        # The combined model (stacked generator and discriminator) trains the
        # generator to fool the discriminator. It gets its own optimizer
        # instance: a Keras optimizer is tied to the variables it first
        # updates, so sharing one instance with the (non-private)
        # discriminator fails once both models are trained.
        self.combined = Model(z, valid)
        self.combined.compile(loss='binary_crossentropy',
                              optimizer=keras.optimizers.Adam())

    def build_generator(self):
        """Return the generator model (latent vector -> synthetic row in (-1, 1))."""
        model = Sequential()
        model.add(Dense(self.latent_dim, input_dim=self.latent_dim))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(64, input_shape=self.img_shape))
        model.add(LeakyReLU(alpha=0.2))
        # Output width is the number of data columns (equal to latent_dim here).
        model.add(Dense(self.img_cols))
        model.add(Activation("tanh"))

        noise = Input(shape=(self.latent_dim,))
        img = model(noise)
        return Model(noise, img)

    def build_discriminator(self):
        """Return the discriminator model (row -> sigmoid score)."""
        model = Sequential()
        model.add(Dense(64, input_shape=self.img_shape))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(1, activation='sigmoid'))

        img = Input(shape=self.img_shape)
        validity = model(img)
        return Model(img, validity)

    def train(self, data, iterations, batch_size, sample_interval, model_name,
              generator_losses=None, discriminator_acc=None, correlations=None,
              accuracy=None, MAPD_collect=None, MSE_collect=None,
              MAE_collect=None):
        """Alternate discriminator and generator updates, then save the generator.

        Args:
            data: NumPy array of training rows; a random batch is drawn from
                its first axis (with replacement) at every iteration.
            iterations: Number of training steps; each step performs two
                discriminator updates (real batch, generated batch) and one
                generator update.
            batch_size: Number of rows per batch.
            sample_interval: Accepted for backward compatibility; unused.
            model_name: Path the trained generator is saved to.
            generator_losses, discriminator_acc, correlations, accuracy,
            MAPD_collect, MSE_collect, MAE_collect: Accepted for backward
                compatibility; unused. They default to ``None`` instead of a
                shared mutable list.
        """
        # Adversarial ground truths
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        for step in range(iterations):
            # ---------------------
            #  Train Discriminator
            # ---------------------
            # Select a random batch of real rows
            idx = np.random.randint(0, data.shape[0], batch_size)
            imgs = data[idx]

            # Generate a batch of synthetic rows
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            gen_imgs = self.generator.predict(noise, verbose=False)

            d_loss_real = self.discriminator.train_on_batch(imgs, valid)
            d_loss_fake = self.discriminator.train_on_batch(gen_imgs, fake)
            # Average [loss, accuracy] over the real and generated batches
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # ---------------------
            #  Train Generator
            # ---------------------
            # Train the generator to have the discriminator label its
            # samples as valid
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            g_loss = self.combined.train_on_batch(noise, valid)

            if (step % 100) == 0:
                print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (step, d_loss[0], 100*d_loss[1], g_loss))

        self.generator.save(model_name)

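Set the number of samples (total number of observations). The number of epochs is determined by the number of iterations and the batch size, which we leave fixed at 100 (same as Ponte et al.): `epochs = iterations / (N / batch_size)`. With `iterations = 10000` this gives `epochs = 10` only when the data contain `N = 100,000` rows; for the full Criteo data the same settings correspond to roughly 0.07 epochs (see the value displayed below).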

In [6]:
# mini-batch size used for every training step
batch_size = 100

# N: number of observations in the confidential data (used for the privacy
# accounting); `samples` is the same count and is the number of rows
# generated for each synthetic data set
N = len(train_data)
samples = N

In [7]:
### change for different data sizes
iterations = 10000

# one microbatch per example in the batch; see validation section paper.
num_microbatches = batch_size

# clipping norm; see paper in validation section.
# NOTE: the GAN class hard-codes its own clipping norm of 4 and reads
# num_microbatches and noise_multiplier from the global environment
# (noise_multiplier is not passed to the GAN directly).
l2_norm_clip = 4

# privacy parameter delta, set to 1/N
delta = 1 / N

# number of passes over the data implied by the settings above; this is 10
# when N = 100,000 and smaller for larger data sets
epochs = iterations/(N/batch_size)

In [8]:
# Display the number of passes over the data implied by iterations, N and batch_size.
epochs

0.07153284587990837

In [9]:
# Noise multipliers used for synthesis, one GAN is trained per value.
# Values for the smaller Criteo data:
#   [0.419205, 0.6227, 0.91265, 1.2243, 6.5]
# Values for the full Criteo data (active):
noise_multipliers = [
    0.281585,
    0.49147,
    0.7668,
    1.045,
    3.2849,
]

Choose noise multipliers that map to $\epsilon = 13, 3, 1, 0.5, 0.05$. The `tensorflow-privacy` package has deprecated the `compute_dp_sgd_privacy` function in favor of `compute_dp_sgd_privacy_statement`, which properly accounts for the doubling of sensitivity due to microbatching and does not assume Poisson subsampling. However, we use the existing method from Ponte et al. for consistency, and note that the true theoretical epsilon is therefore higher than the values reported below.

In [10]:
# Theoretical epsilon bound for each noise multiplier: take the first element
# of the accountant's result and round it to three decimals.
epsilon_bounds = []
for sigma in noise_multipliers:
    accounting = compute_dp_sgd_privacy(n=N,
                                        batch_size=batch_size,
                                        epochs=epochs,
                                        noise_multiplier=sigma,
                                        delta=delta)
    epsilon_bounds.append(np.round(accounting[0], 3))

# last expression of the cell, so the list is displayed
epsilon_bounds



[13.0, 3.0, 1.0, 0.5, 0.05]

In [11]:
# Optional: uncomment to silence Python / TensorFlow warnings during training.
# import warnings
# warnings.filterwarnings('ignore')

# import os
# import logging
# import tensorflow as tf
# from absl import logging as absl_logging

# # Suppress low-level TF C++ logs (0=all, 1=INFO, 2=WARNING, 3=ERROR)
# os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# # Suppress Python-level TF warnings
# tf.get_logger().setLevel(logging.ERROR)
# logging.getLogger("tensorflow").setLevel(logging.ERROR)
# absl_logging.set_verbosity(absl_logging.ERROR)

# synthetic data sets per noise multiplier, keyed by str(noise_multiplier)
all_synthetic_datasets = {}

# Keep the number of iterations and the batch size the same for every run.
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

start_time = time.time()

# scale data to (-1, 1) for GAN training (matches the generator's tanh output)
scaler0 = MinMaxScaler(feature_range = (-1, 1))
scaler0 = scaler0.fit(train_data)
train_GAN_real = scaler0.transform(train_data)
train_GAN_real = pd.DataFrame(train_GAN_real)

# the training array is the same for every noise multiplier, so convert once
train_GAN_array = np.array(train_GAN_real)

n_synthetic = 20                       # synthetic data sets per noise multiplier
model_path = "train_1.h5"              # trained generator is saved here and reloaded
categorical_columns = ['treatment', 'conversion', 'visit', 'exposure']

# Vary the noise multiplier, train a GAN and generate multiple synthetic data
# sets for each value. The loop variable must be named `noise_multiplier`:
# the GAN class reads it from the global environment.
for run_idx, noise_multiplier in enumerate(noise_multipliers):
    random.seed(run_idx)
    np.random.seed(run_idx)
    tf.random.set_seed(run_idx)

    # train GAN on train data
    gan_train = GAN(privacy = True)
    gan_train.train(data = train_GAN_array,
                    iterations = iterations,
                    batch_size = batch_size,
                    sample_interval = ((iterations-1)/10),
                    model_name = model_path)

    print('Model trained.')

    # load the saved generator
    generator = load_model(model_path)

    synthetic_datasets = []
    for data_num in range(n_synthetic):
        # set random seeds so each synthetic data set is reproducible
        random.seed(data_num)
        np.random.seed(data_num)
        tf.random.set_seed(data_num)

        # generate one synthetic data set with as many rows as the real data
        latent = np.random.normal(0, 1, (samples, gan_train.latent_dim))
        generated = generator.predict(latent, verbose = False)
        print(f"Created {data_num + 1}/{n_synthetic} synthetic data sets.")

        # invert the min-max transformation and restore the column names
        X = pd.DataFrame(scaler0.inverse_transform(generated),
                         columns = train_data.columns.values)

        # round the values of categorical variables, as done by Ponte et al.
        X[categorical_columns] = X[categorical_columns].round()

        synthetic_datasets.append(X)

    all_synthetic_datasets[str(noise_multiplier)] = synthetic_datasets

print(f"Total training and synthesis time: {time.time() - start_time:.1f} seconds.")










  output, from_logits = _get_logits(



0 [D loss: 0.717697, acc.: 48.00%] [G loss: 0.799186]
100 [D loss: 0.679923, acc.: 48.50%] [G loss: 0.635298]
200 [D loss: 0.600365, acc.: 80.50%] [G loss: 0.845760]
300 [D loss: 0.485292, acc.: 88.00%] [G loss: 1.131266]
400 [D loss: 0.708448, acc.: 51.50%] [G loss: 0.723726]
500 [D loss: 0.545595, acc.: 78.00%] [G loss: 0.807405]
600 [D loss: 0.429261, acc.: 92.00%] [G loss: 1.249392]
700 [D loss: 0.731408, acc.: 48.00%] [G loss: 0.724719]
800 [D loss: 0.726296, acc.: 56.00%] [G loss: 0.804484]
900 [D loss: 0.678559, acc.: 52.50%] [G loss: 0.710721]
1000 [D loss: 0.766765, acc.: 42.00%] [G loss: 0.644769]
1100 [D loss: 0.758578, acc.: 50.00%] [G loss: 0.704206]
1200 [D loss: 0.696809, acc.: 49.00%] [G loss: 0.786539]
1300 [D loss: 0.653385, acc.: 59.00%] [G loss: 0.767821]
1400 [D loss: 0.487984, acc.: 92.50%] [G loss: 0.868770]
1500 [D loss: 0.838662, acc.: 21.00%] [G loss: 0.600395]
1600 [D loss: 0.533594, acc.: 78.00%] [G loss: 0.992828]
1700 [D loss: 0.769054, acc.: 49.00%] [G lo

  saving_api.save_model(


Model trained.




Created 1/20 synthetic data sets.
Created 2/20 synthetic data sets.
Created 3/20 synthetic data sets.
Created 4/20 synthetic data sets.
Created 5/20 synthetic data sets.
Created 6/20 synthetic data sets.
Created 7/20 synthetic data sets.
Created 8/20 synthetic data sets.
Created 9/20 synthetic data sets.
Created 10/20 synthetic data sets.
Created 11/20 synthetic data sets.
Created 12/20 synthetic data sets.
Created 13/20 synthetic data sets.
Created 14/20 synthetic data sets.
Created 15/20 synthetic data sets.
Created 16/20 synthetic data sets.
Created 17/20 synthetic data sets.
Created 18/20 synthetic data sets.
Created 19/20 synthetic data sets.
Created 20/20 synthetic data sets.


  output, from_logits = _get_logits(


0 [D loss: 0.712377, acc.: 37.50%] [G loss: 0.687657]
100 [D loss: 0.769662, acc.: 38.00%] [G loss: 0.587318]
200 [D loss: 0.692474, acc.: 45.50%] [G loss: 0.623007]
300 [D loss: 0.539679, acc.: 88.00%] [G loss: 0.889849]
400 [D loss: 0.644386, acc.: 59.50%] [G loss: 0.884852]
500 [D loss: 0.606272, acc.: 74.50%] [G loss: 0.910845]
600 [D loss: 1.081889, acc.: 27.00%] [G loss: 0.471749]
700 [D loss: 0.536413, acc.: 85.00%] [G loss: 0.927639]
800 [D loss: 0.720845, acc.: 63.00%] [G loss: 0.889657]
900 [D loss: 0.709634, acc.: 54.50%] [G loss: 0.679301]
1000 [D loss: 0.706853, acc.: 61.50%] [G loss: 0.737439]
1100 [D loss: 0.711189, acc.: 60.50%] [G loss: 0.752824]
1200 [D loss: 0.784429, acc.: 37.50%] [G loss: 0.699633]
1300 [D loss: 0.600872, acc.: 71.00%] [G loss: 0.718808]
1400 [D loss: 0.667406, acc.: 60.00%] [G loss: 0.775057]
1500 [D loss: 0.672801, acc.: 66.00%] [G loss: 0.884597]
1600 [D loss: 0.815819, acc.: 46.50%] [G loss: 0.727503]
1700 [D loss: 0.749009, acc.: 44.00%] [G lo

  saving_api.save_model(


Model trained.




Created 1/20 synthetic data sets.
Created 2/20 synthetic data sets.
Created 3/20 synthetic data sets.
Created 4/20 synthetic data sets.
Created 5/20 synthetic data sets.
Created 6/20 synthetic data sets.
Created 7/20 synthetic data sets.
Created 8/20 synthetic data sets.
Created 9/20 synthetic data sets.
Created 10/20 synthetic data sets.
Created 11/20 synthetic data sets.
Created 12/20 synthetic data sets.
Created 13/20 synthetic data sets.
Created 14/20 synthetic data sets.
Created 15/20 synthetic data sets.
Created 16/20 synthetic data sets.
Created 17/20 synthetic data sets.
Created 18/20 synthetic data sets.
Created 19/20 synthetic data sets.
Created 20/20 synthetic data sets.


  output, from_logits = _get_logits(


0 [D loss: 0.496136, acc.: 76.50%] [G loss: 0.698081]
100 [D loss: 0.753981, acc.: 42.50%] [G loss: 0.549714]
200 [D loss: 0.699537, acc.: 50.50%] [G loss: 0.689856]
300 [D loss: 0.667475, acc.: 75.50%] [G loss: 0.832108]
400 [D loss: 0.409153, acc.: 93.50%] [G loss: 1.164187]
500 [D loss: 0.515485, acc.: 82.00%] [G loss: 1.062199]
600 [D loss: 0.864151, acc.: 37.50%] [G loss: 0.713775]
700 [D loss: 0.573642, acc.: 79.00%] [G loss: 0.820848]
800 [D loss: 0.758965, acc.: 66.50%] [G loss: 0.899719]
900 [D loss: 0.490529, acc.: 85.00%] [G loss: 0.874714]
1000 [D loss: 0.771344, acc.: 32.50%] [G loss: 0.658767]
1100 [D loss: 0.701541, acc.: 62.50%] [G loss: 0.751809]
1200 [D loss: 0.873072, acc.: 43.50%] [G loss: 0.750356]
1300 [D loss: 0.508142, acc.: 77.00%] [G loss: 0.913487]
1400 [D loss: 0.548437, acc.: 84.50%] [G loss: 0.845491]
1500 [D loss: 0.790354, acc.: 57.50%] [G loss: 0.819828]
1600 [D loss: 0.673702, acc.: 78.00%] [G loss: 0.947754]
1700 [D loss: 0.707852, acc.: 61.50%] [G lo

  saving_api.save_model(


Model trained.




Created 1/20 synthetic data sets.
Created 2/20 synthetic data sets.
Created 3/20 synthetic data sets.
Created 4/20 synthetic data sets.
Created 5/20 synthetic data sets.
Created 6/20 synthetic data sets.
Created 7/20 synthetic data sets.
Created 8/20 synthetic data sets.
Created 9/20 synthetic data sets.
Created 10/20 synthetic data sets.
Created 11/20 synthetic data sets.
Created 12/20 synthetic data sets.
Created 13/20 synthetic data sets.
Created 14/20 synthetic data sets.
Created 15/20 synthetic data sets.
Created 16/20 synthetic data sets.
Created 17/20 synthetic data sets.
Created 18/20 synthetic data sets.
Created 19/20 synthetic data sets.
Created 20/20 synthetic data sets.


  output, from_logits = _get_logits(


0 [D loss: 0.888432, acc.: 41.50%] [G loss: 0.786200]
100 [D loss: 0.709028, acc.: 30.50%] [G loss: 0.588364]
200 [D loss: 0.476131, acc.: 98.00%] [G loss: 0.940437]
300 [D loss: 0.660891, acc.: 53.50%] [G loss: 0.703461]
400 [D loss: 0.851783, acc.: 5.00%] [G loss: 0.606419]
500 [D loss: 0.738596, acc.: 40.00%] [G loss: 0.607260]
600 [D loss: 0.657953, acc.: 59.00%] [G loss: 0.692789]
700 [D loss: 0.718753, acc.: 46.50%] [G loss: 0.656974]
800 [D loss: 0.600192, acc.: 84.00%] [G loss: 0.816476]
900 [D loss: 0.688377, acc.: 56.50%] [G loss: 0.707166]
1000 [D loss: 0.734971, acc.: 52.50%] [G loss: 0.650678]
1100 [D loss: 0.660259, acc.: 50.00%] [G loss: 0.832685]
1200 [D loss: 0.676727, acc.: 59.00%] [G loss: 0.715964]
1300 [D loss: 0.673043, acc.: 57.50%] [G loss: 0.761562]
1400 [D loss: 0.736519, acc.: 31.50%] [G loss: 0.655225]
1500 [D loss: 0.576020, acc.: 75.50%] [G loss: 0.807433]
1600 [D loss: 0.725970, acc.: 43.00%] [G loss: 0.693479]
1700 [D loss: 0.597486, acc.: 82.00%] [G los

  saving_api.save_model(


Model trained.




Created 1/20 synthetic data sets.
Created 2/20 synthetic data sets.
Created 3/20 synthetic data sets.
Created 4/20 synthetic data sets.
Created 5/20 synthetic data sets.
Created 6/20 synthetic data sets.
Created 7/20 synthetic data sets.
Created 8/20 synthetic data sets.
Created 9/20 synthetic data sets.
Created 10/20 synthetic data sets.
Created 11/20 synthetic data sets.
Created 12/20 synthetic data sets.
Created 13/20 synthetic data sets.
Created 14/20 synthetic data sets.
Created 15/20 synthetic data sets.
Created 16/20 synthetic data sets.
Created 17/20 synthetic data sets.
Created 18/20 synthetic data sets.
Created 19/20 synthetic data sets.
Created 20/20 synthetic data sets.


  output, from_logits = _get_logits(


0 [D loss: 0.522558, acc.: 55.00%] [G loss: 0.577350]
100 [D loss: 0.729107, acc.: 42.50%] [G loss: 0.553248]
200 [D loss: 0.613371, acc.: 62.00%] [G loss: 0.635321]
300 [D loss: 0.629466, acc.: 80.00%] [G loss: 0.782421]
400 [D loss: 0.580263, acc.: 80.50%] [G loss: 0.860062]
500 [D loss: 0.709631, acc.: 28.50%] [G loss: 0.676202]
600 [D loss: 0.693739, acc.: 60.50%] [G loss: 0.711263]
700 [D loss: 0.732820, acc.: 38.50%] [G loss: 0.649911]
800 [D loss: 0.737175, acc.: 36.00%] [G loss: 0.631370]
900 [D loss: 0.655101, acc.: 61.00%] [G loss: 0.826390]
1000 [D loss: 0.655214, acc.: 63.00%] [G loss: 0.810637]
1100 [D loss: 0.719417, acc.: 38.50%] [G loss: 0.639939]
1200 [D loss: 0.670894, acc.: 68.50%] [G loss: 0.693543]
1300 [D loss: 0.716413, acc.: 49.50%] [G loss: 0.693027]
1400 [D loss: 0.694930, acc.: 47.50%] [G loss: 0.687037]
1500 [D loss: 0.705417, acc.: 47.50%] [G loss: 0.682033]
1600 [D loss: 0.699486, acc.: 45.50%] [G loss: 0.691056]
1700 [D loss: 0.643723, acc.: 80.00%] [G lo

  saving_api.save_model(


Model trained.




Created 1/20 synthetic data sets.
Created 2/20 synthetic data sets.
Created 3/20 synthetic data sets.
Created 4/20 synthetic data sets.
Created 5/20 synthetic data sets.
Created 6/20 synthetic data sets.
Created 7/20 synthetic data sets.
Created 8/20 synthetic data sets.
Created 9/20 synthetic data sets.
Created 10/20 synthetic data sets.
Created 11/20 synthetic data sets.
Created 12/20 synthetic data sets.
Created 13/20 synthetic data sets.
Created 14/20 synthetic data sets.
Created 15/20 synthetic data sets.
Created 16/20 synthetic data sets.
Created 17/20 synthetic data sets.
Created 18/20 synthetic data sets.
Created 19/20 synthetic data sets.
Created 20/20 synthetic data sets.


Save synthetic data sets.

In [12]:
synthetic_data_path = "../../Data/Criteo/"

# File-name labels for epsilon = 13, 3, 1, 0.5, 0.05. They are matched by
# position to the entries of all_synthetic_datasets, which were inserted in
# the order of the noise multipliers.
epsilons = ["13", "3", "1", "05", "005"]

# Create the output directory once, up front; exist_ok avoids the
# check-then-create race of testing os.path.exists first.
os.makedirs(synthetic_data_path, exist_ok=True)

for e, sXs in enumerate(all_synthetic_datasets.values()):
    for i, X in enumerate(sXs):
        # file name pattern: dpgan_<epsilon label>_<data set index>_<small|full>.csv
        X.to_csv(f"{synthetic_data_path}dpgan_{epsilons[e]}_{i}_{data_set}.csv", index=False)

End of file.