In this notebook, we generate all synthetic datasets that are _not_ tailored to a specific task. This notebook includes the code to generate the datasets with the right parameters (including seeds) and to save the results.

In [1]:
import json
import numpy as np
import os
import pandas as pd
import tqdm
import tqdm.notebook

In [2]:
# Loads of warnings coming from the sub-methods, which we don't care about.
import warnings
warnings.filterwarnings("ignore")

In [3]:
from reprosyn.methods import IPF, MST, CTGAN, PATEGAN, PRIVBAYES, DS_PRIVBAYES, SYNTHPOP

2022-11-03 13:40:33.700641: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
non-resource variables are not supported in the long term


In [4]:
from utils import load_data, metadata, run_generator, target_folder

#### Parameters

In [5]:
# Single source of randomness for the notebook: the per-run seeds are derived from it.
master_seed = 42_387_342

In [6]:
# Number of seeds drawn below, i.e. number of runs performed for each method.
num_runs = 5

#### Data

Generate synthetic datasets from the "training" part of the data.

In [7]:
# Training split only: this is the data handed to every generator in the run loop.
df = load_data(train=True, test=False)

We want to generate datasets with the same size as the *full* data.

In [8]:
# Record count of train + test combined; the loaded frame is only used for its length.
# This value is passed to run_generator as the requested output size.
output_size = len(load_data(train=True, test=True))

In [9]:
# TODO, maybe: restrict columns?

#### Methods

In [10]:
# Generators to run. Each entry is a (method_name, generator, kwargs) triple:
#   - method_name: label of the configuration, also used as the name of its output sub-folder;
#   - generator:   generator object imported from reprosyn.methods;
#   - kwargs:      dict passed to run_generator (presumably forwarded to the generator -- confirm in utils).
# The commented-out entries are the epsilon=1 variants, which are currently disabled.
methods = [
#     ("MST_eps1", MST, {"epsilon": 1}),
    ("MST_eps1000", MST, {"epsilon": 1000}),
    ("CTGAN_10epochs", CTGAN, {"epochs": 10}),
#     ("PATEGAN_eps1", PATEGAN, {"epsilon": 1}),
    ("PATEGAN_eps1000", PATEGAN, {"epsilon": 1000}),
#     ("PrivBayes_eps1", DS_PRIVBAYES, {"epsilon": 1}),
    ("PrivBayes_eps1000", DS_PRIVBAYES, {"epsilon": 1000}),
    ("SYNTHPOP", SYNTHPOP, {}),
]

#### Configuration

In [11]:
# Derive one seed per run from the master seed, so that the same seeds are obtained on every execution.
# The global NumPy RNG is seeded here; the draw must directly follow the seeding to stay reproducible.
np.random.seed(master_seed)
# randint's upper bound is exclusive, so seeds lie in [0, 32767) (int16 max = 32767).
seeds = np.random.randint(np.iinfo(np.int16).max, size=num_runs)

#### Runs

In [12]:
# Run every configured method once per seed.
# Each method gets its own sub-folder of `target_folder` (presumably where
# run_generator stores its results -- confirm in utils).
for method_name, generator, kwargs in methods:
    print('='*10, method_name, '='*10)
    target_folder_method = f"{target_folder}/{method_name}"
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it also creates
    # `target_folder` itself when it is missing (os.mkdir would raise in that case)
    # and does not fail if the folder appears between the check and the creation.
    os.makedirs(target_folder_method, exist_ok=True)
    # One run per seed; tqdm shows a progress bar per method.
    for seed in tqdm.notebook.tqdm(seeds):
        run_generator(df, method_name, seed, generator, output_size, kwargs)
    print()



  0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/5 [00:00<?, ?it/s]

2022-11-03 14:35:03.037279: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-03 14:35:06.717084: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled





  0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/5 [00:00<?, ?it/s]

train_Region
train_Residence Type
train_Family Composition
train_Population Base
train_Sex
train_Age
train_Marital Status
train_Student
train_Country of Birth
train_Health
train_Ethnic Group
train_Religion
train_Economic Activity
train_Occupation
train_Industry
train_Hours worked per week
train_Approximated Social Grade
generate_Region
generate_Residence Type
generate_Family Composition
generate_Population Base
generate_Sex
generate_Age
generate_Marital Status
generate_Student
generate_Country of Birth
generate_Health
generate_Ethnic Group
generate_Religion
generate_Economic Activity
generate_Occupation
generate_Industry
generate_Hours worked per week
generate_Approximated Social Grade
train_Region
train_Residence Type
train_Family Composition
train_Population Base
train_Sex
train_Age
train_Marital Status
train_Student
train_Country of Birth
train_Health
train_Ethnic Group
train_Religion
train_Economic Activity
train_Occupation
train_Industry
train_Hours worked per week
train_Approxima