# Import modules

In [1]:
from sklearn.datasets import load_diabetes
import pickle
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import os
import torch


from synthcity.metrics.eval_performance import (
    PerformanceEvaluatorMLP,
    PerformanceEvaluatorXGB,
)
from synthcity.utils import reproducibility
from synthcity.plugins import Plugins
import synthcity.logger as log
from synthcity.plugins.core.dataloader import GenericDataLoader
from DGE_utils import metric_different_datasets, mean_across_pandas, add_std, get_folder_names

# Clear synthcity's reproducibility cache before anything else runs.
reproducibility.clear_cache()

# Pick the GPU when torch can see one; the assertion below makes it mandatory.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# List the available "generic" plugins (the returned value is not kept).
Plugins(categories=["generic"]).list()

# Fail fast with an actionable message rather than a bare AssertionError.
assert device.type == 'cuda', (
    "This notebook expects a CUDA device, but torch selected "
    f"'{device.type}'. Check the GPU/driver set-up before running."
)


  from .autonotebook import tqdm as notebook_tqdm


# Load data and settings

In [2]:
from DGE_data import get_real_and_synthetic

# Datasets used in the experiments (restricted to classification datasets).
# 'gaussian' is deliberately left out. Full list kept for reference:
# ['moons', 'circles', 'cal_housing', 'adult', 'diabetes', 'breast_cancer', 'seer', 'cutract']
datasets = ['moons', 'circles', 'adult', 'seer']

# Synthetic data (generative) model.
model_name = 'ctgan_deep'

# --- Generative model / ensemble sizes ---------------------------------------
# Proportion of training data for the generative model (default values if None).
p_train = 0.8
# Number of models in the ensemble, for each run.
n_models = 20
# Maximum number of data points used to train the generative model.
max_n = 2000
# Number of synthetic data points per synthetic dataset
# (falls back to the generative training size when None, see below).
nsyn = 2000
# Number of runs. Don't choose this too large: the total number of synthetic
# datasets is num_runs * n_models.
num_runs = 10

# --- Caching / persistence flags ---------------------------------------------
load = True      # load previously stored results
load_syn = True  # load previously stored synthetic data
save = True      # save results and data

outlier = True
verbose = False

# Default the synthetic dataset size to the generative training size.
nsyn = max_n if nsyn is None else nsyn


In [3]:
# Fetch the real data and the synthetic datasets for the two toy problems and
# print basic information about them.
for dataset in ('moons', 'circles'):
    print('Dataset:', dataset)

    workspace_folder, results_folder = get_folder_names(
        dataset, model_name, max_n=max_n, nsyn=nsyn
    )

    # One synthetic dataset is requested per ensemble member and per run.
    X_gt, X_syns = get_real_and_synthetic(
        dataset=dataset,
        p_train=p_train,
        n_models=num_runs * n_models,
        model_name=model_name,
        load_syn=load_syn,
        verbose=verbose,
        max_n=max_n,
        nsyn=nsyn,
    )

    first_synthetic = X_syns[0]
    print('Shape of each synthetic dataset:', first_synthetic.shape)
    print('Target type:', X_gt.targettype)



Dataset: covid
n_total 20000 n_train: 2000


100%|██████████| 2000/2000 [11:41<00:00,  2.85it/s]
100%|██████████| 2000/2000 [10:56<00:00,  3.05it/s]
100%|██████████| 2000/2000 [10:55<00:00,  3.05it/s]
 65%|██████▍   | 1293/2000 [07:03<04:10,  2.82it/s]

# Experiments

## Model Training

In [4]:
from DGE_experiments import predictive_experiment
import pandas as pd


# Per-dataset result tables, keyed by dataset name.
all_means, all_stds = {}, {}

# Only datasets whose target type matches this value are evaluated.
dataset_type = 'classification'
# NOTE: this replaces the 'ctgan_deep' value from the settings cell for every
# cell that runs after this one.
model_name = 'ctgan'

for dataset in datasets:
    workspace_folder, results_folder = get_folder_names(
        dataset, model_name, max_n=max_n, nsyn=nsyn
    )

    X_gt, X_syns = get_real_and_synthetic(
        dataset=dataset,
        p_train=p_train,
        n_models=n_models * num_runs,
        model_name=model_name,
        load_syn=load_syn,
        verbose=verbose,
        max_n=max_n,
        nsyn=nsyn,
    )

    # Datasets of another target type are skipped entirely.
    if X_gt.targettype == dataset_type:
        print(f'Dataset {dataset}\n')

        means, stds = predictive_experiment(
            X_gt,
            X_syns,
            workspace_folder=workspace_folder,
            results_folder=results_folder,
            save=save,
            load=load,
            plot=True,
            outlier=outlier,
        )

        print(means.to_latex())

        all_means[dataset] = means
        all_stds[dataset] = stds



n_total 10000 n_train: 2000
Dataset moons

\begin{tabular}{lrrrrrrr}
\toprule
{} &     AUC &     Acc &      F1 &  Precision &  Recall &     NLL &   Brier \\
\midrule
DGE\$\_20\$         &  0.9802 &  0.9177 &  0.9183 &     0.9122 &  0.9249 &  0.1979 &  0.0580 \\
DGE\$\_10\$         &  0.9796 &  0.9171 &  0.9176 &     0.9122 &  0.9232 &  0.1997 &  0.0588 \\
DGE\$\_5\$          &  0.9782 &  0.9140 &  0.9143 &     0.9131 &  0.9159 &  0.2053 &  0.0607 \\
Naive (single)   &  0.9783 &  0.9164 &  0.9171 &     0.9115 &  0.9238 &  0.2023 &  0.0601 \\
Naive (ensemble) &  0.9773 &  0.9145 &  0.9149 &     0.9084 &  0.9230 &  0.2060 &  0.0614 \\
Oracle           &  0.9960 &  0.9693 &  0.9691 &     0.9699 &  0.9691 &  0.0848 &  0.0240 \\
\bottomrule
\end{tabular}

n_total 10000 n_train: 2000
Dataset circles

\begin{tabular}{lrrrrrrr}
\toprule
{} &     AUC &     Acc &      F1 &  Precision &  Recall &     NLL &   Brier \\
\midrule
DGE\$\_20\$         &  0.8654 &  0.7588 &  0.7233 &     0.8479 &  0.6311

100%|██████████| 2000/2000 [12:09<00:00,  2.74it/s]
100%|██████████| 2000/2000 [12:43<00:00,  2.62it/s]
100%|██████████| 2000/2000 [12:45<00:00,  2.61it/s]
100%|██████████| 2000/2000 [12:27<00:00,  2.67it/s]
100%|██████████| 2000/2000 [12:41<00:00,  2.63it/s]
 26%|██▌       | 517/2000 [03:16<09:24,  2.63it/s]


KeyboardInterrupt: 

In [None]:
# Aggregate the per-dataset tables (the return value is not used here).
mean_across_pandas(all_means)

means_consolidated = metric_different_datasets(all_means)

# With a single run there is no spread to report, so print the means only.
if num_runs <= 1:
    print(means_consolidated.to_latex())
else:
    stds_consolidated = metric_different_datasets(all_stds)
    table_with_std = add_std(means_consolidated, stds_consolidated)
    print(table_with_std.to_latex())

\begin{tabular}{lrrrrrrr}
\toprule
{} &   AUC &   Acc &    F1 &  Precision &  Recall &   NLL &  Brier \\
\midrule
DGE (k=10)              & 0.908 & 0.836 & 0.788 &      0.823 &   0.761 & 0.364 &  0.115 \\
DGE (k=20)              & 0.910 & 0.836 & 0.786 &      0.826 &   0.756 & 0.363 &  0.115 \\
DGE (k=5)               & 0.908 & 0.833 & 0.787 &      0.819 &   0.762 & 0.367 &  0.116 \\
Naive (ensemble) max    & 0.908 & 0.843 & 0.804 &      0.856 &   0.835 & 0.453 &  0.146 \\
Naive (ensemble) mean   & 0.892 & 0.819 & 0.767 &      0.799 &   0.745 & 0.392 &  0.125 \\
Naive (ensemble) median & 0.897 & 0.822 & 0.773 &      0.803 &   0.748 & 0.384 &  0.123 \\
Naive (ensemble) min    & 0.861 & 0.786 & 0.716 &      0.733 &   0.657 & 0.354 &  0.111 \\
Naive (ensemble) std    & 0.014 & 0.016 & 0.025 &      0.035 &   0.051 & 0.028 &  0.010 \\
Naive (single) max      & 0.908 & 0.843 & 0.804 &      0.855 &   0.839 & 0.456 &  0.146 \\
Naive (single) mean     & 0.892 & 0.819 & 0.767 &      0.798 &   0.

Unnamed: 0,Moons,Circles,Adult Income,SEER,CUTRACT,Mean
DGE (k=20),0.981,0.863,0.893,0.907,0.907,0.9102
DGE (k=10),0.98,0.855,0.892,0.907,0.907,0.9082
DGE (k=5),0.979,0.863,0.887,0.907,0.906,0.9084
Oracle,0.996,0.868,0.889,0.911,0.911,0.915
Naive (single) median,0.9805,0.825,0.8675,0.904,0.904,0.8962
Naive (single) mean,0.97875,0.8069,0.8669,0.9025,0.90245,0.8915
Naive (single) std,0.00816,0.047436,0.006534,0.003058,0.003074,0.013652
Naive (single) min,0.956,0.707,0.848,0.897,0.897,0.861
Naive (single) max,0.988,0.862,0.878,0.906,0.906,0.908
Naive (ensemble) median,0.9805,0.825,0.87,0.904,0.904,0.8967


## Model Evaluation

We compare the single baseline model vs the generative uncertainty model vs an oracle. Workflow:
0. Train and generate synthetic datasets $S_i$.
1. Take each synthetic dataset $S_i$ and split it up in train and test.
2. Train a model $f_i$ on the train set, for each $S_i$
3. Evaluate on the same synthetic dataset's test set $S_{i,test}$, giving $\hat{M}^S_i$ [Single performance]
4. Evaluate on the true real test set (oracle), $D_{test}$, giving $M_i$ [Oracle performance]
5. Evaluate on the other synthetic datasets $\cup_{j\neq i} S_{j}$, giving $\hat{M}^G_i$ [Generative performance]
6. Compute the deviation from the oracle, $||M_i - \hat{M}_i||$ and average over all models $f_i$. 
7. Repeat 1-6 for different model classes $f$

N.B. The idea of the above is that the trained model $f_i$ is the same for each evaluation type. In the model selection section, we will compare the performance of different model classes, where we will train a new model for each evaluation type (hence the aim is to evaluate which class is best, while the model itself may vary).

In [None]:
from DGE_experiments import model_evaluation_experiment


# Per-dataset evaluation tables, keyed by dataset name.
evaluation_means = {}
evaluation_std = {}

# Settings for the model-evaluation study. These assignments replace the
# values set in earlier cells for every cell that runs after this one.
relative = False
max_n = 5000
nsyn = 5000
model_name = 'ctgan_deep'
datasets = ['moons', 'circles', 'adult', 'seer']

for dataset in datasets:
    print('Dataset:', dataset)
    workspace_folder, results_folder = get_folder_names(
        dataset, model_name, max_n=max_n, nsyn=nsyn)

    # load data
    X_gt, X_syns = get_real_and_synthetic(dataset=dataset,
                                          p_train=p_train,
                                          n_models=n_models,
                                          model_name=model_name,
                                          load_syn=load_syn,
                                          verbose=verbose,
                                          max_n=max_n,
                                          nsyn=nsyn)

    # get mean and std of dataset over different runs
    # `save` is now driven by the `save` flag (it was previously passed `load`),
    # matching the other experiment calls in this notebook.
    means, std = model_evaluation_experiment(X_gt, X_syns,
                                             workspace_folder=workspace_folder,
                                             relative=relative,
                                             model_type='deepish_mlp',
                                             load=load,
                                             save=save,
                                             verbose=verbose,
                                             outlier=outlier,
                                             )

    evaluation_means[dataset] = means
    evaluation_std[dataset] = std


Dataset: moons
n_total 10000 n_train: 5000
Dataset: circles
n_total 10000 n_train: 5000
Dataset: adult
n_total 32561 n_train: 5000
Dataset: seer
n_total 20000 n_train: 5000
Dataset: cutract
n_total 20000 n_train: 5000


In [None]:
# Alternative view (mean across datasets), currently disabled:
# mean_across_pandas(evaluation_means)

# Per-dataset table for a single metric.
metric = 'Acc'
res = metric_different_datasets(
    evaluation_means, metric=metric, to_print=False
)
std_df = metric_different_datasets(
    evaluation_std, metric=metric, to_print=False
)

# Drop the 'Mean' column from the std table before merging it into the means.
del std_df['Mean']
res = add_std(res, std_df)

# Five decimals when relative == 'l2', three otherwise.
float_pattern = '%.5f' if relative == 'l2' else '%.3f'
print(res.to_latex(float_format=lambda x: float_pattern % x))

\begin{tabular}{lllllll}
\toprule
{} &          Moons &        Circles &   Adult Income &           SEER &        CUTRACT &   Mean \\
\midrule
Oracle     &   0.775 ± 0.14 &  0.508 ± 0.036 &  0.785 ± 0.015 &  0.711 ± 0.108 &  0.711 ± 0.108 &  0.698 \\
Naive      &  0.892 ± 0.072 &  0.819 ± 0.132 &  0.784 ± 0.028 &  0.877 ± 0.061 &  0.877 ± 0.061 &   0.85 \\
DGE (K=5)  &  0.703 ± 0.132 &   0.518 ± 0.07 &   0.773 ± 0.01 &  0.743 ± 0.129 &  0.743 ± 0.129 &  0.696 \\
DGE (K=10) &  0.744 ± 0.139 &  0.522 ± 0.094 &   0.774 ± 0.01 &  0.772 ± 0.088 &  0.772 ± 0.088 &  0.717 \\
DGE (K=20) &  0.753 ± 0.138 &  0.506 ± 0.045 &   0.775 ± 0.01 &  0.769 ± 0.069 &  0.769 ± 0.069 &  0.714 \\
\bottomrule
\end{tabular}



## Model Selection
Essentially repeat the above for different models

In [None]:
from DGE_experiments import model_selection_experiment

# Settings for the model-selection study (single dataset). These assignments
# replace the values set in earlier cells.
max_n = 2000
nsyn = 2000
model_name = 'ctgan_deep'
dataset = 'seer'

# Load the real data and the synthetic datasets.
X_gt, X_syns = get_real_and_synthetic(
    dataset=dataset,
    p_train=p_train,
    n_models=n_models,
    model_name=model_name,
    load_syn=load_syn,
    verbose=verbose,
    max_n=max_n,
    nsyn=nsyn,
)

target_kind = X_gt.targettype
if target_kind is not None:
    # Metric per task type; `metric` is left untouched for any other type.
    metric_by_task = {'classification': 'AUC', 'regression': 'MAE'}
    if target_kind in metric_by_task:
        metric = metric_by_task[target_kind]

    workspace_folder, results_folder = get_folder_names(
        dataset, model_name, max_n=max_n, nsyn=nsyn
    )

    means_sorted, std = model_selection_experiment(
        X_gt,
        X_syns,
        relative=False,
        workspace_folder=workspace_folder,
        load=load,
        save=save,
    )




n_total 20000 n_train: 2000


In [None]:
metric = 'AUC'

# Persist the sorted means. The context manager closes the file handle
# deterministically, which the previous bare open() call did not.
with open('seer_auc.p', 'wb') as results_file:
    pickle.dump(means_sorted, results_file)

print(means_sorted[metric])
# LaTeX table of the first five rows of the chosen metric, with std attached.
print(add_std(means_sorted[metric].iloc[:5], std[metric]).to_latex())

                 deep_mlp    knn  xgboost     rf    svm     lr    mlp
Oracle              0.860  0.864    0.868  0.873  0.874  0.894  0.897
Naive               0.869  0.876    0.888  0.892  0.893  0.903  0.909
DGE (K=5)           0.840  0.848    0.862  0.866  0.861  0.882  0.885
DGE (K=10)          0.846  0.854    0.866  0.871  0.868  0.885  0.889
DGE (K=20)          0.836  0.844    0.855  0.861  0.857  0.877  0.881
Oracle rank         1.000  2.000    3.000  4.000  5.000  6.000  7.000
Naive rank          1.000  2.000    3.000  4.000  5.000  6.000  7.000
DGE (K=5) rank      1.000  2.000    4.000  5.000  3.000  6.000  7.000
DGE (K=10) rank     1.000  2.000    3.000  5.000  4.000  6.000  7.000
DGE (K=20) rank     1.000  2.000    3.000  5.000  4.000  6.000  7.000
\begin{tabular}{llllllll}
\toprule
{} &       deep\_mlp &            knn &        xgboost &             rf &            svm &             lr &            mlp \\
\midrule
Oracle     &   0.86 ± 0.022 &  0.864 ± 0.018 &   0.868 ± 0.0

## Influence of synthetic data size

Let's study the effect of synthetic data size.

In [None]:
from DGE_experiments import predictive_varying_nsyn

# Reuses X_gt, X_syns, dataset, model_name, nsyn and the folder names left
# behind by the model-selection cell above.
predictive_varying_nsyn(
    X_gt,
    X_syns,
    dataset,
    model_name,
    nsyn,
    results_folder,
    workspace_folder,
    load=load,
    save=save,
    verbose=True,
)


## Density estimation


In [None]:
# from DGE_experiments import density_experiment

# if X_gt.targettype is None:
#    density_experiment(X_gt, X_syns, load, save)


# Cross validation

We compare the single baseline model vs the generative uncertainty model. Single workflow
1. Take each synthetic dataset $S_i$ and split it up in train and test.
2. Train a model $f_i$ on the train set, for each $S_i$
3. Evaluate on the same synthetic dataset's test set $S_{i,test}$
4. Evaluate on the true real test set (oracle), $D_{test}$, giving $\hat{M}_i$
5. Average results across the different synthetic datasets, giving $M_i$.
6. Compute the deviation from the oracle, $||M_i - \hat{M}_i||$ and average.

Versus our baseline
1. Take each synthetic dataset $S_i$ and split it up in train and test
2. Train a model $f_i$ on the train set, for each $S_i$
3. Evaluate on the same synthetic dataset's test set $S_{i,test}$
4. Evaluate on the true real test set (oracle), $D_{test}$, giving $\hat{M}_i$
5. Average results across the different synthetic datasets, giving $M_i$.
6. Compute the deviation from the oracle, $||M_i - \hat{M}_i||$ and average.


Cross-validation approach to test which type of model would perform best on real data. We compare the single baseline model vs the generative uncertainty model vs an oracle. Workflow Cross-validation.
0. Train and generate synthetic datasets $S_i$.
1. Use CV to train and evaluate models $f_i$ on each $S_i$. Repeat for all $S_i$. [Single performance]
2. Use CV *over datasets $S_i$* (i.e. train on all but one $S_i$, evaluate on the remaining one and repeat) to train and evaluate models $f_i$.
3. For both cases, evaluate the model also on the true real test set (oracle), $D_{test}$, giving $M_i$ [Oracle performance]
4. Compute the deviation from the oracle, $||M_i - \hat{M}_i||$ and average over all models $f_i$. 