In [1]:
from sklearn.datasets import load_diabetes
import pickle
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import os
import torch


from synthcity.metrics.eval_performance import (
    PerformanceEvaluatorMLP,
    PerformanceEvaluatorXGB,
)
from synthcity.utils import reproducibility
from synthcity.plugins import Plugins
import synthcity.logger as log
from synthcity.plugins.core.dataloader import GenericDataLoader
from DGE_utils import metric_different_datasets, mean_across_pandas, add_std, get_folder_names

# Clear the cache through synthcity's reproducibility helper before running anything.
reproducibility.clear_cache()

# Prefer CUDA when torch reports it; the assertion below makes the notebook
# fail fast on machines where only the CPU is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
assert device.type == "cuda"

# List the plugin names in the "generic" category (shown as the cell output).
Plugins(categories=["generic"]).list()


  from .autonotebook import tqdm as notebook_tqdm


['sdv_tvae',
 'privbayes',
 'gaussian_copula',
 'copulagan',
 'ctgan',
 'pategan',
 'nflow',
 'sdv_ctgan',
 'bayesian_network',
 'adsgan',
 'rtvae',
 'dpgan',
 'tvae']

In [2]:
from DGE_data import get_real_and_synthetic

# Only classification datasets are considered here.
datasets = [
    "moons", "circles", "breast_cancer",
    "adult", "seer", "covid",
]
# Alternative selection, currently disabled:
# ['moons', 'circles','cal_housing', 'adult', 'diabetes', 'breast_cancer',  'seer', 'cutract' ]

# Generative model used to produce the synthetic data.
model_name = "ctgan"

# Proportion of the data used to train the generative model (default values if None).
p_train = 0.8
# Number of models in the ensemble, for each run.
n_models = 20
# Maximum number of data points used to train the generative model.
max_n = 2000
# Number of synthetic data points per synthetic dataset (None -> fall back to max_n, see below).
nsyn = 2000

# Number of runs. Don't choose this too large: the total number of synthetic
# datasets is num_runs * n_models.
num_runs = 10

# Caching switches.
load = True      # load previously stored results
load_syn = True  # load previously stored synthetic data
save = True      # save results and data

outlier = False

verbose = False

# Default the synthetic dataset size to the generative training size.
nsyn = max_n if nsyn is None else nsyn



In [5]:
from DGE_utils import get_folder_names, tt_predict_performance, cat_dl
from DGE_data import get_real_and_synthetic

import pandas as pd
from sklearn.model_selection import KFold


def full_exp(X_gt, X_syns, workspace_folder=None, results_folder=None,
             save=True, load=True, task_type='mlp',
             cross_fold=5, outlier=False, verbose=False):
    """Cross-validate downstream models trained on oracle, naive and DGE data.

    The synthetic datasets are consumed in consecutive groups of 20 (one group
    per run). For every run and every approach the training data is split
    with ``KFold``; a model is fitted on the training folds (or loaded from a
    cached pickle) and scored both on the held-out fold and on the real test
    set ``X_gt.test()``.

    Args:
        X_gt (GenericDataLoader): Real data. ``X_gt.train()`` is the oracle
            training set, ``X_gt.test()`` the real test set, and
            ``X_gt.targettype`` must be 'regression' or 'classification'.
        X_syns (List[GenericDataLoader]): Synthetic datasets; at least 20 are
            needed, and ``len(X_syns) // 20`` runs are evaluated.
        workspace_folder (str, optional): Folder holding the per-split model
            pickles. Required whenever ``save`` or ``load`` is True.
        results_folder (str, optional): Must be given when ``save`` is True
            (it is only validated, not written to, by this function).
        save (bool, optional): Pickle each fitted model if its cache file does
            not exist yet. Defaults to True.
        load (bool, optional): Reuse a cached model when its pickle exists.
            Defaults to True.
        task_type (str, optional): Forwarded to ``tt_predict_performance`` as
            ``model_type`` and embedded in the cache filename.
        cross_fold (int, optional): Number of KFold splits. Defaults to 5.
        outlier (callable or False, optional): If callable, it is applied to
            the real test set to select the subset used for evaluation. Any
            other truthy value is rejected.
        verbose (bool, optional): Print per-split progress.

    Returns:
        tuple: ``(scores_s_mean, scores_r_mean)`` - scores on the held-out
        folds and on the real test set, each averaged over the splits and
        grouped by ``['run', 'approach']``.

    Raises:
        ValueError: On missing folders, an unsupported ``outlier`` or target
            type, too few synthetic datasets, or an unknown approach.
    """

    if save and results_folder is None:
        raise ValueError('results_folder must be specified when save=True.')
    # The model cache lives in workspace_folder; without this check a missing
    # folder only surfaces later as a TypeError inside os.path.join.
    if (save or load) and workspace_folder is None:
        raise ValueError(
            'workspace_folder must be specified when save=True or load=True.')

    X_test_r = X_gt.test()

    if callable(outlier):
        print('Using subset for evaluation')
        X_test_r = outlier(X_test_r)
    elif outlier:
        raise ValueError('outlier boolean is no longer supported')

    X_test_r.targettype = X_gt.targettype

    if X_gt.targettype not in ('regression', 'classification'):
        raise ValueError('X_gt.targettype must be regression or classification.')

    # DGE ensemble size K; X_syns is consumed in groups of this size.
    n_models = 20
    num_runs = len(X_syns) // n_models
    if num_runs < 1:
        raise ValueError(
            f'Need at least {n_models} synthetic datasets, got {len(X_syns)}.')

    if num_runs > 1 and verbose:
        print('Computing means and stds')

    # The keys double as result labels and as part of the cache filename.
    keys = ['Oracle', 'Naive', 'DGE$_{20}$', 'DGE$_{20}$ (concat)']

    # Oracle: train on the real training data.
    X_oracle = X_gt.train()

    scores_r_all = []
    scores_s_all = []

    for run in range(num_runs):

        run_label = f'run_{run}'
        starting_dataset = run * n_models
        run_syns = list(X_syns[starting_dataset:starting_dataset + n_models])
        scores_s = {}
        scores_r = {}

        for approach in keys:
            kf = KFold(n_splits=cross_fold, shuffle=True, random_state=0)
            print(approach)
            if 'oracle' in approach.lower():
                X_syn_run = X_oracle
            elif approach == 'Naive':
                # NOTE(review): indexed by `run`, not `starting_dataset`, so for
                # run > 0 this dataset is not part of the run's own group of
                # 20 - confirm this is intended.
                X_syn_run = X_syns[run]
            elif approach.startswith('DGE') and 'concat' not in approach:
                # Folds are formed over whole synthetic datasets.
                X_syn_run = run_syns
            elif approach == 'DGE$_{20}$ (concat)':
                # Folds are formed over the rows of the pooled data.
                # ignore_index gives the pooled frame unique row labels.
                X_syn_run = pd.concat(
                    [X_syn.dataframe() for X_syn in run_syns],
                    axis=0, ignore_index=True)
            else:
                raise ValueError(f'Unknown approach {approach}')

            is_ensemble = isinstance(X_syn_run, list)
            if not is_ensemble and not isinstance(X_syn_run, pd.DataFrame):
                X_syn_run = X_syn_run.dataframe()

            fold_scores_s = []
            fold_scores_r = []
            for i, (train_index, test_index) in enumerate(kf.split(X_syn_run)):

                if verbose:
                    print('Run', run, 'approach', approach, 'split', i)

                if is_ensemble:
                    X_train = cat_dl([X_syn_run[j] for j in train_index])
                    X_test_s = cat_dl([X_syn_run[j] for j in test_index])
                else:
                    # KFold yields positional indices, hence iloc (not loc).
                    X_train = GenericDataLoader(
                        X_syn_run.iloc[train_index], target_column="target")
                    X_test_s = GenericDataLoader(
                        X_syn_run.iloc[test_index], target_column="target")

                X_test_s.targettype = X_syns[0].targettype
                X_train.targettype = X_syns[0].targettype

                # Caching is only possible when a workspace folder was given
                # (guaranteed above whenever save or load is True).
                filename = None
                if workspace_folder is not None:
                    filename = os.path.join(
                        workspace_folder,
                        f'cross_validation_{task_type}_{approach}_{run_label}_split_{i}.pkl')

                model = None
                if load and filename is not None and os.path.exists(filename):
                    # SECURITY: pickle.load executes arbitrary code; only
                    # point workspace_folder at files you created yourself.
                    with open(filename, 'rb') as f:
                        model = pickle.load(f)

                # The model returned by the first call is reused to score the
                # real test set.
                score_s, model = tt_predict_performance(
                    X_test_s, X_train, model=model, model_type=task_type, subset=None, verbose=False)
                score_r, _ = tt_predict_performance(
                    X_test_r, X_train, model=model, model_type=task_type, subset=None, verbose=False)

                for score in (score_s, score_r):
                    score['run'] = run
                    score['split'] = i
                    score['approach'] = approach

                fold_scores_s.append(score_s)
                fold_scores_r.append(score_r)

                if save and filename is not None and not os.path.exists(filename):
                    with open(filename, 'wb') as f:
                        pickle.dump(model, f)

            scores_s[approach] = pd.concat(fold_scores_s, axis=0)
            scores_r[approach] = pd.concat(fold_scores_r, axis=0)

        scores_s_all.append(pd.concat(scores_s))
        scores_r_all.append(pd.concat(scores_r))

    scores_s_all = pd.concat(scores_s_all, axis=0)
    scores_r_all = pd.concat(scores_r_all, axis=0)

    scores_s_mean = scores_s_all.groupby(['run', 'approach']).mean()
    scores_r_mean = scores_r_all.groupby(['run', 'approach']).mean()

    return scores_s_mean, scores_r_mean


In [6]:
# Settings for this sweep.
num_runs = 10                # runs per dataset
model_type = 'deepish_mlp'   # passed to full_exp as task_type
model_name = 'ctgan_deep'    # generative model name
nsyn = 5000
max_n = 5000
p_train = 0.8
n_models = 20                # synthetic datasets per run
cross_fold = 5               # number of cross-validation splits
load_syn = True
load = True
save = True
verbose = True

# Per-dataset results: scores on the synthetic hold-out folds (s) and on the
# real test set (r), as returned by full_exp.
scores_s_all = {}
scores_r_all = {}


for dataset in ('moons', 'circles', 'breast_cancer', 'adult', 'covid', 'seer'):
    workspace_folder, results_folder = get_folder_names(
        dataset,
        model_name,
        max_n=max_n,
        nsyn=nsyn,
    )

    # n_models synthetic datasets are requested for each of the num_runs runs.
    X_gt, X_syns = get_real_and_synthetic(
        dataset=dataset,
        p_train=p_train,
        n_models=n_models * num_runs,
        model_name=model_name,
        load_syn=load_syn,
        verbose=verbose,
        max_n=max_n,
        nsyn=nsyn,
    )

    print(f'Dataset {dataset}\n')

    scores_s, scores_r = full_exp(
        X_gt,
        X_syns,
        workspace_folder=workspace_folder,
        results_folder=results_folder,
        save=save,
        load=load,
        task_type=model_type,
        cross_fold=cross_fold,
        verbose=verbose,
    )

    scores_s_all[dataset] = scores_s
    scores_r_all[dataset] = scores_r


n_total 10000 n_train: 5000
Dataset moons

Computing means and stds
Oracle
Run 0 approach Oracle split 0
Run 0 approach Oracle split 1
Run 0 approach Oracle split 2
Run 0 approach Oracle split 3
Run 0 approach Oracle split 4
Naive
Run 0 approach Naive split 0
Run 0 approach Naive split 1
Run 0 approach Naive split 2
Run 0 approach Naive split 3
Run 0 approach Naive split 4
DGE$_{20]$
Run 0 approach DGE$_{20]$ split 0
Run 0 approach DGE$_{20]$ split 1
Run 0 approach DGE$_{20]$ split 2
Run 0 approach DGE$_{20]$ split 3
Run 0 approach DGE$_{20]$ split 4
DGE$_{20}$ (concat)
Run 0 approach DGE$_{20}$ (concat) split 0
Run 0 approach DGE$_{20}$ (concat) split 1
Run 0 approach DGE$_{20}$ (concat) split 2
Run 0 approach DGE$_{20}$ (concat) split 3
Run 0 approach DGE$_{20}$ (concat) split 4
Oracle
Run 1 approach Oracle split 0
Run 1 approach Oracle split 1
Run 1 approach Oracle split 2
Run 1 approach Oracle split 3
Run 1 approach Oracle split 4
Naive
Run 1 approach Naive split 0
Run 1 approach N

In [None]:
# For each dataset, print the scores averaged per approach: first the
# synthetic hold-out results (scores_s_all), then the real-test results
# (scores_r_all).
for dataset in datasets:
    print(dataset)
    for score_table in (scores_s_all, scores_r_all):
        print(score_table[dataset].groupby('approach').mean())

                              AUC      Acc        F1  Precision    Recall  \
run approach                                                                
0   DGE$_{20}$           0.915388  0.84555  0.754098   0.770402  0.757814   
    DGE$_{20}$ (concat)  0.912962  0.84878  0.759555   0.769754  0.762048   

                              NLL     Brier  split  
run approach                                        
0   DGE$_{20}$           0.355513  0.108364    2.0  
    DGE$_{20}$ (concat)  0.351100  0.107674    2.0  
                              AUC       Acc        F1  Precision    Recall  \
run approach                                                                 
0   DGE$_{20}$           0.900519  0.814933  0.804671   0.852127  0.763093   
    DGE$_{20}$ (concat)  0.898077  0.814333  0.804340   0.850453  0.763307   

                              NLL     Brier  split  
run approach                                        
0   DGE$_{20}$           0.425074  0.131404    2.0  
    DGE