# Import modules

In [100]:
from sklearn.datasets import load_diabetes
import pickle
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import os
import torch


from synthcity.metrics.eval_performance import (
    PerformanceEvaluatorMLP,
    PerformanceEvaluatorXGB,
)
from synthcity.utils import reproducibility
from synthcity.plugins import Plugins
import synthcity.logger as log
from synthcity.plugins.core.dataloader import GenericDataLoader


# Call synthcity's cache-clearing helper before running any experiment.
reproducibility.clear_cache()

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Query the plugins of the "generic" category. The returned value is not
# displayed because this is not the last expression of the cell.
Plugins(categories=["generic"]).list()

# This notebook insists on a GPU: fail early and say why, instead of raising
# an AssertionError without any message.
assert device.type == 'cuda', (
    f"A CUDA device is required, but torch selected '{device.type}' "
    "(torch.cuda.is_available() returned False)."
)


# Reload

In [101]:
import importlib
import DGE_experiments
import DGE_data
import DGE_utils

# Re-execute the local DGE modules so that edits to their source files are
# picked up without restarting the kernel.
# The loop keeps the original reload order. Unlike a bare trailing
# `importlib.reload(...)` expression, it does not echo the module repr (which
# contains the absolute path of the module file) into the cell output.
# NOTE(review): names bound elsewhere via `from X import name` are not
# refreshed by reloading X -- confirm that this order matches the import
# dependencies between the three modules.
for _module in (DGE_experiments, DGE_data, DGE_utils):
    importlib.reload(_module)


<module 'DGE_utils' from '/home/bv292/synthcity/DGE_utils.py'>

# Load data and settings

In [108]:
from DGE_data import get_real_and_synthetic

# ---- What to run ----------------------------------------------------------
dataset = ['moons', 'cal_housing', 'seer'][1]  # real data (the index selects one of the listed names)
model_name = 'ctgan'  # synthetic data model

# ---- Ensemble and data-size settings ----------------------------------------
n_models = 20  # number of models in ensemble
p_train = None  # proportion of training data for generative model. Default values if None
# Number of synthetic data points per synthetic dataset.
# NOTE(review): this setting is not passed to get_real_and_synthetic below and
# is overwritten with len(X_syns[0]) at the end of this cell.
nsyn = None

# ---- Caching / logging switches ---------------------------------------------
load = True  # results (forwarded as `load` to the experiment cells)
load_syn = True  # data (forwarded as `load_syn` to get_real_and_synthetic)
save = True  # save results and data
verbose = False

# ---- Folders, keyed by dataset and generative model --------------------------
results_folder = os.path.join("uncertainty_results", dataset, model_name)
workspace_folder = os.path.join("workspace", dataset, model_name)

# Fetch the real data together with the collection of synthetic datasets.
X_gt, X_syns = get_real_and_synthetic(
    dataset=dataset,
    model_name=model_name,
    n_models=n_models,
    p_train=p_train,
    load_syn=load_syn,
    verbose=verbose,
)

# From here on, nsyn holds the length of the first synthetic dataset.
nsyn = len(X_syns[0])
print('Shape of each synthetic dataset:', X_syns[0].shape)


n_total 20640 n_train: 4128
Shape of each synthetic dataset (4128, 9)


# Analyses

## Model Training

In [103]:
from DGE_experiments import predictive_experiment

# Run the predictive experiment on the real data and the synthetic datasets.
# `load` / `save` and the two folders are the settings chosen in the
# configuration cell.
# NOTE(review): in the saved output of this cell the 'DGE (k=20)' row is
# identical to 'Naive (single)', and 'Naive (concat)' is identical to
# 'Oracle' -- verify the row labels and any cached results.
y_preds, scores = predictive_experiment(
    X_gt,
    X_syns,
    load=load,
    save=save,
    results_folder=results_folder,
    workspace_folder=workspace_folder,
)

# Last expression of the cell: shown as the cell output.
scores


Unnamed: 0,r2,mse,mae
DGE (k=20),0.343863,0.879087,0.682246
DGE (k=10),0.20278,1.06811,0.694459
DGE (k=5),0.375684,0.836455,0.678394
Naive (single),0.343863,0.879087,0.682246
Naive (concat),-0.185865,1.588814,0.688961
Oracle,-0.185865,1.588814,0.688961


# Model Evaluation

We compare the single baseline model vs the generative uncertainty model vs an oracle. Workflow:

0. Train and generate synthetic datasets $S_i$.
1. Take each synthetic dataset $S_i$ and split it up in train and test.
2. Train a model $f_i$ on the train set, for each $S_i$
3. Evaluate on the same synthetic dataset's test set $S_{i,test}$, giving $\hat{M}^S_i$ [Single performance]
4. Evaluate on the true real test set (oracle), $D_{test}$, giving $M_i$ [Oracle performance]
5. Evaluate on the other synthetic datasets $\cup_{j\neq i} S_{j}$, giving $\hat{M}^G_i$ [Generative performance]
6. Compute the deviation from the oracle, $||M_i - \hat{M}_i||$ with $\hat{M}_i \in \{\hat{M}^S_i, \hat{M}^G_i\}$, and average over all models $f_i$.
7. Repeat 1-6 for different model classes $f$

N.B. The idea of the above is that the trained model $f_i$ is the same for each evaluation type. In the model selection section, we will compare the performance of different model classes, where we will train a new model for each evaluation type (hence the aim is to evaluate which class is best, while the model itself may vary).

In [104]:
from DGE_experiments import model_evaluation_experiment

# Skip the evaluation when the dataset has no target type.
if X_gt.targettype is not None:
    print('Aggregate approaches:')
    # Only the first element of the experiment's return value is kept and printed.
    res = model_evaluation_experiment(
        X_gt,
        X_syns,
        model_type='mlp',
        workspace_folder=workspace_folder,
    )[0]
    print(res)


Aggregate approaches:
                     r2              mse           mae
oracle  -85.09 ± 139.01  247.98 ± 186.25  73.83 ± 8.31
naive      21.42 ± 16.9    95.61 ± 20.56  73.42 ± 9.05
dge       22.15 ± 18.12     95.0 ± 22.11  74.34 ± 9.16


# Model Selection
Essentially, repeat the above for different model classes.

In [105]:
from DGE_experiments import model_selection_experiment

# Skip model selection when the dataset has no target type.
if X_gt.targettype is not None:
    # Choose the ranking metric from the kind of prediction task.
    if X_gt.targettype == 'classification':
        metric = 'accuracy'
    elif X_gt.targettype == 'regression':
        metric = 'mse'
    else:
        # Without this branch, `metric` would be unbound (NameError) or, worse,
        # silently reuse a value left in the kernel by an earlier run.
        raise ValueError(
            f"Unsupported target type {X_gt.targettype!r}; "
            "expected 'classification' or 'regression'."
        )

    results, means_sorted = model_selection_experiment(
        X_gt,
        X_syns,
        relative=False,
        metric=metric,
        workspace_folder=workspace_folder,
        load=load,
        save=save,
    )


                  rf      xgboost           svm           knn            lr  \
oracle  66.29 ± 0.68  81.08 ± 0.0  145.73 ± 0.0  148.43 ± 0.0  200.72 ± 0.0   
naive   14.94 ± 0.93  10.45 ± 0.0   17.51 ± 0.0   22.32 ± 0.0  118.22 ± 0.0   
dge     41.05 ± 0.72  56.68 ± 0.0   19.43 ± 0.0   53.38 ± 0.0   118.5 ± 0.0   

               deep_mlp              mlp  
oracle  208.78 ± 331.57  247.98 ± 186.25  
naive    50.33 ± 102.49  152.91 ± 184.23  
dge      55.46 ± 116.19  153.43 ± 184.49  
                rf  xgboost     svm     knn      lr  deep_mlp     mlp
oracle       66.29    81.08  145.73  148.43  200.72    208.78  247.98
naive        14.94    10.45   17.51   22.32  118.22     50.33  152.91
DGE          41.05    56.68   19.43   53.38  118.50     55.46  153.43
oracle rank   1.00     2.00    3.00    4.00    5.00      6.00    7.00
naive rank    2.00     1.00    3.00    4.00    6.00      5.00    7.00
DGE rank      2.00     5.00    1.00    3.00    6.00      4.00    7.00


## Influence of synthetic data size

Let's study the effect of synthetic data size.

In [106]:
from DGE_experiments import predictive_varying_nsyn

# Arguments are passed positionally in the order the original call used;
# keep that order, since the parameter names are defined in DGE_experiments.
# NOTE(review): verbose is hard-coded to True here and ignores the `verbose`
# setting from the configuration cell -- confirm this is intended.
predictive_varying_nsyn(
    X_gt,
    X_syns,
    dataset,
    model_name,
    n_models,
    nsyn,
    results_folder,
    workspace_folder,
    load,
    save,
    verbose=True,
)


## Density estimation


In [107]:
from DGE_experiments import density_experiment

# Runs only when the dataset has no target type, i.e. the opposite condition
# to the evaluation and selection cells.
if X_gt.targettype is None:
    density_experiment(
        X_gt,
        X_syns,
        load,
        save,
    )


# Model parameter estimation
Using a linear model

# Cross validation

We compare the single baseline model vs the generative uncertainty model. Single workflow:
1. Take each synthetic dataset $S_i$ and split it up in train and test.
2. Train a model $f_i$ on the train set, for each $S_i$
3. Evaluate on the same synthetic dataset's test set $S_{i,test}$
4. Evaluate on the true real test set (oracle), $D_{test}$, giving $\hat{M}_i$
5. Average results across the different synthetic datasets, giving $M_i$.
6. Compute the deviation from the oracle, $||M_i - \hat{M}_i||$ and average.

Versus our baseline
1. Take each synthetic dataset $S_i$ and split it up in train and test
2. Train a model $f_i$ on the train set, for each $S_i$
3. Evaluate on the same synthetic dataset's test set $S_{i,test}$
4. Evaluate on the true real test set (oracle), $D_{test}$, giving $\hat{M}_i$
5. Average results across the different synthetic datasets, giving $M_i$.
6. Compute the deviation from the oracle, $||M_i - \hat{M}_i||$ and average.


Cross-validation approach to test which type of model would perform best on real data. We compare the single baseline model vs the generative uncertainty model vs an oracle. Cross-validation workflow:

0. Train and generate synthetic datasets $S_i$.
1. Use CV to train and evaluate models $f_i$ on each $S_i$. Repeat for all $S_i$. [Single performance]
2. Use CV *over datasets $S_i$* (i.e. train on all but one $S_i$, evaluate on the remaining one, and repeat) to train and evaluate models $f_i$.
3. For both cases, evaluate the model also on the true real test set (oracle), $D_{test}$, giving $M_i$ [Oracle performance]
4. Compute the deviation from the oracle, $||M_i - \hat{M}_i||$ and average over all models $f_i$. 