# Import modules

In [1]:
from sklearn.datasets import load_diabetes
import pickle
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import os
import torch


from synthcity.metrics.eval_performance import (
    PerformanceEvaluatorMLP,
    PerformanceEvaluatorXGB,
)
from synthcity.utils import reproducibility
from synthcity.plugins import Plugins
import synthcity.logger as log
from synthcity.plugins.core.dataloader import GenericDataLoader


# Call synthcity's reproducibility.clear_cache() before anything else runs.
reproducibility.clear_cache()
# Select the GPU when torch reports CUDA as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Not the last expression of the cell, so the returned listing is not displayed.
# NOTE(review): presumably kept for a plugin-loading side effect — confirm.
Plugins(categories=["generic"]).list()

# This notebook insists on a GPU: stop here with an explicit reason instead of
# a bare AssertionError when torch fell back to the CPU.
assert device.type == 'cuda', (
    f"CUDA device required but torch selected '{device.type}'; "
    "check the GPU/driver setup (torch.cuda.is_available() returned False)"
)


  from .autonotebook import tqdm as notebook_tqdm


# Reload

In [5]:
# Re-import the local DGE_* helper modules so that edits made to their source
# files are picked up without restarting the kernel.
import importlib
import DGE_experiments
import DGE_data
import DGE_utils
# NOTE(review): the modules are reloaded in the order written. If
# DGE_experiments does `from DGE_data import ...` / `from DGE_utils import ...`,
# it is reloaded before them and would keep the pre-reload objects — confirm
# the dependency direction and reorder (or use %autoreload) if so.
importlib.reload(DGE_experiments)
importlib.reload(DGE_data)
# Last expression of the cell, so the reloaded module's repr is echoed as output.
importlib.reload(DGE_utils)


<module 'DGE_utils' from '/home/bv292/synthcity/DGE_utils.py'>

# Load data and settings

In [8]:
from DGE_data import get_real_and_synthetic

# --- Experiment configuration ---------------------------------------------
dataset = 'gaussian'    # real data
model_name = 'ctgan'    # generative model used for the synthetic data

nsyn = 100       # synthetic data points per synthetic dataset
p_train = 0.7    # proportion of training data for the generative model
n_models = 20    # number of models in the ensemble

load = True        # load results
load_syn = True    # load synthetic data
save = True        # save results and data
verbose = False

# --- Output locations (one sub-folder per dataset / generative model) ------
workspace_folder = f"workspace/{dataset}/{model_name}"
results_folder = f"uncertainty_results/{dataset}/{model_name}"

# --- Real data plus the collection of synthetic datasets -------------------
X_gt, X_syns = get_real_and_synthetic(
    dataset=dataset,
    p_train=p_train,
    n_models=n_models,
    model_name=model_name,
    load_syn=load_syn,
    verbose=verbose,
)

# Quick sanity check: how many synthetic datasets, and the shape of the first.
print(len(X_syns))
print(X_syns[0].shape)


n_total 40000 n_train: 4000


100%|██████████| 2000/2000 [03:26<00:00,  9.68it/s]
100%|██████████| 2000/2000 [03:27<00:00,  9.66it/s]
100%|██████████| 2000/2000 [03:26<00:00,  9.66it/s]
100%|██████████| 2000/2000 [03:24<00:00,  9.76it/s]
100%|██████████| 2000/2000 [03:25<00:00,  9.71it/s]
100%|██████████| 2000/2000 [03:26<00:00,  9.68it/s]
100%|██████████| 2000/2000 [03:23<00:00,  9.83it/s]
100%|██████████| 2000/2000 [03:22<00:00,  9.86it/s]
100%|██████████| 2000/2000 [03:26<00:00,  9.68it/s]
100%|██████████| 2000/2000 [03:27<00:00,  9.63it/s]
100%|██████████| 2000/2000 [03:26<00:00,  9.69it/s]
100%|██████████| 2000/2000 [03:24<00:00,  9.77it/s]
100%|██████████| 2000/2000 [03:24<00:00,  9.79it/s]
 27%|██▋       | 536/2000 [00:54<02:25, 10.06it/s]

# Analyses

## Model Training

In [7]:
from DGE_experiments import predictive_experiment

# Run the predictive experiment on the real data and the synthetic datasets,
# reusing the folders and the load/save switches defined in the settings cell.
predictive_experiment(
    X_gt,
    X_syns,
    workspace_folder=workspace_folder,
    results_folder=results_folder,
    save=save,
    load=load,
)


ValueError: y_true takes value in {42.0, 47.0, 49.0, 51.0, 52.0, 53.0, 57.0, 59.0, 61.0, 63.0, 64.0, 67.0, 68.0, 71.0, 72.0, 74.0, 75.0, 78.0, 84.0, 85.0, 86.0, 88.0, 89.0, 90.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 101.0, 103.0, 104.0, 107.0, 109.0, 110.0, 113.0, 118.0, 121.0, 122.0, 127.0, 128.0, 129.0, 131.0, 132.0, 135.0, 136.0, 137.0, 138.0, 141.0, 142.0, 144.0, 145.0, 150.0, 151.0, 155.0, 156.0, 158.0, 160.0, 163.0, 167.0, 168.0, 170.0, 171.0, 174.0, 175.0, 178.0, 179.0, 180.0, 181.0, 182.0, 186.0, 191.0, 192.0, 195.0, 196.0, 197.0, 198.0, 200.0, 208.0, 212.0, 214.0, 215.0, 217.0, 219.0, 220.0, 230.0, 232.0, 233.0, 235.0, 237.0, 244.0, 252.0, 257.0, 264.0, 270.0, 272.0, 275.0, 276.0, 281.0, 283.0, 292.0, 297.0, 302.0, 317.0, 321.0} and pos_label is not specified: either make y_true take value in {0, 1} or {-1, 1} or pass pos_label explicitly.

## Influence of synthetic data size

Let's study the effect of synthetic data size.

In [None]:
from DGE_experiments import predictive_varying_nsyn

# All settings are forwarded positionally from the settings cell.
# NOTE: verbose is hard-coded to True here, independent of the notebook-level
# `verbose` flag.
predictive_varying_nsyn(
    X_gt,
    X_syns,
    dataset,
    model_name,
    n_models,
    nsyn,
    results_folder,
    workspace_folder,
    load,
    save,
    verbose=True,
)


## Density estimation


In [None]:
from DGE_experiments import density_experiment

# Runs only when X_gt has no target type set (targettype is None).
if X_gt.targettype is None:
    density_experiment(
        X_gt,
        X_syns,
        load,
        save,
    )


# Model Evaluation

We compare the single baseline model vs the generative uncertainty model vs an oracle. The workflow is as follows:
0. Train and generate synthetic datasets $S_i$.
1. Take each synthetic dataset $S_i$ and split it up in train and test.
2. Train a model $f_i$ on the train set, for each $S_i$
3. Evaluate on the same synthetic dataset's test set $S_{i,test}$, giving $\hat{M}^S_i$ [Single performance]
4. Evaluate on the true real test set (oracle), $D_{test}$, giving $M_i$ [Oracle performance]
5. Evaluate on the other synthetic datasets $\cup_{j\neq i} S_{j}$, giving $\hat{M}^G_i$ [Generative performance]
6. Compute the deviation from the oracle, $||M_i - \hat{M}_i||$ and average over all models $f_i$. 
7. Repeat 1-6 for different model classes $f$

N.B. The idea of the above is that the trained model $f_i$ is the same for each evaluation type. In the model selection section, we will compare the performance of different model classes, where we will train a new model for each evaluation type (hence the aim is to evaluate which class is best, while the model itself may vary).

In [None]:
from DGE_experiments import model_evaluation_experiment

# Runs only when X_gt has a target type set (targettype is not None).
if X_gt.targettype is not None:
    print('Aggregate approaches:')
    # Keep only the first element of what the experiment returns and print it
    # as plain text.
    res = model_evaluation_experiment(
        X_gt,
        X_syns,
        'mlp',
    )[0]
    print(res)


Aggregate approaches:
Train model 1/20
Train model 2/20
Train model 3/20
Train model 4/20
Train model 5/20
Train model 6/20
Train model 7/20
Train model 8/20
Train model 9/20
Train model 10/20
Train model 11/20
Train model 12/20
Train model 13/20
Train model 14/20
Train model 15/20
Train model 16/20
Train model 17/20
Train model 18/20
Train model 19/20
Train model 20/20
                    r2                    mse               mae
oracle  -54.73 ± 18.88  869659.81 ± 106089.23  7269.96 ± 489.24
single  -42.79 ± 20.74  838280.18 ± 193193.89  6868.49 ± 813.91
us      -41.49 ± 17.64  854310.61 ± 107100.52  6956.84 ± 482.97


# Model Selection
Essentially, repeat the above for different models.

In [None]:
from DGE_experiments import model_selection_experiment

# Runs only when X_gt has a target type set (targettype is not None).
# NOTE(review): unlike the other experiment calls in this notebook, X_gt and
# X_syns are not passed here — confirm this matches the function's signature.
# NOTE(review): the saved output of this cell ends in KeyError: 'accuracy';
# check that the 'accuracy' metric exists for the target type being used.
if X_gt.targettype is not None:
    model_selection_experiment(
        model_type='mlp',
        relative='l1',
        metric='accuracy',
    )


Train model 1/20
Train model 2/20
Train model 3/20
Train model 4/20
Train model 5/20
Train model 6/20
Train model 7/20
Train model 8/20
Train model 9/20
Train model 10/20
Train model 11/20
Train model 12/20
Train model 13/20
Train model 14/20
Train model 15/20
Train model 16/20
Train model 17/20
Train model 18/20
Train model 19/20
Train model 20/20


KeyError: 'accuracy'

# Cross validation

We compare the single baseline model vs the generative uncertainty model. The single workflow is:
1. Take each synthetic dataset $S_i$ and split it up in train and test.
2. Train a model $f_i$ on the train set, for each $S_i$
3. Evaluate on the same synthetic dataset's test set $S_{i,test}$
4. Evaluate on the true real test set (oracle), $D_{test}$, giving $\hat{M}_i$
5. Average results across the different synthetic datasets, giving $M_i$.
6. Compute the deviation from the oracle, $||M_i - \hat{M}_i||$ and average.

Versus our baseline
1. Take each synthetic dataset $S_i$ and split it up in train and test
2. Train a model $f_i$ on the train set, for each $S_i$
3. Evaluate on the same synthetic dataset's test set $S_{i,test}$
4. Evaluate on the true real test set (oracle), $D_{test}$, giving $\hat{M}_i$
5. Average results across the different synthetic datasets, giving $M_i$.
6. Compute the deviation from the oracle, $||M_i - \hat{M}_i||$ and average.


Cross-validation approach to test which type of model would perform best on real data. We compare the single baseline model vs the generative uncertainty model vs an oracle. The cross-validation workflow is:
0. Train and generate synthetic datasets $S_i$.
1. Use CV to train and evaluate models $f_i$ on each $S_i$. Repeat for all $S_i$. [Single performance]
2. Use CV *over datasets $S_i$* (i.e. train on all but one $S_i$, evaluate on the remaining one, and repeat) to train and evaluate models $f_i$.
3. For both cases, evaluate the model also on the true real test set (oracle), $D_{test}$, giving $M_i$ [Oracle performance]
4. Compute the deviation from the oracle, $||M_i - \hat{M}_i||$ and average over all models $f_i$. 

# Model parameter estimation
Using a linear model

# Calibration

In [None]:
from sklearn.calibration import calibration_curve


#prob_true, prob_pred = calibration_curve(y_true, y_pred, n_bins=3)
