# Import modules

In [1]:
from sklearn.datasets import load_diabetes
import pickle
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import os


from synthcity.metrics.eval_performance import (
    PerformanceEvaluatorMLP,
    PerformanceEvaluatorXGB,
)
from synthcity.utils import reproducibility
from synthcity.plugins import Plugins
import synthcity.logger as log
from synthcity.plugins.core.dataloader import GenericDataLoader

from bnaf.toy2d import main as bnaf



Plugins(categories=["generic"]).list()

  from .autonotebook import tqdm as notebook_tqdm


['copulagan',
 'tvae',
 'privbayes',
 'pategan',
 'rtvae',
 'bayesian_network',
 'dpgan',
 'adsgan',
 'gaussian_copula',
 'ctgan',
 'sdv_tvae',
 'nflow',
 'sdv_ctgan']

# Load data and settings

In [2]:
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_boston
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_wine
from sklearn.datasets import load_digits
from sklearn.datasets import make_moons
from bnaf.data.generate2d import sample2d

dataset = 'checkerboard'
p_train = 0.5
nsyn = 5000

if dataset=='diabetes':
    X, y = load_diabetes(return_X_y=True, as_frame=True)
elif dataset=='iris':
    X, y = load_iris(return_X_y=True, as_frame=True)
elif dataset=='boston':
    X, y = load_boston(return_X_y=True, as_frame=True)
elif dataset=='breast_cancer':
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
elif dataset=='wine':
    X, y = load_wine(return_X_y=True, as_frame=True)
elif dataset=='digits':
    X, y = load_digits(return_X_y=True, as_frame=True)
elif dataset=='moons':
    X, y = make_moons(n_samples=2000, noise=0.2, random_state=0)
    X = pd.DataFrame(X)
    p_train = 0.1
elif dataset in ["8gaussians", "2spirals", "checkerboard", "t1", "t2", "t3", "t4"]:
    X = sample2d(dataset, 20000)
    X = pd.DataFrame(X)
    y = -np.ones(X.shape[0])
    p_train = 0.1
else:
    raise ValueError('Unknown dataset')

X["target"] = y
X_gt = GenericDataLoader(X, target_column="target", train_size=p_train)

if len(np.unique(y))==1:
    X_gt.targettype = None
elif len(np.unique(y))<=10:
    X_gt.targettype = 'classification'
else:
    X_gt.targettype = 'regression'

X_train, X_test = X_gt.train(), X_gt.test()

n_train = X_train.shape[0]
results_folder = "uncertainty_results/"+dataset
n_models = 20
load = True
print(n_train)



2000


# Generate data

In [3]:
for i in range(n_models):
    os.makedirs(results_folder, exist_ok=True)
    filename = f"{results_folder}/Xsyn_n{n_train}_seed{i}.pkl"
    # filename2 = f"{results_folder}/X_syn_{i}.pkl"
    # if os.path.exists(filename2):
    #     os.rename(filename2, filename)    
    if os.path.exists(filename) and load:
        continue
    print(f"Training model {i+1}/{n_models}")

    syn_model = Plugins().get('tvae')
    reproducibility.enable_reproducible_results(seed=i)
    syn_model.fit(X_train)
    X_syn = syn_model.generate(count = nsyn)
    # save X_syn to disk as pickle
    pickle.dump(X_syn, open(filename, "wb"))





# Load data

In [4]:
X_syns = []
for i in range(n_models):
    # load X_syn from disk as pickle
    X_syn = pickle.load(open(f"{results_folder}/Xsyn_n{n_train}_seed{i}.pkl", "rb"))
    X_syns.append(X_syn)

# Analyses
## Some function definitions

In [5]:

def prediction_task(X_gt, X_syn, model=None):
    if type(model)==str or model is None:
        if model=='mlp' or model is None:
            if X_gt.targettype == 'classification':
                model = sklearn.neural_network.MLPClassifier()
            else:
                model = sklearn.neural_network.MLPRegressor()
        elif model=='rf':
            if X_gt.targettype == 'classification':
                model = sklearn.ensemble.RandomForestClassifier()
            else:
                model = sklearn.ensemble.RandomForestRegressor()
        X, y = X_syn.unpack(as_numpy=True)
        model.fit(X, y.reshape(-1,1))
    pred = model.predict(X_gt.unpack(as_numpy=True)[0])
    return pred, model


def meanstd(A):
    return np.mean(A, axis=0), np.std(A, axis=0)


def density_estimation(X_gt, X_syn, model=None):
    model = bnaf(X_syn.unpack(as_numpy=True)[0])
    return model(X_gt.unpack(as_numpy=True)[0]), model

def aggregate(X_gt, X_syns, task, models=None):
    results = []
    trained_models = []
    for i in range(len(X_syns)):
        if models is None:
            model = None
        else:
            model = models[i]
        res, model = task(X_gt, X_syns[i], model)
        results.append(res)
        trained_models.append(model)
    return *meanstd(results), trained_models

def tsne(X):
    from sklearn.manifold import TSNE
    tsne = TSNE(n_components=2, random_state=0)
    X_2d = tsne.fit_transform(X)
    return X_2d

def aggregate_imshow(X_gt, X_syns, task, models=None):
    xmin, ymin = np.min(X_gt.unpack(as_numpy=True)[0], axis=0)*1.5
    xmax, ymax = np.max(X_gt.unpack(as_numpy=True)[0], axis=0)*1.5
    
    steps = 400
    X_grid = np.linspace(xmin, xmax, steps)
    Y_grid = np.linspace(ymin, ymax, steps)

    X_grid, Y_grid = np.meshgrid(X_grid, Y_grid)
    X_grid = pd.DataFrame(np.c_[X_grid.ravel(), Y_grid.ravel()])
    X_grid['target'] = -1
    X_grid = GenericDataLoader(X_grid, target_column="target", train_size=0.01)

    y_pred_mean, y_pred_std, models = aggregate(X_grid, X_syns, task, models)

    for y, name in zip((y_pred_mean, y_pred_std), ('mean', 'std')):
        plt.figure(figsize=(8,6), dpi=100)
        plt.imshow(y.reshape(steps,steps)[::-1], cmap='viridis', extent=[xmin, xmax, ymin, ymax])
        X_train, y_train = X_gt.train().unpack(as_numpy=True)
        if len(np.unique(y_train))==2:
            y_train = y_train.astype(bool)        
            plt.scatter(X_train[y_train,0], X_train[y_train,1], c='k', marker='.')
            plt.scatter(X_train[~y_train,0], X_train[~y_train,1], c='w', marker='.')
        plt.colorbar()
        plt.show()



## Predictions across models

In [6]:
if X_gt.targettype is not None:
    X_test = X_gt.test()
    X_test.targettype = X_gt.targettype
    y_pred_mean, y_pred_std, models = aggregate(X_test, X_syns, prediction_task, models=['mlp']*len(X_syns))

    print('Predictions on test set:')
    X, y = X_gt.test().unpack(as_numpy=True)
    if X.shape[1]>2:
        Xplot = tsne(X)
    else:
        Xplot = X
    for y in [y_pred_mean, y_pred_std]:
        plt.scatter(Xplot[:,0], Xplot[:,1], c=y, cmap='viridis')
        plt.colorbar()
        plt.show()
    
    print('Predictions on full space:')
    aggregate_imshow(X_gt, X_syns, prediction_task, models)


## Density estimation
We ignore the target and regard density estimation as the downstream task. We use the flow-based BNAF from Cao as DE method.

In [9]:
from bnaf.toy2d import parse
x = parse()
print(x)

Namespace(batch_dim=200, clip_norm=0.1, dataset='8gaussians', decay=0.5, device='cuda:0', experiment='density2d', expname='', flows=5, hidden_dim=50, layers=3, learning_rate=0.1, load=None, patience=2000, reduce_extreme=False, save=False, savefig=False, steps=20000)


In [None]:
if X_gt.targettype is None:
    X_test = X_gt.test()
    X_test.targettype = X_gt.targettype
    y_pred_mean, y_pred_std, models = aggregate_imshow(X_test, X_syns, density_estimation)
    



usage: ipykernel_launcher.py [-h] [--device DEVICE]
                             [--dataset {8gaussians,2spirals,checkerboard,t1,t2,t3,t4}]
                             [--experiment {density2d,energy2d}]
                             [--learning_rate LEARNING_RATE]
                             [--batch_dim BATCH_DIM] [--clip_norm CLIP_NORM]
                             [--steps STEPS] [--patience PATIENCE]
                             [--decay DECAY] [--flows FLOWS] [--layers LAYERS]
                             [--hidden_dim HIDDEN_DIM] [--expname EXPNAME]
                             [--load LOAD] [--save] [--savefig]
                             [--reduce_extreme]
ipykernel_launcher.py: error: argument --flows: invalid int value: '/home/bv292/.local/share/jupyter/runtime/kernel-v2-506VOGHrI8jrdHu.json'


AssertionError: 