# do_experiments_batch.py
# (removed copy-paste residue from the rendered file view: filename banner
# and line-number gutter, which are not part of the script)
"""Batch driver for DGE experiments: imports, device setup, and sanity checks."""

# Standard library
import os
import pickle

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import torch
from sklearn.datasets import load_diabetes, load_iris

# synthcity
import synthcity.logger as log
from synthcity.metrics.eval_performance import (
    PerformanceEvaluatorMLP,
    PerformanceEvaluatorXGB,
)
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader
from synthcity.utils import reproducibility

# Reset any cached reproducibility state before running new experiments.
reproducibility.clear_cache()

# Prefer GPU when available; these experiments are impractical on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Enumerate available generic plugins (side effect: warms the plugin registry).
Plugins(categories=["generic"]).list()

# Fail fast if no GPU is present. An explicit raise is used instead of
# `assert`, since asserts are stripped under `python -O`.
if device.type != 'cuda':
    raise RuntimeError("CUDA device required but not available.")

# Local project modules (imported after setup, preserving original order).
from DGE_data import get_real_and_synthetic
from DGE_utils import get_folder_names
from DGE_experiments import model_evaluation_experiment, predictive_experiment
# ---------------------------------------------------------------------------
# Experiment configuration.
# ---------------------------------------------------------------------------
# let's restrict ourselves to classification datasets
datasets = ['covid' ]
#['moons', 'circles','cal_housing', 'adult', 'diabetes', 'breast_cancer', 'seer', 'cutract' ]
# NOTE(review): this assignment is effectively dead — the inner loop below
# sweeps `model_name` over several CTGAN variants and overwrites this value.
model_name = 'ctgan_deep' # synthetic data model
p_train = 0.8 # proportion of training data for generative model. Default values if None
n_models = 20 # number of models in ensemble
#max_n = 5000 # maximum number of data points to use for training generative model.
#nsyn = 10000 # number of synthetic data points per synthetic dataset. Defaults to same as generative training size if None
# Flags controlling caching of results and data.
load = True # results
load_syn = True # data
save = True # save results and data
verbose = False
# Sweep over synthetic-dataset size (nsyn), generative-training size (max_n),
# dataset, and generative model, running the predictive and model-evaluation
# experiments for each combination.
for nsyn in [2000, 5000]:
    for max_n in [2000, 5000, 10000]:
        # Skip settings where the generative model would be trained on more
        # points than it generates.
        if max_n > nsyn:
            continue
        for dataset in datasets:
            for model_name in ['ctgan_deep', 'ctgan', 'ctgan_shallow']:
                print('Dataset:', dataset)
                # Workspace/results folders are keyed on dataset, model, sizes.
                workspace_folder, results_folder = get_folder_names(
                    dataset, model_name, max_n=max_n, nsyn=nsyn)
                # Real data plus the ensemble of synthetic datasets
                # (loaded from cache when load_syn is True).
                X_gt, X_syns = get_real_and_synthetic(dataset=dataset,
                                                      p_train=p_train,
                                                      n_models=n_models,
                                                      model_name=model_name,
                                                      load_syn=load_syn,
                                                      verbose=verbose,
                                                      max_n=max_n)
                # Downstream predictive performance of models trained on the
                # synthetic datasets.
                y_preds, scores = predictive_experiment(X_gt,
                                                        X_syns,
                                                        workspace_folder=workspace_folder,
                                                        results_folder=results_folder,
                                                        save=save,
                                                        load=load,
                                                        plot=True,
                                                        )
                # Model-evaluation experiments for two downstream model types.
                # BUG FIX: these calls previously passed `save=load`, so the
                # save behavior was silently tied to the load flag; pass the
                # intended `save` flag instead.
                means, std = model_evaluation_experiment(X_gt, X_syns,
                                                         workspace_folder=workspace_folder,
                                                         relative='',
                                                         model_type='deepish_mlp',
                                                         load=load,
                                                         save=save,
                                                         verbose=verbose,
                                                         )
                means, std = model_evaluation_experiment(X_gt, X_syns,
                                                         workspace_folder=workspace_folder,
                                                         relative='',
                                                         model_type='mlp',
                                                         load=load,
                                                         save=save,
                                                         verbose=verbose,
                                                         )