# create_synthetic.py
from sklearn.datasets import load_diabetes
import pickle
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import os
import torch
from synthcity.metrics.eval_performance import (
PerformanceEvaluatorMLP,
PerformanceEvaluatorXGB,
)
from synthcity.utils import reproducibility
from synthcity.plugins import Plugins
import synthcity.logger as log
from synthcity.plugins.core.dataloader import GenericDataLoader
# --- Environment setup -------------------------------------------------------
# Clear synthcity's cache so stale artifacts from earlier runs cannot leak
# into this experiment.
reproducibility.clear_cache()

# Prefer GPU; generative-model training below is impractical on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Touch the plugin registry so the "generic" generators (incl. 'dpgan')
# are registered before any is requested by name.
Plugins(categories=["generic"]).list()

# Fail fast with an explicit error instead of `assert`, which is silently
# stripped when Python runs with -O.
if device.type != 'cuda':
    raise RuntimeError("CUDA device required but not available")
from DGE_data import get_real_and_synthetic
# --- Experiment configuration ------------------------------------------------
# Restrict to classification datasets.
datasets = ['moons','circles','adult','breast_cancer','covid','seer']
# Full candidate list (unused here):
# ['moons', 'circles','cal_housing', 'adult', 'diabetes', 'breast_cancer', 'seer', 'cutract' ]
model_name = 'dpgan' # synthetic data generative model plugin name
p_train = 0.8 # proportion of data used to train the generative model (None -> defaults)
n_models = 20 # number of generative models in the ensemble
load = True # load previously computed results instead of recomputing
load_syn = True # load previously generated synthetic data instead of regenerating
save = True # save results and generated data to disk
verbose = False # suppress per-step logging
# for max_n in [10000]:#2000, 5000, 10000]:#, 5000, 10000]:
# for dataset in datasets:
# print('Dataset:', dataset)
# get_real_and_synthetic(dataset=dataset,
# p_train=p_train,
# n_models=n_models,
# model_name=model_name,
# load_syn=load_syn,
# verbose=verbose,
# max_n=max_n)
# --- Generate/load synthetic data for the 'seer' dataset ---------------------
# For each real-sample budget max_n, request an ensemble of n_models
# generators per run; nsyn (synthetic records per generator) mirrors max_n.
dataset = 'seer'
for max_n in [2000, 5000]:
    nsyn = max_n
    # 10 repeated runs at the small budget, a single run at the large one.
    # Computed per-iteration: the original mutated a shared counter inside
    # the loop, which silently depended on the iteration order of max_n.
    num_runs = 1 if max_n == 5000 else 10
    X_gt, X_syns = get_real_and_synthetic(dataset=dataset,
                                          p_train=p_train,
                                          n_models=n_models * num_runs,
                                          model_name=model_name,
                                          load_syn=load_syn,
                                          verbose=verbose,
                                          max_n=max_n,
                                          nsyn=nsyn)