In [1]:
import warnings
# Silence every warning category for the rest of the session. The filter is
# installed before torch is imported so import-time warnings are hidden too.
warnings.simplefilter('ignore')

# Device setting: use the GPU when CUDA is usable, otherwise fall back to CPU.
import torch
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
"""load dataset and specify column types"""
import pandas as pd

# Read the semicolon-separated wine table. Every column except "quality" is
# treated as a continuous feature; "quality" is the only categorical one.
data = pd.read_csv('./whitewine.csv', delimiter=";")
columns = data.columns.tolist()
columns.remove("quality")
assert not data.isna().any().any()  # the table must contain no missing values
continuous_features = columns
categorical_features = ["quality"]
integer_features = []

# Reorder the columns so that the target column comes last.
data = data[continuous_features + categorical_features]


def _encode_from_one(col):
    """Return the category codes of `col` shifted by one (labels start at 1)."""
    return col.astype('category').cat.codes + 1


# Pre-processing: integer-encode the categorical column(s).
data[categorical_features] = data[categorical_features].apply(_encode_from_one)

# Training / test split: the first 4000 rows are used for training and the
# remaining rows are held out; both parts get a fresh 0-based index.
n_train_rows = 4000
train = data.iloc[:n_train_rows].reset_index(drop=True)
test = data.iloc[n_train_rows:].reset_index(drop=True)

In [3]:
"""MaCoDE"""
from macode import macode

# All constructor settings are gathered in one mapping before the call.
macode_config = dict(
    # --- data and column roles ---
    data=train,                                 # observed tabular dataset
    continuous_features=continuous_features,    # names of the continuous columns
    categorical_features=categorical_features,  # names of the categorical columns
    integer_features=integer_features,          # names of the integer-type columns

    # --- model ---
    seed=42,                  # seed for repeatable results
    bins=100,                 # number of bins used for discretization
    dim_transformer=128,      # embedding size (transformer input dimension)
    num_transformer_heads=8,  # number of attention heads in the transformer
    num_transformer_layer=2,  # number of transformer layers

    # --- optimisation ---
    epochs=10,                # number of epochs (kept small for a quick check)
    batch_size=1024,          # batch size
    lr=0.001,                 # learning rate
    device=device,
)

# NOTE: this rebinds the name `macode` from the imported module to the model
# instance; the following cells call methods on that instance.
macode = macode.MaCoDE(**macode_config)

The number of bins: 100


Tranform Continuous Features...: 100%|██████████| 11/11 [00:00<00:00, 88.67it/s]


In [4]:
"""training"""
# Train the MaCoDE instance created in the previous cell. At this point the
# name `macode` refers to that instance, not to the imported module.
macode.train()

Training...:  10%|█         | 1/10 [00:02<00:19,  2.21s/it]

[epoch 001], loss: 50.7411


Training...:  20%|██        | 2/10 [00:03<00:15,  1.96s/it]

[epoch 002], loss: 47.0614


Training...:  30%|███       | 3/10 [00:05<00:13,  1.88s/it]

[epoch 003], loss: 45.9010


Training...:  40%|████      | 4/10 [00:07<00:11,  1.88s/it]

[epoch 004], loss: 45.4095


Training...:  50%|█████     | 5/10 [00:09<00:09,  1.85s/it]

[epoch 005], loss: 45.0516


Training...:  60%|██████    | 6/10 [00:11<00:07,  1.84s/it]

[epoch 006], loss: 44.8891


Training...:  70%|███████   | 7/10 [00:13<00:05,  1.83s/it]

[epoch 007], loss: 44.7241


Training...:  80%|████████  | 8/10 [00:14<00:03,  1.82s/it]

[epoch 008], loss: 44.7550


Training...:  90%|█████████ | 9/10 [00:16<00:01,  1.82s/it]

[epoch 009], loss: 44.6026


Training...: 100%|██████████| 10/10 [00:18<00:00,  1.85s/it]

[epoch 010], loss: 44.5839





In [5]:
"""generate synthetic data"""
# Request exactly one synthetic row per training row; tau is passed as 1.0.
n_synthetic = len(train)
syndata = macode.generate_data(n=n_synthetic, tau=1.0)
# Leave the frame as the cell's last expression so the notebook displays it.
syndata

Generate Synthetic Dataset...: 100%|██████████| 63/63 [00:14<00:00,  4.23it/s]


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.8,0.29,0.320000,9.900000,0.050,46.0,202.000000,0.996800,3.32,0.50,11.0,3
1,6.9,0.24,0.320000,1.000000,0.038,25.0,122.936468,0.995085,3.14,0.50,12.2,4
2,7.4,0.24,0.240000,5.100000,0.051,17.0,131.000000,0.994340,3.25,0.54,9.1,5
3,7.7,0.29,0.156692,1.600000,0.045,34.0,75.000000,0.993500,3.00,0.53,11.2,3
4,6.8,0.16,0.300000,6.100000,0.016,51.0,93.000000,0.994500,2.97,0.68,11.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,7.5,0.36,0.320000,17.800000,0.046,11.0,140.000000,0.990762,3.17,0.51,10.2,6
3996,7.4,0.20,0.330000,8.900000,0.043,66.0,157.000000,0.997932,3.36,0.35,9.0,6
3997,8.7,0.20,0.280000,10.564929,0.036,23.0,247.822232,0.995710,3.14,0.49,10.2,4
3998,6.4,0.36,0.210000,5.400000,0.035,63.0,174.000000,0.998000,2.97,0.49,11.9,3


In [6]:
"""Evaluate Synthetic Data Quality"""
from synthetic_eval import evaluation

target = "quality"

# Positional arguments, in the order evaluate() receives them:
# synthetic / train / test frames, target name, the two feature lists, device.
eval_frames = (syndata, train, test)
eval_feature_lists = (continuous_features, categorical_features)
results = evaluation.evaluate(*eval_frames, target, *eval_feature_lists, device)


1. Statistical Fidelity

(marginal) KL-Divergence...

(marginal) Goodness Of Fit...

(joint) MMD...

(joint) Cramer-Wold Distance...

(joint) alpha-precision, beta-recall...


2. Machine Learning Utility

Classification downstream task...

(Baseline) Classification: Accuracy...
[logit] ACC: 0.548
[KNN] ACC: 0.506
[RBF-SVM] ACC: 0.596
[RandomForest] ACC: 0.555
[GradBoost] ACC: 0.573
[AdaBoost] ACC: 0.467
(Synthetic) Classification: Accuracy...
[logit] ACC: 0.537
[KNN] ACC: 0.383
[RBF-SVM] ACC: 0.493
[RandomForest] ACC: 0.424
[GradBoost] ACC: 0.457
[AdaBoost] ACC: 0.360

3. Privacy Preservability

K-anonimity...

K-Map...

Distance to Closest Record...

Attribute Disclosure...



In [7]:
"""print results"""
# Print every entry of the results' `_asdict()` mapping as "name: value",
# with the value rounded to three decimal places.
metric_table = results._asdict()
for metric, score in metric_table.items():
    print(f"{metric}: {score:.3f}")

KL: 0.023
GoF: 0.017
MMD: 0.012
CW: 0.045
alpha_precision: 0.965
beta_recall: 0.075
base_cls: 0.541
syn_cls: 0.442
model_selection: 0.657
feature_selection: 0.088
Kanon_base: 2.150
Kanon_syn: 1.775
KMap: 1.700
DCR_RS: 0.149
DCR_RR: 0.000
DCR_SS: 0.150
AD: 0.381
