In [1]:
import warnings
warnings.filterwarnings('ignore')

"""device setting"""
import torch
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
"""load dataset and specify column types"""
import pandas as pd


def _encode_as_positive_codes(col):
    """Return the integer category codes of `col`, shifted by one."""
    return col.astype('category').cat.codes + 1


# The file is semicolon-separated.
data = pd.read_csv('./whitewine.csv', sep=";")

# Every column except the target is treated as a continuous feature.
columns = list(data.columns)
columns.remove("quality")
assert not data.isna().any().any()  # the raw file must contain no missing values
continuous_features = columns
categorical_features = ["quality"]
integer_features = []

### the target column should be the last column
data = data[[*continuous_features, *categorical_features]]

"""training, test, synthetic datasets"""
# pre-processing: encode the categorical target as integer codes starting at 1
data[categorical_features] = data[categorical_features].apply(_encode_as_positive_codes)

# First 4000 rows form the training set, the rest the test set; both are
# re-indexed from 0.
train = data.iloc[:4000].reset_index(drop=True)
test = data.iloc[4000:].reset_index(drop=True)

In [3]:
from macode import missing
import numpy as np
seed = 42 # randomness for generating missingness patterns
missing_rate = 0.3 # range: 0 ~ 1 (float)
missing_type = "MAR" # None(complete data), MCAR, MAR, MNARL, MNARQ

# Always start from the complete training rows held in `data`, never from
# `train` itself: once this cell has run, `train` already contains NaNs, so a
# re-run would feed NaNs into the mask generator and stack a second missingness
# pattern on top of the first. `len(train)` is unchanged by masking, so this
# slice is the same set of rows `train` was originally built from.
train_complete = data.iloc[:len(train)].reset_index(drop=True)

mask = missing.generate_mask(
    torch.from_numpy(train_complete.values).float(),
    missing_rate,
    missing_type,
    seed=seed
)
print(mask)

# Entries where the mask is truthy are replaced by NaN. The non-in-place form
# returns a new frame (upcasting integer columns as needed) and leaves
# `train_complete` intact, which makes this cell safe to re-run.
train = train_complete.mask(mask.astype(bool), np.nan)
print(train)

[[0. 0. 0. ... 0. 0. 1.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.0              0.27         0.36            20.7      0.045   
1               6.3               NaN         0.34             1.6      0.049   
2               8.1               NaN         0.40             6.9      0.050   
3               7.2              0.23         0.32             8.5        NaN   
4               7.2              0.23         0.32             NaN      0.058   
...             ...               ...          ...             ...        ...   
3995            8.0               NaN          NaN             NaN      0.038   
3996            6.7              0.28          NaN             8.9      0.048   
3997            6.0               NaN         0.29             3.1      0.041   
3998            6.4              0

In [4]:
"""MaCoDE"""
from macode import macode as macode_module

# The model instance is bound to the name `macode`, which is the name the later
# cells call `.train()` and `.impute()` on. The module is imported under an
# alias so module and instance are not confused within this cell.
macode = macode_module.MaCoDE(
    # -- data and column roles --
    data=train,                                 # the observed tabular dataset
    continuous_features=continuous_features,    # the list of continuous columns of data
    categorical_features=categorical_features,  # the list of categorical columns of data
    integer_features=integer_features,          # the list of integer-type columns of data

    # -- model --
    seed=42,                  # seed for repeatable results
    bins=100,                 # the number of bins for discretization
    dim_transformer=128,      # the embedding size (input dimension size of transformer)
    num_transformer_heads=8,  # the number of heads in transformer
    num_transformer_layer=2,  # the number of layers in transformer

    # -- optimisation --
    epochs=10,        # the number of epochs (for quick checking)
    batch_size=1024,  # the batch size
    lr=0.001,         # learning rate
    device=device,
)

The number of bins: 100


Tranform Continuous Features...: 100%|██████████| 11/11 [00:00<00:00, 126.97it/s]


In [5]:
"""training"""
# Fit the MaCoDE instance; the epoch count, batch size and learning rate are the
# ones passed to its constructor (epochs=10 in this notebook).
macode.train()

Training...:  10%|█         | 1/10 [00:02<00:21,  2.34s/it]

[epoch 001], loss: 50.7949


Training...:  20%|██        | 2/10 [00:04<00:16,  2.02s/it]

[epoch 002], loss: 47.1773


Training...:  30%|███       | 3/10 [00:05<00:13,  1.90s/it]

[epoch 003], loss: 46.0398


Training...:  40%|████      | 4/10 [00:07<00:11,  1.87s/it]

[epoch 004], loss: 45.4765


Training...:  50%|█████     | 5/10 [00:09<00:09,  1.91s/it]

[epoch 005], loss: 45.1733


Training...:  60%|██████    | 6/10 [00:11<00:07,  1.89s/it]

[epoch 006], loss: 44.9800


Training...:  70%|███████   | 7/10 [00:13<00:05,  1.90s/it]

[epoch 007], loss: 44.8348


Training...:  80%|████████  | 8/10 [00:15<00:03,  1.98s/it]

[epoch 008], loss: 44.8856


Training...:  90%|█████████ | 9/10 [00:17<00:01,  2.00s/it]

[epoch 009], loss: 44.7178


Training...: 100%|██████████| 10/10 [00:19<00:00,  1.95s/it]

[epoch 010], loss: 44.7022





In [6]:
"""missing data imputation"""
# Ask the trained model for an imputed version of the data (presumably the
# `train` frame it was constructed with — confirm against the MaCoDE docs).
# NOTE(review): `tau` looks like a sampling temperature; verify before tuning it.
imputed = macode.impute(tau=1.)
# Bare last expression so the notebook renders the imputed frame.
imputed

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.360000,20.7,0.045,45.0,170.0,0.993900,3.07,0.45,8.8,3.0
1,6.3,0.22,0.340000,1.6,0.049,14.0,132.0,0.994000,3.03,0.49,9.5,4.0
2,8.1,0.20,0.400000,6.9,0.050,30.0,97.0,0.995100,3.11,0.44,10.1,4.0
3,7.2,0.23,0.320000,8.5,0.049,35.0,212.5,0.989914,3.19,0.40,9.9,4.0
4,7.2,0.23,0.320000,1.1,0.058,47.0,186.0,0.995600,3.19,0.40,9.9,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,8.0,0.20,0.290000,1.5,0.038,42.0,127.0,0.991860,2.90,0.72,12.1,3.0
3996,6.7,0.28,0.984721,8.9,0.048,32.0,111.0,0.994550,3.25,0.54,11.0,5.0
3997,6.0,0.55,0.290000,3.1,0.041,37.0,144.0,0.989440,3.22,0.39,12.8,5.0
3998,6.4,0.24,0.490000,14.9,0.053,25.0,120.0,0.994200,3.01,0.98,10.5,4.0


In [11]:
"""Evaluate Synthetic Data Quality"""
from synthetic_eval import evaluation

# Ground truth is the complete (pre-masking) training rows of `data`. The slice
# length is taken from `train` instead of repeating the literal split size, so
# changing the train/test split cannot silently desynchronise `true` from
# `imputed`.
true = data.iloc[:len(train)] # original training dataset
target = "quality"

# The first argument is passed as `imputed`; the evaluation output labels this
# slot "(Synthetic)", so here the synthetic-data metrics describe the imputed
# training set.
results = evaluation.evaluate(
    imputed, true, test,
    target, continuous_features, categorical_features, device
)


1. Statistical Fidelity

(marginal) KL-Divergence...

(marginal) Goodness Of Fit...

(joint) MMD...

(joint) Cramer-Wold Distance...

(joint) alpha-precision, beta-recall...


2. Machine Learning Utility

Classification downstream task...

(Baseline) Classification: Accuracy...
[logit] ACC: 0.548
[KNN] ACC: 0.506
[RBF-SVM] ACC: 0.596
[RandomForest] ACC: 0.555
[GradBoost] ACC: 0.573
[AdaBoost] ACC: 0.467
(Synthetic) Classification: Accuracy...
[logit] ACC: 0.530
[KNN] ACC: 0.442
[RBF-SVM] ACC: 0.545
[RandomForest] ACC: 0.512
[GradBoost] ACC: 0.538
[AdaBoost] ACC: 0.382

3. Privacy Preservability

K-anonimity...

K-Map...

Distance to Closest Record...

Attribute Disclosure...



In [12]:
"""print results"""
# One line per metric, formatted to three decimal places.
for metric_name, metric_value in results._asdict().items():
    print(f"{metric_name}: {metric_value:.3f}")

KL: 0.011
GoF: 0.027
MMD: 0.016
CW: 0.034
alpha_precision: 0.981
beta_recall: 0.355
base_cls: 0.541
syn_cls: 0.491
model_selection: 0.943
feature_selection: 0.527
Kanon_base: 2.150
Kanon_syn: 1.175
KMap: 1.150
DCR_RS: 0.006
DCR_RR: 0.000
DCR_SS: 0.135
AD: 0.589
