In [1]:
!nvidia-smi

Mon Jul 22 05:38:12 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:06:00.0 Off |                    0 |
| N/A   47C    P8    17W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

<center>
  
# TABSYN: Tabular Data Synthesis with Diffusion Models

</center>

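This notebook walks through TabSyn end to end on the UCI Adult dataset: we download and preprocess the data, train a VAE that embeds each mixed-type (numerical + categorical) row into a latent space, train a diffusion model on those latent embeddings, and finally sample new latents and decode them into synthetic tabular rows.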

# Imports and Setup

In [2]:
import os
import src
import json
import numpy as np
import pandas as pd
import torch

from torch.utils.data import DataLoader

from scripts.download_dataset import download_from_uci
from scripts.process_dataset import process_data

from src.data import preprocess, TabularDataset

from src.baselines.tabsyn.pipeline import TabSyn


# Download URLs for the UCI datasets, keyed by the short dataset name.
# The dictionary is passed to `download_from_uci` together with DATA_NAME.
NAME_URL_DICT_UCI = {
    "adult": "https://archive.ics.uci.edu/static/public/2/adult.zip",
    "default": "https://archive.ics.uci.edu/static/public/350/default+of+credit+card+clients.zip",
    "magic": "https://archive.ics.uci.edu/static/public/159/magic+gamma+telescope.zip",
    "shoppers": "https://archive.ics.uci.edu/static/public/468/online+shoppers+purchasing+intention+dataset.zip",
    "beijing": "https://archive.ics.uci.edu/static/public/381/beijing+pm2+5+data.zip",
    "news": "https://archive.ics.uci.edu/static/public/332/online+news+popularity.zip",
}

# Root locations default to the shared bootcamp directories. Set the
# TABSYN_DATA_DIR / TABSYN_MODEL_PATH environment variables to run the notebook
# against a different location without editing this cell.
DATA_DIR = os.environ.get(
    "TABSYN_DATA_DIR", "/projects/aieng/diffusion_bootcamp/data/tabular_copy"
)
RAW_DATA_DIR = f"{DATA_DIR}/raw_data"
PROCESSED_DATA_DIR = f"{DATA_DIR}/processed_data"
SYNTH_DATA_DIR = f"{DATA_DIR}/synthetic_data"

# Dataset used throughout the notebook; expected to be one of the keys of
# NAME_URL_DICT_UCI.
DATA_NAME = "adult"

# Directory under which model checkpoints are written and read.
# (Plain string: the original f-string prefix had no placeholders.)
MODEL_PATH = os.environ.get(
    "TABSYN_MODEL_PATH", "/projects/aieng/diffusion_bootcamp/models/tabular/tabsyn_copy"
)

  from .autonotebook import tqdm as notebook_tqdm


# Adult Dataset

For a more detailed explanation, please refer to the TabDDPM reference implementation.

In [3]:
# Step 1: download the raw files for the selected dataset.
download_from_uci(DATA_NAME, RAW_DATA_DIR, NAME_URL_DICT_UCI)

# Step 2: run the project's processing script; the processed `train.csv` and
# `info.json` it is expected to produce are read back below.
INFO_DIR = "data_info"
process_data(DATA_NAME, INFO_DIR, DATA_DIR)

# Step 3: preview the first rows of the processed training split.
dataset_dir = f"{PROCESSED_DATA_DIR}/{DATA_NAME}"
df = pd.read_csv(f"{dataset_dir}/train.csv")
separator = "\n============\n"
print(separator, df.head(5), separator)

# Step 4: report whether the " ?" placeholder still occurs anywhere in the
# frame. This is a check only -- the DataFrame is not modified here.
value = " ?"
placeholder_found = value in df.values
message = (
    f"{value} exists in the DataFrame."
    if placeholder_found
    else f"{value} does not exist in the DataFrame."
)
print(message)

# Step 5: load and display the dataset metadata stored next to the CSV files.
with open(f"{dataset_dir}/info.json", "r") as file:
    data_info = json.load(file)
print(data_info)

Start processing dataset adult from UCI.
Aready downloaded.
adult (32561, 15) (16281, 15) (32561, 15)
Numerical (32561, 6)
Categorical (32561, 8)
Processing and Saving adult Successfully!
adult
Total 48842
Train 32561
Test 16281
Num 6
Cat 9

     age          workclass    fnlwgt   education  education.num  \
0  39.0          State-gov   77516.0   Bachelors           13.0   
1  50.0   Self-emp-not-inc   83311.0   Bachelors           13.0   
2  38.0            Private  215646.0     HS-grad            9.0   
3  53.0            Private  234721.0        11th            7.0   
4  28.0            Private  338409.0   Bachelors           13.0   

        marital.status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-clea

# TabSyn Algorithm

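TabSyn generates tabular data in two stages. First, a VAE is trained to map each row (numerical and categorical columns together) to a latent embedding and to reconstruct the row from that embedding. Second, a diffusion model is trained on the saved latent embeddings. To synthesize data, new latents are sampled from the diffusion model and decoded back into tabular rows with the VAE decoder.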

<p align="center">
<img src="figures/tabsyn.jpg" width="1000"/>
</p>

## Load Config

In [4]:
# Each dataset has its own TOML config, named after the dataset.
config_dir = "src/baselines/tabsyn/configs"
config_path = f"{config_dir}/{DATA_NAME}.toml"

# Parse the config and show it so the hyperparameters used below are visible.
raw_config = src.load_config(config_path)
print(raw_config)

{'task_type': 'binclass', 'loss_params': {'max_beta': 0.01, 'min_beta': 1e-05, 'lambd': 0.7}, 'model_params': {'vae': {'n_head': 1, 'factor': 32, 'num_layers': 2, 'd_token': 4}}, 'train': {'vae': {'num_epochs': 10, 'batch_size': 4096, 'optim': {'lr': 0.001, 'weight_decay': 0, 'factor': 0.95, 'patience': 10}}, 'diffusion': {'num_epochs': 10, 'batch_size': 4096, 'optim': {'lr': 0.001, 'weight_decay': 0, 'factor': 0.9, 'patience': 20}}}, 'sample': {'steps': 50}}


## Make Dataset

In [5]:
# Run the project preprocessing on the processed dataset directory.
X_num, X_cat, categories, d_numerical = preprocess(
    f"{PROCESSED_DATA_DIR}/{DATA_NAME}",
    task_type=raw_config["task_type"],
)

# `X_num` and `X_cat` each unpack into a (train, test) pair; convert every
# split to a tensor, casting the numerical features to float.
X_train_num, X_test_num = (torch.tensor(split).float() for split in X_num)
X_train_cat, X_test_cat = (torch.tensor(split) for split in X_cat)

# The training split is wrapped in a dataset and stays on the CPU.
train_data = TabularDataset(X_train_num, X_train_cat)

# The test split is moved to the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
X_test_num = X_test_num.to(device)
X_test_cat = X_test_cat.to(device)

# Shuffled loader over the training split, using the VAE batch size.
vae_batch_size = raw_config["train"]["vae"]["batch_size"]
train_loader = DataLoader(
    train_data,
    batch_size=vae_batch_size,
    shuffle=True,
    num_workers=4,
)

data_path /projects/aieng/diffusion_bootcamp/data/tabular_copy/processed_data/adult
No NaNs in numerical features, skipping


In [6]:
# Sanity check: print the shape of every split, numerical features first and
# categorical features second.
shape_report = [
    ("train dataset num shape: ", train_data.X_num.shape),
    ("test dataset num shape: ", X_test_num.shape),
    ("train dataset cat shape: ", train_data.X_cat.shape),
    ("test dataset cat shape: ", X_test_cat.shape),
]
for label, shape in shape_report:
    print(label, shape)

train dataset num shape:  torch.Size([32561, 6])
test dataset num shape:  torch.Size([16281, 6])
train dataset cat shape:  torch.Size([32561, 9])
test dataset cat shape:  torch.Size([16281, 9])


## Instantiate Model

In [7]:
# Build the TabSyn pipeline object from the training loader and the test
# tensors prepared above.
tabsyn = TabSyn(
    train_loader,
    X_test_num,
    X_test_cat,
    device,
    num_numerical_features=d_numerical,
    num_classes=categories,
)

# Create the VAE and its encoder/decoder from the "model_params.vae" section
# of the config.
vae_model_params = raw_config["model_params"]["vae"]
model, pre_encoder, pre_decoder = tabsyn.get_vae_model(**vae_model_params)

# Optimizer and LR scheduler for the VAE, configured from "train.vae.optim".
vae_optim_params = raw_config["train"]["vae"]["optim"]
optimizer, scheduler = tabsyn.load_optim(model, **vae_optim_params)

self.category_embeddings.weight.shape=torch.Size([104, 4])
self.category_embeddings.weight.shape=torch.Size([104, 4])


## Train Model
### First Train VAE

In [8]:
# All VAE artifacts (full model, encoder, decoder) are saved in one directory.
vae_ckpt_dir = f"{MODEL_PATH}/{DATA_NAME}/vae"

# Train the VAE; the loss hyperparameters come from the "loss_params" section
# of the config and the epoch count from "train.vae".
model, pre_encoder, pre_decoder = tabsyn.train_vae(
    model,
    pre_encoder,
    pre_decoder,
    optimizer,
    scheduler,
    **raw_config["loss_params"],
    num_epochs=raw_config["train"]["vae"]["num_epochs"],
    model_save_path=f"{vae_ckpt_dir}/model.pt",
    encoder_save_path=f"{vae_ckpt_dir}/encoder.pt",
    decoder_save_path=f"{vae_ckpt_dir}/decoder.pt",
    device=device,
)

Epoch 1/10: 100%|██████████| 8/8 [00:02<00:00,  3.45it/s]


epoch: 0, beta = 0.010000, Train MSE: 11.010106, Train CE:2.190319, Train KL:0.631821, Val MSE:8.316008, Val CE:2.108248, Train ACC:0.277735, Val ACC:0.282088


Epoch 2/10: 100%|██████████| 8/8 [00:01<00:00,  6.27it/s]


epoch: 1, beta = 0.010000, Train MSE: 6.690540, Train CE:2.091688, Train KL:0.635280, Val MSE:4.871817, Val CE:2.069340, Train ACC:0.282078, Val ACC:0.284046


Epoch 3/10: 100%|██████████| 8/8 [00:01<00:00,  6.46it/s]


epoch: 2, beta = 0.010000, Train MSE: 3.768258, Train CE:2.027702, Train KL:0.815249, Val MSE:2.518736, Val CE:1.964295, Train ACC:0.319934, Val ACC:0.325082


Epoch 4/10: 100%|██████████| 8/8 [00:01<00:00,  6.62it/s]


epoch: 3, beta = 0.010000, Train MSE: 1.823395, Train CE:1.918054, Train KL:1.096955, Val MSE:1.131964, Val CE:1.863093, Train ACC:0.332476, Val ACC:0.334459


Epoch 5/10: 100%|██████████| 8/8 [00:01<00:00,  6.11it/s]


epoch: 4, beta = 0.010000, Train MSE: 0.834314, Train CE:1.834498, Train KL:1.435878, Val MSE:0.610274, Val CE:1.795356, Train ACC:0.323762, Val ACC:0.324966


Epoch 6/10: 100%|██████████| 8/8 [00:01<00:00,  6.64it/s]


epoch: 5, beta = 0.010000, Train MSE: 0.585323, Train CE:1.734624, Train KL:1.741091, Val MSE:0.586413, Val CE:1.645748, Train ACC:0.340876, Val ACC:0.357158


Epoch 7/10: 100%|██████████| 8/8 [00:01<00:00,  6.59it/s]


epoch: 6, beta = 0.010000, Train MSE: 0.552036, Train CE:1.587937, Train KL:1.955335, Val MSE:0.509859, Val CE:1.511510, Train ACC:0.476644, Val ACC:0.481557


Epoch 8/10: 100%|██████████| 8/8 [00:01<00:00,  7.13it/s]


epoch: 7, beta = 0.010000, Train MSE: 0.470287, Train CE:1.454352, Train KL:2.177709, Val MSE:0.439621, Val CE:1.372933, Train ACC:0.520585, Val ACC:0.522101


Epoch 9/10: 100%|██████████| 8/8 [00:01<00:00,  6.67it/s]


epoch: 8, beta = 0.010000, Train MSE: 0.415161, Train CE:1.299456, Train KL:2.430321, Val MSE:0.396248, Val CE:1.206525, Train ACC:0.565355, Val ACC:0.579462


Epoch 10/10: 100%|██████████| 8/8 [00:01<00:00,  6.40it/s]

epoch: 9, beta = 0.010000, Train MSE: 0.379557, Train CE:1.143850, Train KL:2.742451, Val MSE:0.368619, Val CE:1.071381, Train ACC:0.691323, Val ACC:0.693426
Training time: 0.2356 mins
Successfully load and save the model!





In [9]:
# Encode the full training split with the trained encoder and store the
# resulting latent embeddings in the VAE checkpoint directory.
vae_ckpt_dir = f"{MODEL_PATH}/{DATA_NAME}/vae"
tabsyn.save_vae_embeddings(
    pre_encoder,
    X_train_num,
    X_train_cat,
    vae_ckpt_dir=vae_ckpt_dir,
    device=device,
)

Successfully save pretrained embeddings in disk!


### Train Diffusion Model

In [10]:
# Read back the latent embeddings written by `save_vae_embeddings`; the second
# return value is not needed for training.
vae_ckpt_dir = f"{MODEL_PATH}/{DATA_NAME}/vae"
train_z, _ = tabsyn.load_vae_embeddings(vae_ckpt_dir=vae_ckpt_dir)

# Center the embeddings per dimension and rescale them by a constant factor.
# NOTE(review): `std` is computed but never used -- the scaling divides by the
# constant 2, not by `std`. Confirm that `tabsyn.sample` undoes the scaling
# with the same constant before changing this.
mean = train_z.mean(0)
std = train_z.std(0)
train_z = (train_z - mean) / 2
train_data = train_z

# Shuffled loader over the scaled latents, using the diffusion batch size.
diffusion_batch_size = raw_config["train"]["diffusion"]["batch_size"]
train_loader = DataLoader(
    train_data,
    batch_size=diffusion_batch_size,
    shuffle=True,
    num_workers=4,
)

In [11]:
# Build the diffusion model; both dimensions are set to the latent width.
# NOTE(review): the printed model shows 1024 hidden units even though `hid_dim`
# receives the latent width -- confirm whether `hid_dim` is actually honored.
latent_dim = train_z.shape[1]
model = tabsyn.get_diffusion_model(
    in_dim=latent_dim,
    hid_dim=latent_dim,
    device=device,
)

# Optimizer and LR scheduler for the diffusion model, configured from
# "train.diffusion.optim".
diffusion_optim_params = raw_config["train"]["diffusion"]["optim"]
optimizer, scheduler = tabsyn.load_optim(model, **diffusion_optim_params)

MLPDiffusion(
  (proj): Linear(in_features=60, out_features=1024, bias=True)
  (mlp): Sequential(
    (0): Linear(in_features=1024, out_features=2048, bias=True)
    (1): SiLU()
    (2): Linear(in_features=2048, out_features=2048, bias=True)
    (3): SiLU()
    (4): Linear(in_features=2048, out_features=1024, bias=True)
    (5): SiLU()
    (6): Linear(in_features=1024, out_features=60, bias=True)
  )
  (map_noise): PositionalEmbedding()
  (time_embed): Sequential(
    (0): Linear(in_features=1024, out_features=1024, bias=True)
    (1): SiLU()
    (2): Linear(in_features=1024, out_features=1024, bias=True)
  )
)
the number of parameters 10616892


In [12]:
# Train the diffusion model on the latent loader; checkpoints are written
# under the per-dataset model directory.
diffusion_epochs = raw_config["train"]["diffusion"]["num_epochs"]
tabsyn.train_diffusion(
    model,
    train_loader,
    optimizer,
    scheduler,
    num_epochs=diffusion_epochs,
    ckpt_path=f"{MODEL_PATH}/{DATA_NAME}",
    device=device,
)

Epoch 1/10: 100%|██████████| 8/8 [00:00<00:00,  8.07it/s, Loss=0.86] 
Epoch 2/10: 100%|██████████| 8/8 [00:00<00:00,  8.41it/s, Loss=0.772]
Epoch 3/10: 100%|██████████| 8/8 [00:00<00:00,  8.34it/s, Loss=0.732]
Epoch 4/10: 100%|██████████| 8/8 [00:00<00:00,  8.18it/s, Loss=0.688]
Epoch 5/10: 100%|██████████| 8/8 [00:00<00:00,  8.26it/s, Loss=0.657]
Epoch 6/10: 100%|██████████| 8/8 [00:00<00:00,  8.18it/s, Loss=0.594]
Epoch 7/10: 100%|██████████| 8/8 [00:00<00:00,  8.22it/s, Loss=0.593]
Epoch 8/10: 100%|██████████| 8/8 [00:00<00:00,  8.69it/s, Loss=0.68] 
Epoch 9/10: 100%|██████████| 8/8 [00:00<00:00,  8.23it/s, Loss=0.524]
Epoch 10/10: 100%|██████████| 8/8 [00:01<00:00,  7.71it/s, Loss=0.469]


Time:  10.753646850585938


## Load Pretrained Model

In [13]:
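# NOTE: the commented-out call below is kept for reference only. Its argument
# list differs from the `tabsyn.load_model(...)` call used in the cells that follow.
# # model_name= "model_1000.pt"
# model_name= "model.pt"

# model, pre_encoder, pre_decoder = tabsyn.load_model(
#     model, pre_encoder, pre_decoder,
#     ckpt_dir = f"{MODEL_PATH}/{DATA_NAME}",
#     model_name = model_name,
# )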

In [14]:
# Sampling settings and output/checkpoint locations.
steps = raw_config["sample"]["steps"]
save_path = f"{SYNTH_DATA_DIR}/{DATA_NAME}/tabsyn.csv"
ckpt_path = f"{MODEL_PATH}/{DATA_NAME}"

# Reload the stored latent embeddings (this replaces the rescaled `train_z`
# from the training cell) together with the token dimension.
train_z, token_dim = tabsyn.load_vae_embeddings(vae_ckpt_dir=f"{ckpt_path}/vae")

# Load the dataset metadata and record the token dimension in it.
dataset_dir = f"{PROCESSED_DATA_DIR}/{DATA_NAME}"
with open(f"{dataset_dir}/info.json", "r") as file:
    data_info = json.load(file)
data_info["token_dim"] = token_dim
task_type = data_info["task_type"]

# Re-run preprocessing with `inverse=True` to also obtain the inverse
# transforms for the numerical and categorical features.
_, _, categories, d_numerical, num_inverse, cat_inverse = preprocess(
    dataset_dir,
    task_type=task_type,
    inverse=True,
)

data_path /projects/aieng/diffusion_bootcamp/data/tabular_copy/processed_data/adult
No NaNs in numerical features, skipping


In [15]:
# Load the diffusion model and the decoder from the checkpoint directory,
# using the same VAE hyperparameters as in training.
latent_dim = train_z.shape[1]
model, pre_decoder = tabsyn.load_model(
    in_dim=latent_dim,
    hid_dim=latent_dim,
    ckpt_dir=f"{MODEL_PATH}/{DATA_NAME}",
    device=device,
    d_numerical=d_numerical,
    categories=categories,
    **raw_config["model_params"]["vae"],
)

# The decoder is handed to the sampler through the metadata dictionary.
data_info["pre_decoder"] = pre_decoder

## Sample Data

In [16]:
# Generate synthetic rows and write them to `save_path`. Arguments are passed
# positionally, in the order the pipeline's `sample` method expects them.
tabsyn.sample(
    model,
    train_z,
    data_info,
    num_inverse,
    cat_inverse,
    save_path,
    device,
)

(32561, 9)
Time: 15.872112512588501
Saving sampled data to /projects/aieng/diffusion_bootcamp/data/tabular_copy/synthetic_data/adult/tabsyn.csv


## Synthetic Data

Finally, we review the synthesized data. In the follow-up notebook, `evaluate_synthetic_data.ipynb`, we evaluate this synthesized data with respect to various metrics.

In [17]:
# Read the synthetic dataset that the sampling step wrote to disk.
synthetic_csv_path = f"{SYNTH_DATA_DIR}/{DATA_NAME}/tabsyn.csv"
df = pd.read_csv(synthetic_csv_path)

# Show the first five synthetic rows.
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,34.0,?,177972.88,HS-grad,12.0,Never-married,Tech-support,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
1,63.0,Private,160227.14,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K
2,46.0,Federal-gov,195279.36,HS-grad,11.0,Married-civ-spouse,Exec-managerial,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,55.0,?,155793.06,Bachelors,11.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K
4,26.0,Private,169466.77,Some-college,11.0,Married-spouse-absent,Exec-managerial,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K


## References

**Zhang, Hengrui, et al.** "Mixed-type tabular data synthesis with score-based diffusion in latent space." *International Conference on Learning Representations (ICLR)* (2024).

**GitHub Repository:** [Amazon Science - Tabsyn](https://github.com/amazon-science/tabsyn)