In [1]:
# Show GPU, driver and CUDA details for the machine running this notebook.
!nvidia-smi

Mon Jul 22 19:55:38 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:30:00.0 Off |                    0 |
| N/A   48C    P0    28W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

<center>
  
# TABSYN: Tabular Data Synthesis with Diffusion Models

</center>

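This notebook walks through TabSyn, a method for synthesizing mixed-type (numerical and categorical) tabular data with a diffusion model that operates in the latent space of a variational autoencoder (VAE). Using the Adult dataset, we download and preprocess the data, train the VAE, embed the training rows in its latent space, train a diffusion model on those embeddings, and finally sample synthetic rows.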

# Imports and Setup

In [2]:
import os
import src
import json
import pandas as pd
from pprint import pprint

import torch
from torch.utils.data import DataLoader

from scripts.download_dataset import download_from_uci
from scripts.process_dataset import process_data
from src.data import preprocess, TabularDataset
from src.baselines.tabsyn.pipeline import TabSyn


# Download URLs of the supported UCI datasets, keyed by short dataset name.
NAME_URL_DICT_UCI = {
    "adult": "https://archive.ics.uci.edu/static/public/2/adult.zip",
    "default": "https://archive.ics.uci.edu/static/public/350/default+of+credit+card+clients.zip",
    "magic": "https://archive.ics.uci.edu/static/public/159/magic+gamma+telescope.zip",
    "shoppers": "https://archive.ics.uci.edu/static/public/468/online+shoppers+purchasing+intention+dataset.zip",
    "beijing": "https://archive.ics.uci.edu/static/public/381/beijing+pm2+5+data.zip",
    "news": "https://archive.ics.uci.edu/static/public/332/online+news+popularity.zip",
}

# Root of the data tree; raw, processed and synthetic data each live in their
# own sub-directory below it.
DATA_DIR = "/projects/aieng/diffusion_bootcamp/data/tabular_copy"
RAW_DATA_DIR, PROCESSED_DATA_DIR, SYNTH_DATA_DIR = (
    os.path.join(DATA_DIR, stage_dir)
    for stage_dir in ("raw_data", "processed_data", "synthetic_data")
)

# Dataset used throughout the notebook; must be a key of NAME_URL_DICT_UCI.
DATA_NAME = "adult"

# Directory under which trained model artifacts are stored.
MODEL_PATH = "/projects/aieng/diffusion_bootcamp/models/tabular/tabsyn_copy"

# Adult Dataset

For a more detailed explanation, please refer to the TabDDPM reference implementation.

In [3]:
# fetch the raw archive for DATA_NAME into RAW_DATA_DIR
download_from_uci(DATA_NAME, RAW_DATA_DIR, NAME_URL_DICT_UCI)

# run the dataset processing script using the metadata in INFO_DIR
INFO_DIR = "data_info"
process_data(DATA_NAME, INFO_DIR, DATA_DIR)

# load the processed training split and preview its first five rows
train_csv_path = os.path.join(PROCESSED_DATA_DIR, DATA_NAME, "train.csv")
df = pd.read_csv(train_csv_path)
df.head()

Start processing dataset adult from UCI.
Aready downloaded.
adult (32561, 15) (16281, 15) (32561, 15)
Numerical (32561, 6)
Categorical (32561, 8)
Processing and Saving adult Successfully!
adult
Total 48842
Train 32561
Test 16281
Num 6
Cat 9


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [4]:
# check whether the " ?" placeholder (presumably the dataset's missing-value
# marker -- confirm) still occurs anywhere in the training data; nothing is
# modified here
value = " ?"
message = (
    f"{value} exists in the DataFrame."
    if value in df.values
    else f"{value} does not exist in the DataFrame."
)
print(message)

# review json file and its contents; the path is built with os.path.join like
# the other paths in this notebook
with open(os.path.join(PROCESSED_DATA_DIR, DATA_NAME, "info.json"), "r") as file:
    data_info = json.load(file)
pprint(data_info)

 ? exists in the DataFrame.
{'cat_col_idx': [1, 3, 5, 6, 7, 8, 9, 13],
 'column_info': {'0': {},
                 '1': {},
                 '10': {},
                 '11': {},
                 '12': {},
                 '13': {},
                 '14': {},
                 '2': {},
                 '3': {},
                 '4': {},
                 '5': {},
                 '6': {},
                 '7': {},
                 '8': {},
                 '9': {},
                 'categorizes': [' <=50K', ' >50K'],
                 'max': 99.0,
                 'min': 1.0,
                 'type': 'categorical'},
 'column_names': ['age',
                  'workclass',
                  'fnlwgt',
                  'education',
                  'education.num',
                  'marital.status',
                  'occupation',
                  'relationship',
                  'race',
                  'sex',
                  'capital.gain',
                  'capital.loss',
          

# TabSyn Algorithm

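TabSyn generates tabular data in two stages. First, a VAE is trained to map each row, with its numerical and categorical columns, to a continuous latent embedding. Second, a diffusion model is trained on those latent embeddings. To synthesize data, new latents are sampled from the diffusion model, decoded with the VAE decoder, and mapped back to the original column formats by inverting the preprocessing.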

<p align="center">
<img src="figures/tabsyn.jpg" width="1000"/>
</p>

## Load Config

In [5]:
# each dataset has its own TOML config, named after the dataset
config_path = os.path.join(
    "src/baselines/tabsyn/configs",
    f"{DATA_NAME}.toml",
)
raw_config = src.load_config(config_path)

# show the parsed configuration
pprint(raw_config)

{'loss_params': {'lambd': 0.7, 'max_beta': 0.01, 'min_beta': 1e-05},
 'model_params': {'d_token': 4, 'factor': 32, 'n_head': 1, 'num_layers': 2},
 'sample': {'steps': 50},
 'task_type': 'binclass',
 'train': {'diffusion': {'batch_size': 4096,
                         'num_dataset_workers': 4,
                         'num_epochs': 9},
           'optim': {'diffusion': {'factor': 0.9,
                                   'lr': 0.001,
                                   'patience': 20,
                                   'weight_decay': 0},
                     'vae': {'factor': 0.95,
                             'lr': 0.001,
                             'patience': 10,
                             'weight_decay': 0}},
           'vae': {'batch_size': 4096,
                   'num_dataset_workers': 4,
                   'num_epochs': 10}}}


## Make Dataset

In [6]:
# preprocess data; besides the numerical and categorical feature arrays this
# returns `categories` and `d_numerical`, which are passed to TabSyn later on
X_num, X_cat, categories, d_numerical = preprocess(
    os.path.join(PROCESSED_DATA_DIR, DATA_NAME),
    task_type=raw_config["task_type"],
)

# separate train and test data
X_train_num, X_test_num = X_num
X_train_cat, X_test_cat = X_cat

# convert to tensors; numerical features are cast to float32 once, here, so no
# further .float() calls are needed below (they would be no-ops)
X_train_num, X_test_num = torch.tensor(X_train_num).float(), torch.tensor(X_test_num).float()
X_train_cat, X_test_cat = torch.tensor(X_train_cat), torch.tensor(X_test_cat)

# create dataset module
train_data = TabularDataset(X_train_num, X_train_cat)

# move test data to gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_test_num = X_test_num.to(device)
X_test_cat = X_test_cat.to(device)

# create train dataloader using the VAE training settings from the config
vae_train_config = raw_config["train"]["vae"]
train_loader = DataLoader(
    train_data,
    batch_size=vae_train_config["batch_size"],
    shuffle=True,
    num_workers=vae_train_config["num_dataset_workers"],
)

data_path /projects/aieng/diffusion_bootcamp/data/tabular_copy/processed_data/adult
No NaNs in numerical features, skipping


In [7]:
# report the shapes of the numerical and categorical parts of both splits
for label, tensor in (
    ("train dataset num shape: ", train_data.X_num),
    ("test dataset num shape: ", X_test_num),
    ("train dataset cat shape: ", train_data.X_cat),
    ("test dataset cat shape: ", X_test_cat),
):
    print(label, tensor.shape)

train dataset num shape:  torch.Size([32561, 6])
test dataset num shape:  torch.Size([16281, 6])
train dataset cat shape:  torch.Size([32561, 9])
test dataset cat shape:  torch.Size([16281, 9])


## Instantiate Model

In [8]:
# build the TabSyn pipeline from the training loader and the test tensors
# that were moved to `device`
tabsyn = TabSyn(
    train_loader,
    X_test_num,
    X_test_cat,
    num_numerical_features=d_numerical,
    num_classes=categories,
    device=device,
)

## Train Model
### First: Train VAE

In [9]:
# load and prepare VAE model for training
tabsyn.load_vae_model(
    **raw_config["model_params"],
    optim_params=raw_config["train"]["optim"]["vae"],
)

self.category_embeddings.weight.shape=torch.Size([104, 4])
self.category_embeddings.weight.shape=torch.Size([104, 4])
Successfully loaded VAE model.


In [10]:
# the full model, encoder and decoder are all saved in the same "vae" directory
vae_dir = os.path.join(MODEL_PATH, DATA_NAME, "vae")

tabsyn.train_vae(
    **raw_config["loss_params"],
    num_epochs=raw_config["train"]["vae"]["num_epochs"],
    model_save_path=os.path.join(vae_dir, "model.pt"),
    encoder_save_path=os.path.join(vae_dir, "encoder.pt"),
    decoder_save_path=os.path.join(vae_dir, "decoder.pt"),
)

  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 1/10: 100%|██████████| 8/8 [00:02<00:00,  3.89it/s]


epoch: 0, beta = 0.010000, Train MSE: 9.382177, Train CE:2.187658, Train KL:0.484915, Val MSE:7.060134, Val CE:2.249630, Train ACC:0.242507, Val ACC:0.241379


Epoch 2/10: 100%|██████████| 8/8 [00:01<00:00,  7.17it/s]


epoch: 1, beta = 0.010000, Train MSE: 5.905682, Train CE:2.285896, Train KL:0.848383, Val MSE:4.568557, Val CE:2.285522, Train ACC:0.235736, Val ACC:0.237694


Epoch 3/10: 100%|██████████| 8/8 [00:01<00:00,  7.25it/s]


epoch: 2, beta = 0.010000, Train MSE: 3.711805, Train CE:2.235956, Train KL:1.188504, Val MSE:2.709688, Val CE:2.179030, Train ACC:0.241222, Val ACC:0.251049


Epoch 4/10: 100%|██████████| 8/8 [00:01<00:00,  7.41it/s]


epoch: 3, beta = 0.010000, Train MSE: 2.134012, Train CE:2.168019, Train KL:1.463033, Val MSE:1.521422, Val CE:2.147592, Train ACC:0.286906, Val ACC:0.290243


Epoch 5/10: 100%|██████████| 8/8 [00:01<00:00,  7.43it/s]


epoch: 4, beta = 0.010000, Train MSE: 1.191864, Train CE:2.118469, Train KL:1.740865, Val MSE:0.888179, Val CE:2.057896, Train ACC:0.300649, Val ACC:0.303373


Epoch 6/10: 100%|██████████| 8/8 [00:01<00:00,  7.47it/s]


epoch: 5, beta = 0.010000, Train MSE: 0.736978, Train CE:1.997076, Train KL:1.947789, Val MSE:0.589910, Val CE:1.913915, Train ACC:0.296077, Val ACC:0.297504


Epoch 7/10: 100%|██████████| 8/8 [00:01<00:00,  7.38it/s]


epoch: 6, beta = 0.010000, Train MSE: 0.500361, Train CE:1.869850, Train KL:2.065991, Val MSE:0.416524, Val CE:1.821952, Train ACC:0.296592, Val ACC:0.300923


Epoch 8/10: 100%|██████████| 8/8 [00:01<00:00,  6.89it/s]


epoch: 7, beta = 0.010000, Train MSE: 0.368829, Train CE:1.800204, Train KL:2.249681, Val MSE:0.328609, Val CE:1.762079, Train ACC:0.327305, Val ACC:0.334111


Epoch 9/10: 100%|██████████| 8/8 [00:01<00:00,  7.29it/s]


epoch: 8, beta = 0.010000, Train MSE: 0.303160, Train CE:1.732979, Train KL:2.429506, Val MSE:0.276827, Val CE:1.688931, Train ACC:0.368932, Val ACC:0.375578


Epoch 10/10: 100%|██████████| 8/8 [00:01<00:00,  7.10it/s]


epoch: 9, beta = 0.010000, Train MSE: 0.251374, Train CE:1.662566, Train KL:2.566030, Val MSE:0.228855, Val CE:1.621897, Train ACC:0.399874, Val ACC:0.410492
Training time: 0.2119 mins
Successfully trained and saved the VAE model!


In [11]:
# embed all training inputs in the latent space, using the VAE checkpoint directory
tabsyn.save_vae_embeddings(
    X_train_num,
    X_train_cat,
    vae_ckpt_dir=os.path.join(MODEL_PATH, DATA_NAME, "vae"),
)

Successfully saved pretrained embeddings on disk!


### Second: Train Diffusion Model

In [12]:
# load latent space embeddings
train_z, _ = tabsyn.load_vae_embeddings(
    os.path.join(MODEL_PATH, DATA_NAME, "vae")
)  # train_z dim: B x in_dim

# centre the embeddings per dimension and scale them by a constant factor of 2
# NOTE(review): `std` is computed but never used; the divide-by-2 scaling
# presumably has to match the inverse transform applied inside `tabsyn.sample`
# -- confirm before switching to a division by `std`.
mean = train_z.mean(0)
std = train_z.std(0)
train_z = (train_z - mean) / 2
latent_train_data = train_z

# create data loader using the diffusion training settings from the config
diffusion_train_config = raw_config["train"]["diffusion"]
latent_train_loader = DataLoader(
    latent_train_data,
    batch_size=diffusion_train_config["batch_size"],
    shuffle=True,
    num_workers=diffusion_train_config["num_dataset_workers"],
)

In [13]:
# load and prepare diffusion model for training; the width of the latent
# embeddings is passed as both in_dim and hid_dim
latent_dim = train_z.shape[1]
tabsyn.load_diffusion_model(
    in_dim=latent_dim,
    hid_dim=latent_dim,
    optim_params=raw_config["train"]["optim"]["diffusion"],
)

MLPDiffusion(
  (proj): Linear(in_features=60, out_features=1024, bias=True)
  (mlp): Sequential(
    (0): Linear(in_features=1024, out_features=2048, bias=True)
    (1): SiLU()
    (2): Linear(in_features=2048, out_features=2048, bias=True)
    (3): SiLU()
    (4): Linear(in_features=2048, out_features=1024, bias=True)
    (5): SiLU()
    (6): Linear(in_features=1024, out_features=60, bias=True)
  )
  (map_noise): PositionalEmbedding()
  (time_embed): Sequential(
    (0): Linear(in_features=1024, out_features=1024, bias=True)
    (1): SiLU()
    (2): Linear(in_features=1024, out_features=1024, bias=True)
  )
)
The number of parameters: 10616892
Successfully loaded diffusion model.


In [14]:
# train diffusion model on the latent embeddings; ckpt_path points at the
# dataset's model directory
diffusion_ckpt_dir = os.path.join(MODEL_PATH, DATA_NAME)
tabsyn.train_diffusion(
    latent_train_loader,
    num_epochs=raw_config["train"]["diffusion"]["num_epochs"],
    ckpt_path=diffusion_ckpt_dir,
)

  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 1/9: 100%|██████████| 8/8 [00:00<00:00,  8.79it/s, Loss=0.745]
Epoch 2/9: 100%|██████████| 8/8 [00:01<00:00,  6.44it/s, Loss=0.675]
Epoch 3/9: 100%|██████████| 8/8 [00:01<00:00,  6.87it/s, Loss=0.634]
Epoch 4/9: 100%|██████████| 8/8 [00:01<00:00,  6.97it/s, Loss=0.594]
Epoch 5/9: 100%|██████████| 8/8 [00:01<00:00,  6.83it/s, Loss=0.567]
Epoch 6/9: 100%|██████████| 8/8 [00:01<00:00,  6.75it/s, Loss=0.604]
Epoch 7/9: 100%|██████████| 8/8 [00:01<00:00,  6.74it/s, Loss=0.524]
Epoch 8/9: 100%|██████████| 8/8 [00:01<00:00,  6.33it/s, Loss=0.483]
Epoch 9/9: 100%|██████████| 8/8 [00:01<00:00,  6.75it/s, Loss=0.458]


Time:  11.236772537231445


## Load Pretrained Model

In [15]:
# Re-import the pipeline module so that changes made to its source after the
# notebook started are picked up, then rebind `TabSyn` to the reloaded class.
# NOTE(review): the `tabsyn` instance created earlier is not rebuilt in this
# cell, so it keeps using the class object it was constructed from.
import importlib
import src.baselines.tabsyn.pipeline as pipeline

importlib.reload(pipeline)
TabSyn = pipeline.TabSyn

In [16]:
# # model_name= "model_1000.pt"
# model_name= "model.pt"

# model, pre_encoder, pre_decoder = tabsyn.load_model(
#     model, pre_encoder, pre_decoder,
#     ckpt_dir = f"{MODEL_PATH}/{DATA_NAME}",
#     model_name = model_name,
# )

In [17]:
# directory holding the processed dataset (info.json and the preprocess inputs)
processed_dir = os.path.join(PROCESSED_DATA_DIR, DATA_NAME)

# load latent embeddings of input data
vae_ckpt_dir = os.path.join(MODEL_PATH, DATA_NAME, "vae")
train_z, token_dim = tabsyn.load_vae_embeddings(vae_ckpt_dir)

# load json file and record the token dimension next to the dataset metadata
with open(os.path.join(processed_dir, "info.json"), "r") as file:
    data_info = json.load(file)
data_info["token_dim"] = token_dim

# get inverse tokenizers (the first two return values are not needed here)
_, _, categories, d_numerical, num_inverse, cat_inverse = preprocess(
    processed_dir,
    task_type=data_info["task_type"],
    inverse=True,
)

data_path /projects/aieng/diffusion_bootcamp/data/tabular_copy/processed_data/adult
No NaNs in numerical features, skipping


In [18]:
# load pretrained diffusion model together with the pretrained decoder; the
# width of the latent embeddings is passed as both in_dim and hid_dim
latent_dim = train_z.shape[1]
dif_model, pre_decoder = tabsyn.load_model_for_sampling(
    in_dim=latent_dim,
    hid_dim=latent_dim,
    d_numerical=d_numerical,
    categories=categories,
    ckpt_dir=os.path.join(MODEL_PATH, DATA_NAME),
    **raw_config["model_params"],
)

# keep the decoder alongside the rest of the dataset metadata
data_info["pre_decoder"] = pre_decoder


## Sample Data

In [19]:
# sample synthetic rows and write them to a CSV under SYNTH_DATA_DIR
synthetic_csv_path = os.path.join(SYNTH_DATA_DIR, DATA_NAME, "tabsyn.csv")
tabsyn.sample(
    train_z,
    info=data_info,
    num_inverse=num_inverse,
    cat_inverse=cat_inverse,
    save_path=synthetic_csv_path,
)

(32561, 9)
Time: 15.53997278213501
Saving sampled data to /projects/aieng/diffusion_bootcamp/data/tabular_copy/synthetic_data/adult/tabsyn.csv


## Synthetic Data

Finally, we review the synthesized data. In the next notebook, `evaluate_synthetic_data.ipynb`, we will evaluate this synthesized data with respect to various metrics.

In [20]:
# Load the synthetic table under its own name so that `df`, which holds the
# real training data loaded earlier in the notebook, is not overwritten.
synthetic_df = pd.read_csv(os.path.join(SYNTH_DATA_DIR, DATA_NAME, "tabsyn.csv"))

# Display the first few rows of the synthetic data
synthetic_df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,44.0,Federal-gov,174733.2,HS-grad,6.0,Married-civ-spouse,Adm-clerical,Not-in-family,White,Male,0.0,0.0,50.0,Holand-Netherlands,<=50K
1,27.0,Self-emp-not-inc,214106.8,HS-grad,9.0,Never-married,Transport-moving,Not-in-family,White,Male,0.0,0.0,40.0,Holand-Netherlands,<=50K
2,48.0,Federal-gov,272498.5,HS-grad,4.0,Married-civ-spouse,Transport-moving,Not-in-family,White,Male,0.0,0.0,30.0,Vietnam,<=50K
3,33.0,Never-worked,208416.62,9th,9.0,Widowed,Transport-moving,Not-in-family,White,Male,0.0,0.0,44.0,Holand-Netherlands,<=50K
4,34.0,Self-emp-not-inc,126882.2,Assoc-voc,10.0,Never-married,Adm-clerical,Not-in-family,White,Male,0.0,0.0,50.0,Holand-Netherlands,<=50K


## References

**Zhang, Hengrui, et al.** "Mixed-type tabular data synthesis with score-based diffusion in latent space." *International Conference on Learning Representations (ICLR)* (2024).

**GitHub Repository:** [Amazon Science - Tabsyn](https://github.com/amazon-science/tabsyn)