In [44]:
import pandas as pd
from scipy.io import arff
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader
import sys
import os
sys.path.append(os.path.abspath('../'))


In [45]:
file_path = 'data/raw/phpOJxGL9.arff'

data, meta = arff.loadarff(file_path)
df = pd.DataFrame(data)

df = df.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,Class
0,65.0,Female,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9,1
1,62.0,Male,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1
2,62.0,Male,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1
3,58.0,Male,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.0,1
4,72.0,Male,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.4,1


In [46]:
df_train = df[['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V2']]

In [79]:
def prepare_dataset(df, categorical_cols=None, test_size=0.2):
    df = df.copy()
    
    # Detect numeric and categorical
    if categorical_cols is None:
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = df.columns.difference(categorical_cols).tolist()

    # Encode categorical
    encoder = OrdinalEncoder()
    df[categorical_cols] = encoder.fit_transform(df[categorical_cols])

    # Scale numerical
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # Convert to torch
    X = torch.tensor(df.values, dtype=torch.float32)

    data_loader = DataLoader(TensorDataset(X), batch_size=256, shuffle=True)

    return data_loader, len(numerical_cols), [int(df[col].nunique()) for col in categorical_cols]


In [80]:
from torch import nn
import torch.nn.functional as F
from src.tabddpm.modules import timestep_embedding, MLP

class MLPDiffusion(nn.Module):
    """
    A simplified MLP-based diffusion model for tabular data.

    This model uses timestep embeddings and a projection layer
    to inject time information into the input before processing it
    through an MLP. It is intended for unsupervised tasks like clustering,
    where label conditioning is not required.

    Args:
        d_in (int): Input feature dimension.
        rtdl_params (dict): Parameters for the RTDL MLP (e.g., hidden sizes, depth).
        dim_t (int): Dimensionality for the timestep embedding and projection space.
    """
    def __init__(self, d_in, rtdl_params, dim_t=128):
        super().__init__()
        self.dim_t = dim_t

        # Configure MLP: input will be timestep-embedded, output must match original input size
        rtdl_params['d_in'] = dim_t
        rtdl_params['d_out'] = d_in
        self.mlp = MLP.make_baseline(**rtdl_params)

        # Project input features into timestep embedding space
        self.proj = nn.Linear(d_in, dim_t)

        # Timestep embedding network (2-layer MLP with SiLU activation)
        self.time_embed = nn.Sequential(
            nn.Linear(dim_t, dim_t),
            nn.SiLU(),
            nn.Linear(dim_t, dim_t)
        )

    def forward(self, x, timesteps):
        """
        Forward pass of the MLPDiffusion model.

        Args:
            x (Tensor): Input tensor of shape (batch_size, d_in).
            timesteps (Tensor): Timestep tensor of shape (batch_size,) representing the diffusion step.

        Returns:
            Tensor: Output tensor of shape (batch_size, d_in), same as the input dimension.
        """
        # Get timestep embedding (e.g., sinusoidal), then pass through a small MLP
        emb = self.time_embed(timestep_embedding(timesteps, self.dim_t))

        # Project input to timestep embedding space and add time information
        x = self.proj(x) + emb

        # Pass through MLP and return
        return self.mlp(x)


In [81]:
from src.tabddpm.gaussian_multinomial_diffusion import GaussianMultinomialDiffusion

def get_diffusion_model(model, num_classes, num_numerical, device):
    diffusion = GaussianMultinomialDiffusion(
        num_classes=np.array(num_classes),
        num_numerical_features=num_numerical,
        denoise_fn=model,
        gaussian_loss_type='mse',
        num_timesteps=1000,
        scheduler='cosine',
        device=device
    )
    diffusion.to(device)
    return diffusion


In [82]:
from copy import deepcopy
import torch.optim as optim

class Trainer:
    def __init__(self, model, train_loader, steps=1000, lr=1e-3, device='cuda'):
        self.model = model
        self.train_loader = train_loader
        self.steps = steps
        self.device = device
        self.optimizer = optim.AdamW(model.parameters(), lr=lr)
        self.ema_model = deepcopy(model._denoise_fn)
        for p in self.ema_model.parameters():
            p.requires_grad = False

    def train(self):
        step = 0
        iterator = iter(self.train_loader)

        while step < self.steps:
            try:
                x_batch, = next(iterator)
            except StopIteration:
                iterator = iter(self.train_loader)
                x_batch, = next(iterator)

            x_batch = x_batch.to(self.device)

            self.optimizer.zero_grad()
            loss_multi, loss_gauss = self.model.mixed_loss(x_batch, {})
            loss = loss_multi + loss_gauss
            loss.backward()
            self.optimizer.step()

            if step % 100 == 0:
                print(f"[{step}/{self.steps}] Loss: {loss.item():.4f}")

            step += 1


In [161]:
import torch

# Config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_loader, num_numerical, num_classes = prepare_dataset(df_train, ['V2'])

model_params = {
    'num_classes': 0,
    'is_y_cond': False,
    'rtdl_params': {
        'd_layers': [256, 256, 256],
        'dropout': 0.1
    }
}

mlp_model = MLPDiffusion(
    d_in=num_numerical + sum(num_classes),
    rtdl_params=model_params['rtdl_params']
)

mlp_model.to(device)

diffusion = get_diffusion_model(mlp_model, num_classes, num_numerical, device)

trainer = Trainer(diffusion, data_loader, steps=10000, device=device, lr=0.001)
trainer.train()


[0/10000] Loss: 2.0804
[100/10000] Loss: 1.3626
[200/10000] Loss: 1.1430
[300/10000] Loss: 0.9241
[400/10000] Loss: 1.1916
[500/10000] Loss: 1.1197
[600/10000] Loss: 1.7262
[700/10000] Loss: 1.1031
[800/10000] Loss: 0.8292
[900/10000] Loss: 1.0296
[1000/10000] Loss: 0.8111
[1100/10000] Loss: 0.6842
[1200/10000] Loss: 0.9405
[1300/10000] Loss: 0.9285
[1400/10000] Loss: 0.6610
[1500/10000] Loss: 2.2003
[1600/10000] Loss: 1.2534
[1700/10000] Loss: 0.5666
[1800/10000] Loss: 0.8932
[1900/10000] Loss: 0.7977
[2000/10000] Loss: 1.3403
[2100/10000] Loss: 0.6007
[2200/10000] Loss: 0.7112
[2300/10000] Loss: 0.8281
[2400/10000] Loss: 0.8562
[2500/10000] Loss: 0.7708
[2600/10000] Loss: 0.6067
[2700/10000] Loss: 0.9063
[2800/10000] Loss: 0.9378
[2900/10000] Loss: 0.6581
[3000/10000] Loss: 0.7570
[3100/10000] Loss: 0.9283
[3200/10000] Loss: 0.9267
[3300/10000] Loss: 0.8281
[3400/10000] Loss: 1.2461
[3500/10000] Loss: 0.6125
[3600/10000] Loss: 0.7461
[3700/10000] Loss: 0.9293
[3800/10000] Loss: 0.836

In [None]:
def index_to_log_onehot(x, num_classes):
    onehots = []
    for i in range(len(num_classes)):
        onehots.append(F.one_hot(x[:, i], num_classes[i]))
 
    x_onehot = torch.cat(onehots, dim=1)
    log_onehot = torch.log(x_onehot.float().clamp(min=1e-30))
    return log_onehot

x = next(iter(data_loader))[0]
b = x.shape[0]
t, pt = diffusion.sample_time(b, device, 'uniform')

x_num = x[:, :diffusion.num_numerical_features]
x_cat = x[:, diffusion.num_numerical_features:]

x_num_t = x_num
log_x_cat_t = x_cat
if x_num.shape[1] > 0:
    noise = torch.randn_like(x_num)
    x_num_t = diffusion.gaussian_q_sample(x_num, t, noise=noise)
if x_cat.shape[1] > 0:
    log_x_cat = index_to_log_onehot(x_cat.long(), diffusion.num_classes)
    log_x_cat_t = diffusion.q_sample(log_x_start=log_x_cat, t=t)

x_in = torch.cat([x_num_t, log_x_cat_t], dim=1)

model_out = diffusion._denoise_fn(
    x_in,
    t
)

print(x)

print(model_out)


tensor([[ 0.2012, -0.3061, -0.3158,  ..., -1.1850, -1.0592,  1.0000],
        [ 0.9430,  1.2413,  1.2166,  ..., -1.3108, -1.0906,  1.0000],
        [-1.7153, -0.4511, -0.4940,  ...,  1.7087,  2.3661,  0.0000],
        ...,
        [-1.1589, -0.4350, -0.4583,  ..., -0.9333, -0.6193,  0.0000],
        [-0.7880,  3.1271,  3.1054,  ..., -0.4301, -0.7764,  1.0000],
        [-0.2934, -0.4189, -0.4940,  ..., -0.6817,  0.1664,  1.0000]])
tensor([[ 0.0989,  0.5668, -0.5592,  ..., -0.7411, -0.5338,  0.5286],
        [ 0.0085,  0.2609, -0.0081,  ...,  0.3516, -1.7337,  1.1427],
        [-0.1041, -0.1533,  0.2487,  ...,  0.0658, -0.6076,  0.5510],
        ...,
        [-1.0366, -0.3861,  0.4864,  ..., -0.6828, -0.6099,  0.3700],
        [ 0.1357,  0.5098, -0.1787,  ..., -0.4231, -1.6590,  2.2879],
        [ 0.1465, -1.6179,  0.8162,  ..., -0.4069, -0.4884,  0.5847]],
       grad_fn=<AddmmBackward0>)


In [None]:
model_out_cat = model_out[:, diffusion.num_numerical_features:]
model_out_cat

tensor([[-0.5338,  0.5286],
        [-1.7337,  1.1427],
        [-0.6076,  0.5510],
        [-1.4578,  1.3563],
        [-0.6501,  0.7371],
        [-1.1912,  1.1144],
        [-0.6940,  0.4939],
        [-0.5740,  0.7254],
        [-0.3510,  0.9638],
        [-0.6088,  0.9492],
        [ 2.9420, -2.4517],
        [-0.6559,  0.6354],
        [-0.2866,  0.2872],
        [-1.2745,  1.1057],
        [-0.5235,  0.6040],
        [-0.1400,  1.1176],
        [ 0.5576, -0.4885],
        [-3.2497,  4.2588],
        [-0.3476,  0.4398],
        [-0.8132,  0.3300],
        [-0.4699,  0.7250],
        [-0.1434,  0.8091],
        [-0.7831,  0.4507],
        [-0.4659,  0.0859],
        [-0.4265,  0.4140],
        [-0.3229,  0.5318],
        [-0.8175,  0.9610],
        [-0.2345,  0.5297],
        [-0.5385,  0.5333],
        [-0.6286,  0.7021],
        [-0.8803,  0.6137],
        [-0.4869,  0.1292],
        [-0.2821,  0.3958],
        [-0.6323,  0.7234],
        [-0.2907,  0.3417],
        [-0.4034,  0

In [None]:
def evaluate_mixed_loss(dm, data_loader, device):
    dm.eval()
    total_loss_multi = 0.0
    total_loss_gauss = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch, in data_loader:
            x_batch = batch.to(device).float()  # adjust if your loader returns tuple, etc.
            b = x_batch.size(0)
            total_samples += b

            # empty dict if no conditions, else pass your dict
            out_dict = {}

            loss_multi, loss_gauss = dm.mixed_loss(x_batch, out_dict)

            # loss_multi and loss_gauss are mean per batch, multiply by batch size for sum
            total_loss_multi += loss_multi.item() * b
            total_loss_gauss += loss_gauss.item() * b

    return (total_loss_multi + total_loss_gauss) / total_samples


In [185]:
def random_mixed_loss(dm, x, out_dict):
    b = x.shape[0]
    device = x.device
    t, pt = dm.sample_time(b, device, 'uniform')

    x_num = x[:, :dm.num_numerical_features]
    x_cat = x[:, dm.num_numerical_features:]
    
    x_num_t = x_num
    log_x_cat_t = x_cat
    if x_num.shape[1] > 0:
        noise = torch.randn_like(x_num)
        x_num_t = dm.gaussian_q_sample(x_num, t, noise=noise)
    if x_cat.shape[1] > 0:
        log_x_cat = index_to_log_onehot(x_cat.long(), dm.num_classes)
        log_x_cat_t = dm.q_sample(log_x_start=log_x_cat, t=t)
    
    x_in = torch.cat([x_num_t, log_x_cat_t], dim=1)

    model_out = torch.randn_like(x_in)

    model_out_num = model_out[:, :dm.num_numerical_features]
    model_out_cat = model_out[:, dm.num_numerical_features:]

    loss_multi = torch.zeros((1,)).float()
    loss_gauss = torch.zeros((1,)).float()
    if x_cat.shape[1] > 0:
        loss_multi = dm._multinomial_loss(model_out_cat, log_x_cat, log_x_cat_t, t, pt, out_dict) / len(dm.num_classes)
    
    if x_num.shape[1] > 0:
        loss_gauss = dm._gaussian_loss(model_out_num, x_num, x_num_t, t, noise)

    # loss_multi = torch.where(out_dict['y'] == 1, loss_multi, 2 * loss_multi)
    # loss_gauss = torch.where(out_dict['y'] == 1, loss_gauss, 2 * loss_gauss)

    return loss_multi.mean(), loss_gauss.mean()

In [186]:
def evaluate_random_mixed_loss(dm, data_loader, device):
    dm.eval()
    total_loss_multi = 0.0
    total_loss_gauss = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch, in data_loader:
            x_batch = batch.to(device).float()  # adjust if your loader returns tuple, etc.
            b = x_batch.size(0)
            total_samples += b

            # empty dict if no conditions, else pass your dict
            out_dict = {}

            loss_multi, loss_gauss = random_mixed_loss(dm, x_batch, out_dict)

            # loss_multi and loss_gauss are mean per batch, multiply by batch size for sum
            total_loss_multi += loss_multi.item() * b
            total_loss_gauss += loss_gauss.item() * b

    return (total_loss_multi + total_loss_gauss) / total_samples


In [187]:
loss= evaluate_mixed_loss(diffusion, data_loader, device)
print(f"DM dataset loss: {loss:.4f}")

loss= evaluate_random_mixed_loss(diffusion, data_loader, device)
print(f"DM dataset random loss: {loss:.4f}")


DM dataset loss: 0.8420
DM dataset random loss: 2.8375
