In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
import sys
import os

sys.path.append(os.path.abspath('../'))


In [None]:
file_path = 'data/preprocessed/1480/data_processed.csv'

df = pd.read_csv(file_path)

df.head()

X = torch.tensor(df.values, dtype=torch.float32)
batch_size = 264
data_loader = DataLoader(TensorDataset(X), batch_size=batch_size, shuffle=True)

Unnamed: 0,V1,V10,V3,V4,V5,V6,V7,V8,V9,V2
0,1.19094,-0.151318,-1.016737,-5.199338,-0.444948,-1.411573,-1.350125,0.188461,0.212631,0.0
1,1.020355,-0.645041,1.405765,1.423335,1.646522,0.740885,0.80178,0.970866,0.110048,1.0
2,1.020355,-0.28371,1.235902,1.288435,1.231294,0.655684,0.493028,0.440198,0.212631,1.0
3,0.707298,0.270284,0.0,0.11655,-0.534807,-1.746316,-1.144112,0.188461,0.306203,1.0
4,1.672115,-1.991811,0.930265,0.926951,-0.245794,-0.400208,0.376984,0.792902,-0.907275,1.0


In [291]:
from torch import nn
import torch.nn.functional as F
from src.tabddpm.modules import timestep_embedding, MLP

class MLPDiffusion(nn.Module):
    """
    A simplified MLP-based diffusion model for tabular data.

    This model uses timestep embeddings and a projection layer
    to inject time information into the input before processing it
    through an MLP. It is intended for unsupervised tasks like clustering,
    where label conditioning is not required.

    Args:
        d_in (int): Input feature dimension.
        rtdl_params (dict): Parameters for the RTDL MLP (e.g., hidden sizes, depth).
        dim_t (int): Dimensionality for the timestep embedding and projection space.
    """
    def __init__(self, d_in, d_layers, dropout, d_t):
        super().__init__()
        self.dim_t = d_t

        # Configure MLP: input will be timestep-embedded, output must match original input size
        self.mlp = MLP.make_baseline(d_t, d_layers, dropout, d_in)

        # Project input features into timestep embedding space
        self.proj = nn.Linear(d_in, d_t)

        # Timestep embedding network (2-layer MLP with SiLU activation)
        self.time_embed = nn.Sequential(
            nn.Linear(d_t, d_t),
            nn.SiLU(),
            nn.Linear(d_t, d_t)
        )

    def forward(self, x, timesteps):
        """
        Forward pass of the MLPDiffusion model.

        Args:
            x (Tensor): Input tensor of shape (batch_size, d_in).
            timesteps (Tensor): Timestep tensor of shape (batch_size,) representing the diffusion step.

        Returns:
            Tensor: Output tensor of shape (batch_size, d_in), same as the input dimension.
        """
        # Get timestep embedding (e.g., sinusoidal), then pass through a small MLP
        emb = self.time_embed(timestep_embedding(timesteps, self.dim_t))

        # Project input to timestep embedding space and add time information
        x = self.proj(x) + emb

        # Pass through MLP and return
        return self.mlp(x)


In [292]:
from src.tabddpm.gaussian_multinomial_diffusion import GaussianMultinomialDiffusion

def get_diffusion_model(model, num_classes, num_numerical, device):
    diffusion = GaussianMultinomialDiffusion(
        num_classes=np.array(num_classes),
        num_numerical_features=num_numerical,
        denoise_fn=model,
        gaussian_loss_type='mse',
        num_timesteps=1000,
        scheduler='cosine',
        device=device
    )
    diffusion.to(device)
    return diffusion


In [293]:
from copy import deepcopy
import torch.optim as optim

class Trainer:
    def __init__(self, model, train_loader, steps=1000, lr=1e-3, device='cuda'):
        self.model = model
        self.train_loader = train_loader
        self.steps = steps
        self.device = device
        self.optimizer = optim.AdamW(model.parameters(), lr=lr)
        self.ema_model = deepcopy(model._denoise_fn)
        for p in self.ema_model.parameters():
            p.requires_grad = False

    def train(self):
        step = 0
        iterator = iter(self.train_loader)

        while step < self.steps:
            try:
                x_batch, = next(iterator)
            except StopIteration:
                iterator = iter(self.train_loader)
                x_batch, = next(iterator)

            x_batch = x_batch.to(self.device)

            self.optimizer.zero_grad()
            loss_multi, loss_gauss = self.model.mixed_loss(x_batch, {})
            loss = loss_multi + loss_gauss
            loss.backward()
            self.optimizer.step()

            if step % 100 == 0:
                print(f"[{step}/{self.steps}] Loss: {loss.item():.4f}")

            step += 1


In [294]:
import torch

# Config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 256
steps = 10000
lr = 0.001

d_layers = [256, 256, 256]
dropout = 0.0
d_t = 128

data_loader, num_numerical, num_classes = prepare_dataset(df_train, ['V2'], batch_size)


mlp_model = MLPDiffusion(
    d_in=num_numerical + sum(num_classes),
    d_layers=d_layers,
    dropout=dropout,
    d_t=d_t
    )

mlp_model.to(device)

diffusion = get_diffusion_model(mlp_model, num_classes, num_numerical, device)

trainer = Trainer(diffusion, data_loader, steps=steps, device=device, lr=lr)
trainer.train()




[0/10000] Loss: 1.6829
[100/10000] Loss: 1.4561
[200/10000] Loss: 0.8749
[300/10000] Loss: 0.9362
[400/10000] Loss: 1.1499
[500/10000] Loss: 1.0311
[600/10000] Loss: 1.1920
[700/10000] Loss: 0.8606
[800/10000] Loss: 0.8757
[900/10000] Loss: 0.9447
[1000/10000] Loss: 0.7220
[1100/10000] Loss: 0.7616
[1200/10000] Loss: 1.1235
[1300/10000] Loss: 0.7658
[1400/10000] Loss: 1.0784
[1500/10000] Loss: 0.8105
[1600/10000] Loss: 0.9295
[1700/10000] Loss: 0.9923
[1800/10000] Loss: 0.9349
[1900/10000] Loss: 0.8035
[2000/10000] Loss: 0.7384
[2100/10000] Loss: 0.9547
[2200/10000] Loss: 0.8283
[2300/10000] Loss: 0.7792
[2400/10000] Loss: 0.7851
[2500/10000] Loss: 1.5325
[2600/10000] Loss: 1.1241
[2700/10000] Loss: 0.8784
[2800/10000] Loss: 0.9501
[2900/10000] Loss: 0.8155
[3000/10000] Loss: 0.8488
[3100/10000] Loss: 0.9240
[3200/10000] Loss: 0.7233
[3300/10000] Loss: 0.9234
[3400/10000] Loss: 0.9506
[3500/10000] Loss: 0.8120
[3600/10000] Loss: 0.8311
[3700/10000] Loss: 0.7130
[3800/10000] Loss: 1.174

In [233]:
import copy

def train_and_eval(d_layers, dropout, d_t, data_loader, num_numerical, num_classes, device, steps=1000, lr=1e-3):
    mlp_model = MLPDiffusion(
        d_in=num_numerical + sum(num_classes),
        d_layers=d_layers,
        dropout=dropout,
        d_t=d_t
    )
    mlp_model.to(device)

    diffusion = get_diffusion_model(mlp_model, num_classes, num_numerical, device)
    diffusion.train()

    optimizer = torch.optim.Adam(diffusion.parameters(), lr=lr)
    total_loss = 0.0
    count = 0

    for step, (x_batch, ) in enumerate(data_loader):
        if step >= steps:
            break

        x_batch = x_batch.to(device).float()
        optimizer.zero_grad()
        loss_multi, loss_gauss = diffusion.mixed_loss(x_batch, {})
        loss = loss_multi + loss_gauss
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * len(x_batch)
        count += len(x_batch)

    avg_loss = total_loss / count
    return avg_loss, copy.deepcopy(diffusion.state_dict())

# Hyperparameter search space
hidden_layer_options = [
    [128, 128],
    [256, 256],
    [256, 256, 256],
    [512, 256, 128]
]
dropout_options = [0.0, 0.1, 0.2]
d_t_options = [64, 128, 256]

best_loss = float('inf')
best_params = None
best_state = None

for d_layers in hidden_layer_options:
    for dropout in dropout_options:
        for d_t in d_t_options:
            print(f"Training with d_layers={d_layers}, dropout={dropout}, d_t={d_t}")
            avg_loss, state = train_and_eval(
                d_layers=d_layers,
                dropout=dropout,
                d_t=d_t,
                data_loader=data_loader,
                num_numerical=num_numerical,
                num_classes=num_classes,
                device=device,
                steps=1000,
                lr=0.0001
            )
            print(f"Avg Loss: {avg_loss:.4f}\n")

            if avg_loss < best_loss:
                best_loss = avg_loss
                best_params = (d_layers, dropout, d_t)
                best_state = state

print("Best configuration:")
print(f"Layers: {best_params[0]}, Dropout: {best_params[1]}, d_t: {best_params[2]}")
print(f"Best loss: {best_loss:.4f}")

# To reload best model later:
best_model = MLPDiffusion(
    d_in=num_numerical + sum(num_classes),
    d_layers=best_params[0],
    dropout=best_params[1],
    d_t=best_params[2]
).to(device)
mlp_state_dict = {k[len('_denoise_fn.'):]: v for k, v in best_state.items() if k.startswith('_denoise_fn.')}

# Now load only the MLPDiffusion weights
best_model.load_state_dict(mlp_state_dict)


Training with d_layers=[128, 128], dropout=0.0, d_t=64
Avg Loss: 2.5348

Training with d_layers=[128, 128], dropout=0.0, d_t=128
Avg Loss: 3.1556

Training with d_layers=[128, 128], dropout=0.0, d_t=256
Avg Loss: 2.3861

Training with d_layers=[128, 128], dropout=0.1, d_t=64
Avg Loss: 3.4934

Training with d_layers=[128, 128], dropout=0.1, d_t=128
Avg Loss: 4.5561

Training with d_layers=[128, 128], dropout=0.1, d_t=256
Avg Loss: 2.8527

Training with d_layers=[128, 128], dropout=0.2, d_t=64
Avg Loss: 3.7634

Training with d_layers=[128, 128], dropout=0.2, d_t=128
Avg Loss: 4.5156

Training with d_layers=[128, 128], dropout=0.2, d_t=256
Avg Loss: 3.5587

Training with d_layers=[256, 256], dropout=0.0, d_t=64
Avg Loss: 2.8938

Training with d_layers=[256, 256], dropout=0.0, d_t=128
Avg Loss: 2.3748

Training with d_layers=[256, 256], dropout=0.0, d_t=256
Avg Loss: 2.6721

Training with d_layers=[256, 256], dropout=0.1, d_t=64
Avg Loss: 3.0296

Training with d_layers=[256, 256], dropout=

<All keys matched successfully>

In [255]:
avg_loss, state = train_and_eval(
    d_layers=[256,256,256],
    dropout=0.0,
    d_t=128,
    data_loader=data_loader,
    num_numerical=9,
    num_classes=[2],
    device=device,
    steps=10000,
    lr=0.0001
)
avg_loss

1.5333072059559372

In [None]:
def index_to_log_onehot(x, num_classes):
    onehots = []
    for i in range(len(num_classes)):
        onehots.append(F.one_hot(x[:, i], num_classes[i]))
 
    x_onehot = torch.cat(onehots, dim=1)
    log_onehot = torch.log(x_onehot.float().clamp(min=1e-30))
    return log_onehot

x = next(iter(data_loader))[0]
b = x.shape[0]
t, pt = diffusion.sample_time(b, device, 'uniform')

x_num = x[:, :diffusion.num_numerical_features]
x_cat = x[:, diffusion.num_numerical_features:]

x_num_t = x_num
log_x_cat_t = x_cat
if x_num.shape[1] > 0:
    noise = torch.randn_like(x_num)
    x_num_t = diffusion.gaussian_q_sample(x_num, t, noise=noise)
if x_cat.shape[1] > 0:
    log_x_cat = index_to_log_onehot(x_cat.long(), diffusion.num_classes)
    log_x_cat_t = diffusion.q_sample(log_x_start=log_x_cat, t=t)

x_in = torch.cat([x_num_t, log_x_cat_t], dim=1)

model_out = diffusion._denoise_fn(
    x_in,
    t
)

print(x)

print(model_out)


tensor([[ 0.2012, -0.3061, -0.3158,  ..., -1.1850, -1.0592,  1.0000],
        [ 0.9430,  1.2413,  1.2166,  ..., -1.3108, -1.0906,  1.0000],
        [-1.7153, -0.4511, -0.4940,  ...,  1.7087,  2.3661,  0.0000],
        ...,
        [-1.1589, -0.4350, -0.4583,  ..., -0.9333, -0.6193,  0.0000],
        [-0.7880,  3.1271,  3.1054,  ..., -0.4301, -0.7764,  1.0000],
        [-0.2934, -0.4189, -0.4940,  ..., -0.6817,  0.1664,  1.0000]])
tensor([[ 0.0989,  0.5668, -0.5592,  ..., -0.7411, -0.5338,  0.5286],
        [ 0.0085,  0.2609, -0.0081,  ...,  0.3516, -1.7337,  1.1427],
        [-0.1041, -0.1533,  0.2487,  ...,  0.0658, -0.6076,  0.5510],
        ...,
        [-1.0366, -0.3861,  0.4864,  ..., -0.6828, -0.6099,  0.3700],
        [ 0.1357,  0.5098, -0.1787,  ..., -0.4231, -1.6590,  2.2879],
        [ 0.1465, -1.6179,  0.8162,  ..., -0.4069, -0.4884,  0.5847]],
       grad_fn=<AddmmBackward0>)


In [None]:
def evaluate_mixed_loss(dm, data_loader, device):
    dm.eval()
    total_loss_multi = 0.0
    total_loss_gauss = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch, in data_loader:
            x_batch = batch.to(device).float()  # adjust if your loader returns tuple, etc.
            b = x_batch.size(0)
            total_samples += b

            # empty dict if no conditions, else pass your dict
            out_dict = {}

            loss_multi, loss_gauss = dm.mixed_loss(x_batch, out_dict)

            # loss_multi and loss_gauss are mean per batch, multiply by batch size for sum
            total_loss_multi += loss_multi.item() * b
            total_loss_gauss += loss_gauss.item() * b

    return (total_loss_multi + total_loss_gauss) / total_samples


In [185]:
def random_mixed_loss(dm, x, out_dict):
    b = x.shape[0]
    device = x.device
    t, pt = dm.sample_time(b, device, 'uniform')

    x_num = x[:, :dm.num_numerical_features]
    x_cat = x[:, dm.num_numerical_features:]
    
    x_num_t = x_num
    log_x_cat_t = x_cat
    if x_num.shape[1] > 0:
        noise = torch.randn_like(x_num)
        x_num_t = dm.gaussian_q_sample(x_num, t, noise=noise)
    if x_cat.shape[1] > 0:
        log_x_cat = index_to_log_onehot(x_cat.long(), dm.num_classes)
        log_x_cat_t = dm.q_sample(log_x_start=log_x_cat, t=t)
    
    x_in = torch.cat([x_num_t, log_x_cat_t], dim=1)

    model_out = torch.randn_like(x_in)

    model_out_num = model_out[:, :dm.num_numerical_features]
    model_out_cat = model_out[:, dm.num_numerical_features:]

    loss_multi = torch.zeros((1,)).float()
    loss_gauss = torch.zeros((1,)).float()
    if x_cat.shape[1] > 0:
        loss_multi = dm._multinomial_loss(model_out_cat, log_x_cat, log_x_cat_t, t, pt, out_dict) / len(dm.num_classes)
    
    if x_num.shape[1] > 0:
        loss_gauss = dm._gaussian_loss(model_out_num, x_num, x_num_t, t, noise)

    # loss_multi = torch.where(out_dict['y'] == 1, loss_multi, 2 * loss_multi)
    # loss_gauss = torch.where(out_dict['y'] == 1, loss_gauss, 2 * loss_gauss)

    return loss_multi.mean(), loss_gauss.mean()

In [186]:
def evaluate_random_mixed_loss(dm, data_loader, device):
    dm.eval()
    total_loss_multi = 0.0
    total_loss_gauss = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch, in data_loader:
            x_batch = batch.to(device).float()  # adjust if your loader returns tuple, etc.
            b = x_batch.size(0)
            total_samples += b

            # empty dict if no conditions, else pass your dict
            out_dict = {}

            loss_multi, loss_gauss = random_mixed_loss(dm, x_batch, out_dict)

            # loss_multi and loss_gauss are mean per batch, multiply by batch size for sum
            total_loss_multi += loss_multi.item() * b
            total_loss_gauss += loss_gauss.item() * b

    return (total_loss_multi + total_loss_gauss) / total_samples


In [320]:
loss= evaluate_mixed_loss(diffusion, data_loader, device)
print(f"DM dataset loss: {loss:.4f}")

loss= evaluate_random_mixed_loss(diffusion, data_loader, device)
print(f"DM dataset random loss: {loss:.4f}")


DM dataset loss: 0.7990
DM dataset random loss: 2.9847
