In [1]:
import pandas as pd
from scipy.io import arff
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader
import sys
import os
sys.path.append(os.path.abspath('../'))


In [2]:
file_path = 'data/raw/phpOJxGL9.arff'

data, meta = arff.loadarff(file_path)
df = pd.DataFrame(data)

df = df.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,Class
0,65.0,Female,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9,1
1,62.0,Male,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1
2,62.0,Male,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1
3,58.0,Male,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.0,1
4,72.0,Male,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.4,1


In [3]:
df_train = df[['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V2']]

In [4]:
def prepare_dataset(df, categorical_cols=None, test_size=0.2):
    df = df.copy()
    
    # Detect numeric and categorical
    if categorical_cols is None:
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = df.columns.difference(categorical_cols).tolist()

    # Encode categorical
    encoder = OrdinalEncoder()
    df[categorical_cols] = encoder.fit_transform(df[categorical_cols])

    # Scale numerical
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # Convert to torch
    X = torch.tensor(df.values, dtype=torch.float32)

    X_train, X_val = train_test_split(X, test_size=test_size, random_state=42)
    train_loader = DataLoader(TensorDataset(X_train), batch_size=256, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val), batch_size=256)

    return train_loader, val_loader, len(numerical_cols), [int(df[col].nunique()) for col in categorical_cols]


In [None]:
from torch import nn
import torch.nn.functional as F
from src.tabddpm.modules import timestep_embedding, MLP

class MLPDiffusion(nn.Module):
    """
    A simplified MLP-based diffusion model for tabular data.

    This model uses timestep embeddings and a projection layer
    to inject time information into the input before processing it
    through an MLP. It is intended for unsupervised tasks like clustering,
    where label conditioning is not required.

    Args:
        d_in (int): Input feature dimension.
        rtdl_params (dict): Parameters for the RTDL MLP (e.g., hidden sizes, depth).
        dim_t (int): Dimensionality for the timestep embedding and projection space.
    """
    def __init__(self, d_in, rtdl_params, dim_t=128):
        super().__init__()
        self.dim_t = dim_t

        # Configure MLP: input will be timestep-embedded, output must match original input size
        rtdl_params['d_in'] = dim_t
        rtdl_params['d_out'] = d_in
        self.mlp = MLP.make_baseline(**rtdl_params)

        # Project input features into timestep embedding space
        self.proj = nn.Linear(d_in, dim_t)

        # Timestep embedding network (2-layer MLP with SiLU activation)
        self.time_embed = nn.Sequential(
            nn.Linear(dim_t, dim_t),
            nn.SiLU(),
            nn.Linear(dim_t, dim_t)
        )

    def forward(self, x, timesteps):
        """
        Forward pass of the MLPDiffusion model.

        Args:
            x (Tensor): Input tensor of shape (batch_size, d_in).
            timesteps (Tensor): Timestep tensor of shape (batch_size,) representing the diffusion step.

        Returns:
            Tensor: Output tensor of shape (batch_size, d_in), same as the input dimension.
        """
        # Get timestep embedding (e.g., sinusoidal), then pass through a small MLP
        emb = self.time_embed(timestep_embedding(timesteps, self.dim_t))

        # Project input to timestep embedding space and add time information
        x = self.proj(x) + emb

        # Pass through MLP and return
        return self.mlp(x)


In [6]:
from src.tabddpm.gaussian_multinomial_diffusion import GaussianMultinomialDiffusion

def get_diffusion_model(model, num_classes, num_numerical, device):
    diffusion = GaussianMultinomialDiffusion(
        num_classes=np.array(num_classes),
        num_numerical_features=num_numerical,
        denoise_fn=model,
        gaussian_loss_type='mse',
        num_timesteps=1000,
        scheduler='cosine',
        device=device
    )
    diffusion.to(device)
    return diffusion


In [None]:
from copy import deepcopy
import torch.optim as optim

class Trainer:
    def __init__(self, model, train_loader, steps=1000, lr=1e-3, device='cuda'):
        self.model = model
        self.train_loader = train_loader
        self.steps = steps
        self.device = device
        self.optimizer = optim.AdamW(model.parameters(), lr=lr)
        self.ema_model = deepcopy(model._denoise_fn)
        for p in self.ema_model.parameters():
            p.requires_grad = False

    def train(self):
        step = 0
        iterator = iter(self.train_loader)

        while step < self.steps:
            try:
                x_batch, = next(iterator)
            except StopIteration:
                iterator = iter(self.train_loader)
                x_batch, = next(iterator)

            x_batch = x_batch.to(self.device)

            self.optimizer.zero_grad()
            loss_multi, loss_gauss = self.model.mixed_loss(x_batch, {})
            loss = loss_multi + loss_gauss
            loss.backward()
            self.optimizer.step()

            if step % 100 == 0:
                print(f"[{step}/{self.steps}] Loss: {loss.item():.4f}")

            step += 1


In [8]:
import torch

# Config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_loader, _, num_numerical, num_classes = prepare_dataset(df_train, ['V2'])

model_params = {
    'num_classes': 0,
    'is_y_cond': False,
    'rtdl_params': {
        'd_layers': [256, 256, 256],
        'dropout': 0.1
    }
}

mlp_model = MLPDiffusion(
    d_in=df.shape[1],
    num_classes=model_params['num_classes'],
    is_y_cond=model_params['is_y_cond'],
    rtdl_params=model_params['rtdl_params']
)

mlp_model.to(device)

diffusion = get_diffusion_model(mlp_model, num_classes, num_numerical, device)

trainer = Trainer(diffusion, train_loader, steps=1000, device=device)
trainer.train()


[0/1000] Loss: 2.1671
[100/1000] Loss: 2.4032
[200/1000] Loss: 1.2397
[300/1000] Loss: 1.3275
[400/1000] Loss: 0.8968
[500/1000] Loss: 0.9164
[600/1000] Loss: 0.8772
[700/1000] Loss: 0.9224
[800/1000] Loss: 1.0575
[900/1000] Loss: 0.7988


In [None]:
mlp_model(torch)

In [43]:
diffusion.sample_time(2, device, 'uniform', **{'y':2})

TypeError: GaussianMultinomialDiffusion.sample_time() got an unexpected keyword argument 'y'

In [9]:
# 1. Select an input sample x
x_orig = next(iter(train_loader))[0].to(device)[:1]  # single sample

# 2. Pick a timestep (e.g., t = 100)
t = torch.tensor([100], device=device)

# 3. Add noise to x_orig using the diffusion's q_sample
x_noisy = diffusion.q_sample(x_orig, t)

# 4. Reconstruct using the model
recon = diffusion.p_sample(x_noisy, t, y=None)


RuntimeError: The size of tensor a (10) must match the size of tensor b (2) at non-singleton dimension 1

In [None]:
x_orig

tensor([[-0.6025,  3.7074,  3.7825, -0.7522,  0.4784,  1.8017,  2.5051, -1.4366,
         -2.0334,  1.0000]])