In [290]:
import json
import os
import subprocess
import sys

import numpy as np
import pandas as pd
import toml
import tomli
from sklearn.model_selection import train_test_split

In [291]:
# Name of the dataset; it is interpolated into every ../data/<name>/ and
# ../exp/<name>/ path used further down.
dataset = "bank_latent"

In [292]:
# Entry point from the variational auto-encoder stage: read the processed
# bank table, then send every column except the last to X and the last
# column to y (both as plain arrays).
data = pd.read_csv("../../data/processed/bank.csv")
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

In [293]:
# Split row positions rather than the arrays themselves so X and y stay
# aligned: 30% of the rows are held out first, and that hold-out is then cut
# in half into validation and test. Both splits use random_state=42.
idx = np.arange(X.shape[0])
train_idx, holdout_idx = train_test_split(idx, test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(holdout_idx, test_size=0.5, random_state=42)

X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]
X_test, y_test = X[test_idx], y[test_idx]

In [294]:
# Row counts of the three partitions (written into info.json further down).
train_size, valu_size, test_size = (len(part) for part in (X_train, X_val, X_test))
print(train_size, valu_size, test_size)

3500 750 750


In [295]:
# Persist the feature partitions as X_num_{train,val,test}.npy.
# np.save does not create missing parent directories, so make sure the target
# folder exists first; otherwise a fresh run fails with FileNotFoundError.
os.makedirs(f'../data/{dataset}', exist_ok=True)
np.save(f'../data/{dataset}/X_num_train.npy', X_train)
np.save(f'../data/{dataset}/X_num_val.npy', X_val)
np.save(f'../data/{dataset}/X_num_test.npy', X_test)

In [272]:
# Persist the label partitions as y_{train,val,test}.npy.
# The directory is created here as well so this cell does not depend on any
# other cell having made it (np.save will not create it).
os.makedirs(f'../data/{dataset}', exist_ok=True)
np.save(f'../data/{dataset}/y_train.npy', y_train)
np.save(f'../data/{dataset}/y_val.npy', y_val)
np.save(f'../data/{dataset}/y_test.npy', y_test)

In [273]:
# Every column of X is saved as a numerical feature (X_num_*.npy) and no
# categorical file is written, so take the numerical count from the data
# itself instead of hard-coding it. The previous constant (14, chosen for a
# VAE latent size of 3 per client) did not match the saved arrays: the
# pipeline's captured stdout reports 13 numerical features.
num_feature_size, cat_features_size = X_train.shape[1], 0

In [274]:
# Dataset metadata that is serialised to info.json: task type, identifiers,
# partition sizes and feature counts.
info_file = dict(
    task_type="binclass",
    name=f"{dataset}",
    id=f"{dataset}--id",
    train_size=train_size,
    val_size=valu_size,
    test_size=test_size,
    n_num_features=num_feature_size,
    n_cat_features=cat_features_size,
)

In [275]:
# Create the dataset folder; exist_ok=True makes this a no-op when it is
# already there.
os.makedirs(name=f"../data/{dataset}", exist_ok=True)

In [276]:
# Serialise the metadata dict and write it next to the .npy files.
with open(f"../data/{dataset}/info.json", "w") as info_writer:
    info_writer.write(json.dumps(info_file))

In [277]:
# Create the experiment folder that will hold config.toml; exist_ok=True
# makes this a no-op when it is already there.
os.makedirs(name=f"../exp/{dataset}", exist_ok=True)

In [278]:
# Preprocessing options shared by the 'train' and 'eval' sections. The two
# sections differ only in 'normalization', so the common part is written once.
_transform_defaults = {
    'seed': 0,
    'normalization': '__none__',
    'num_nan_policy': '__none__',
    'cat_nan_policy': '__none__',
    'cat_min_frequency': '__none__',
    'cat_encoding': '__none__',
    'y_policy': 'default',
}

# Full experiment configuration; it is dumped to config.toml and handed to
# pipeline.py via --config.
config_file = dict(
    seed=0,
    parent_dir=f'../exp/{dataset}/',
    real_data_path=f'../data/{dataset}/',
    model_type='mlp',
    # Number of numerical features, taken from the value set earlier.
    num_numerical_features=num_feature_size,
    device='cpu',
    # Denoising network architecture; adjust to taste.
    model_params={
        # Input width of the denoising network. The captured pipeline stdout
        # prints model_params with d_in = 13, so this value is presumably
        # recomputed from the data by the pipeline -- TODO confirm.
        'd_in': 15,
        'num_classes': 2,
        'is_y_cond': True,
        'rtdl_params': {'d_layers': [128, 512], 'dropout': 0.0},
    },
    diffusion_params={'num_timesteps': 1000, 'gaussian_loss_type': 'mse'},
    train={
        'main': {
            'steps': 30000,
            'lr': 1.1510940031144828e-05,
            'weight_decay': 0.0,
            'batch_size': 4096,
        },
        # Same keys, same order as the defaults; only normalization changes.
        'T': {**_transform_defaults, 'normalization': 'quantile'},
    },
    sample={'num_samples': 5000, 'batch_size': 500, 'seed': 0},
    eval={
        'type': {'eval_model': 'catboost', 'eval_type': 'synthetic'},
        # Copy so the eval section does not share a dict object with the base.
        'T': dict(_transform_defaults),
    },
)

In [279]:
# Render the configuration as TOML text and store it in the experiment folder.
with open(f"../exp/{dataset}/config.toml", 'w') as toml_file:
    toml_file.write(toml.dumps(config_file))

In [287]:
# Train the diffusion model and then sample from it by running pipeline.py in
# a child process, using the config written to ../exp/<dataset>/config.toml.
# "pipeline.py" is a relative path, so it is resolved against the notebook's
# current working directory.
command = [
    # Use the interpreter that runs this kernel rather than whichever
    # "python" happens to be first on PATH, so the child sees the same
    # installed packages.
    sys.executable,
    "pipeline.py",
    "--config", f"../exp/{dataset}/config.toml",
    "--train",
    "--sample",
]
# Output is captured (as text) into result.stdout / result.stderr instead of
# being streamed, so nothing is shown while the child runs.
result = subprocess.run(command, capture_output=True, text=True)

# Because stderr is captured, a crash in the child would otherwise go
# unnoticed. Report it, but do not raise, so `result` stays available for
# inspection.
if result.returncode != 0:
    print(f"pipeline.py exited with code {result.returncode}; inspect result.stderr")

In [288]:
# Show the text the pipeline subprocess wrote to standard output.
result.stdout

"[0]\n13\n{'d_in': 13, 'num_classes': 2, 'is_y_cond': True, 'rtdl_params': {'d_layers': [128, 512], 'dropout': 0.0}}\nmlp\n******* Number of class: ********* 2 15\n"

In [289]:
# Show the text the pipeline subprocess wrote to standard error
# (a traceback appears here when the run fails).
result.stderr

'Traceback (most recent call last):\n  File "/Users/arnobchowdhury/Documents/university_work/Thesis/Silofuse/central_backbone/scripts/pipeline.py", line 112, in <module>\n    main()\n  File "/Users/arnobchowdhury/Documents/university_work/Thesis/Silofuse/central_backbone/scripts/pipeline.py", line 48, in main\n    train(\n  File "/Users/arnobchowdhury/Documents/university_work/Thesis/Silofuse/central_backbone/scripts/train.py", line 153, in train\n    trainer.run_loop()\n  File "/Users/arnobchowdhury/Documents/university_work/Thesis/Silofuse/central_backbone/scripts/train.py", line 55, in run_loop\n    batch_loss_multi, batch_loss_gauss = self._run_step(x, out_dict)\n  File "/Users/arnobchowdhury/Documents/university_work/Thesis/Silofuse/central_backbone/scripts/train.py", line 39, in _run_step\n    loss_multi, loss_gauss = self.diffusion.mixed_loss(x, out_dict)\n  File "/Users/arnobchowdhury/Documents/university_work/Thesis/Silofuse/central_backbone/scripts/tab_ddpm/gaussian_multinomi

In [111]:
# Sanity check on the saved training features: report every entry that is at
# least LARGE_VALUE_THRESHOLD. Only the upper side is tested; large negative
# values and NaN are not flagged by this comparison.
LARGE_VALUE_THRESHOLD = 1000000000.0

# Read the file back from the folder of the configured dataset instead of a
# hard-coded name, so the check follows `dataset`.
# NOTE: this rebinds `data`, which an earlier cell uses for the DataFrame.
data = np.load(f'../data/{dataset}/X_num_train.npy')
large_values = data >= LARGE_VALUE_THRESHOLD
print("Positions of large values:")
print(np.where(large_values))

# Optionally, print the actual values
print("Large values found:")
print(data[large_values])

Positions of large values:
(array([], dtype=int64), array([], dtype=int64))
Large values found:
[]


NameError: name 'np' is not defined