In [None]:
from sklearn.datasets import load_iris

# Load the built-in dataset
iris = load_iris(as_frame=True)
df = iris.frame
df["target"] = iris.target

# Convert to one-vs-many binary: class 0 = positive, others = negative
df["target_binary"] = (df["target"] == 0).astype(int)

# Drop target
df.drop(columns=["target"])

# Save it to CSV
df.to_csv("data/iris_setosa_vs_all.csv", index=False)

print("✅ Saved Iris dataset as iris.csv in the current directory.")
df

✅ Saved Iris dataset as iris.csv in the current directory.


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_binary
0,5.1,3.5,1.4,0.2,0,1
1,4.9,3.0,1.4,0.2,0,1
2,4.7,3.2,1.3,0.2,0,1
3,4.6,3.1,1.5,0.2,0,1
4,5.0,3.6,1.4,0.2,0,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,0
146,6.3,2.5,5.0,1.9,2,0
147,6.5,3.0,5.2,2.0,2,0
148,6.2,3.4,5.4,2.3,2,0


In [31]:
import warnings
from pathlib import Path

import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.sampling import Condition
from sdv.single_table import CopulaGANSynthesizer, CTGANSynthesizer, GaussianCopulaSynthesizer, TVAESynthesizer
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")


data_path = Path("./data")  # TODO: Change this to the path of the original data (directory)
synt_path = Path("./synthetic_data")  # TODO: Change this to the path of the synthetic data directory
data_file_original = "iris_setosa_vs_all.csv"

data = pd.read_csv(data_path / data_file_original, low_memory=False)

LABEL = "target_binary"  # TODO: Change this to the target column
ID = None  # TODO: Change this to the ID column if exists

train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data[LABEL],
)


train_data.to_csv(data_path / "train.csv", index=False)  # File
test_data.to_csv(data_path / "test.csv", index=False)

In [32]:
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)
metadata_dict = metadata.to_dict()

train_data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_binary
83,6.0,2.7,5.1,1.6,1,0
6,4.6,3.4,1.4,0.3,0,1
104,6.5,3.0,5.8,2.2,2,0
105,7.6,3.0,6.6,2.1,2,0
69,5.6,2.5,3.9,1.1,1,0
...,...,...,...,...,...,...
123,6.3,2.7,4.9,1.8,2,0
55,5.7,2.8,4.5,1.3,1,0
59,5.2,2.7,3.9,1.4,1,0
57,4.9,2.4,3.3,1.0,1,0


### Choose between binary or non-binary target

In [33]:
total_samples = train_data.shape[0]

# If the target column is not binary, you can use the following code to create a condition that converts this to a one-to-many classification problem.
# This is necessary so that we have the same label distribution in the syntehtic dataset.
category_counts = train_data[LABEL].value_counts()
print(category_counts)
target_a = int(category_counts.get(1, 0))
target_b = int(category_counts.get(0, 0))

true_condition = Condition(num_rows=target_a, column_values={LABEL: 1})
false_condition = Condition(num_rows=target_b, column_values={LABEL: 0})

# IF the target column is binary, you can use the following code to create a condition and comment the above code
# true_samples = train_data[LABEL].sum()
# false_samples = total_samples - true_samples
# true_condition = Condition(num_rows=true_samples, column_values={LABEL: True})
# false_condition = Condition(num_rows=false_samples, column_values={LABEL: False})

target_binary
0    80
1    40
Name: count, dtype: int64


### CopulaGAN

In [34]:
copula_gan_synthesizer = CopulaGANSynthesizer(metadata)
copula_gan_synthesizer.fit(train_data)
copula_gan_synthetic_data = copula_gan_synthesizer.sample_from_conditions(conditions=[true_condition, false_condition])
copula_gan_synthetic_data.to_csv(synt_path / "CopulaGAN.csv", index=False)

Sampling conditions: 100%|██████████| 120/120 [00:01<00:00, 76.84it/s]


### CTGAN

In [35]:
ctgan_synthesizer = CTGANSynthesizer(metadata)
ctgan_synthesizer.fit(train_data)
ctgan_synthetic_data = ctgan_synthesizer.sample_from_conditions(conditions=[true_condition, false_condition])
ctgan_synthetic_data.to_csv(synt_path / "CTGAN.csv", index=False)

Sampling conditions: 100%|██████████| 120/120 [00:01<00:00, 90.66it/s]


### Gaussian Copula

In [36]:
gaussian_copula_synthesizer = GaussianCopulaSynthesizer(metadata)
gaussian_copula_synthesizer.fit(train_data)
gaussian_copula_synthetic_data = gaussian_copula_synthesizer.sample_from_conditions(
    conditions=[true_condition, false_condition],
)
gaussian_copula_synthetic_data.to_csv(synt_path / "GaussianCopula.csv", index=False)

Sampling conditions: 100%|██████████| 120/120 [00:00<00:00, 1809.95it/s]


### TAVE

In [37]:
tvae_synthesizer = TVAESynthesizer(metadata)
tvae_synthesizer.fit(train_data)
tvae_synthetic_data = tvae_synthesizer.sample_from_conditions(conditions=[true_condition, false_condition])
tvae_synthetic_data.to_csv(synt_path / "TVAE.csv", index=False)

Sampling conditions: 100%|██████████| 120/120 [00:01<00:00, 67.42it/s]


### Gaussian Multivariate

In [None]:
from synthius.model import GaussianMultivariateSynthesizer  # Had to rerun this twice

gaussian_multivariate_synthesizer = GaussianMultivariateSynthesizer(train_data, synt_path)
gaussian_multivariate_synthesizer.synthesize(num_sample=total_samples)

2025-10-21 14:40:40.326829: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-21 14:40:40.356550: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-21 14:40:40.356646: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-21 14:40:40.379208: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
INFO:copulas.multivariate.gaussian:Fitting GaussianM

### WGAN

Based on the size of the data and its complexity, HP may need some changes.

This has some nasty bugs that need fixing. Maybe it works in 3.0??

In [None]:
from synthius.data import DataImputationPreprocessor
from synthius.model import WGAN

data_preprocessor = DataImputationPreprocessor(train_data)
processed_train_data = data_preprocessor.fit_transform()

n_features = processed_train_data.shape[1]
wgan_imputer = WGAN(n_features=n_features, base_nodes=128, batch_size=512, critic_iters=5, lambda_gp=10.0, num_epochs=100000)  # Num_epochs in main branch is an unexpected kwarg - removal yields a nasty bug...
wgan_imputer.train(processed_train_data, log_interval=5000, log_training=True) # Dataframe is not an iterator...

wgan_synthetic_samples = wgan_imputer.generate_samples(total_samples)
wgan_synthetic_data = pd.DataFrame(wgan_synthetic_samples, columns=processed_train_data.columns)

# --------------------- Decoding --------------------- #
decoded_wgan_synthetic_data = data_preprocessor.inverse_transform(wgan_synthetic_data)
# --------------------- Saving   --------------------- #
decoded_wgan_synthetic_data.to_csv(synt_path / "WGAN.csv", index=False)

TypeError: WGAN.__init__() got an unexpected keyword argument 'num_epochs'

### ARF

In [53]:
from synthius.model import ARF

model = ARF(x=train_data, id_column=ID, min_node_size=5, num_trees=50, max_features=0.3)
forde = model.forde()
synthetic_data_arf = model.forge(n=total_samples)

synthetic_data_arf.to_csv(synt_path / "ARF.csv", index=False)

Initial accuracy is 0.9416666666666667
Iteration number 1 reached accuracy of 0.3875.
