# Synthetic Data Generation: All Models
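This notebook loads the preprocessed training data, runs each configured generator, saves the resulting synthetic datasets, and logs per-model statistics (execution time, peak memory, and number of generated samples).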

In [4]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.22.1-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.38.36-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.38.36-py3-none-any.whl.metadata (5.7 kB)
Collecting graphviz>=0.13.2 (from sdv)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting copulas>=0.12.1 (from sdv)
  Using cached copulas-0.12.2-py3-none-any.whl.metadata (9.4 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Using cached ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Using cached deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.17.0 (from sdv)
  Downloading rdt-1.17.0-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.21.0-py3-none-any.whl.metadata (9.4 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Using cached jmespath-1.0.1-py3-none-an

In [1]:
# Configure TorchDynamo before any generator code is imported or run.
import torch._dynamo
# Ask Dynamo to suppress its own errors instead of raising them
# (presumably falling back to eager execution; confirm against the PyTorch docs).
torch._dynamo.config.suppress_errors = True
# NOTE(review): the object returned here (the DisableContext shown as this cell's output) is not
# used as a decorator or context manager; confirm that the bare call really disables Dynamo
# globally, otherwise this line has no effect.
torch._dynamo.disable()


<torch._dynamo.eval_frame.DisableContext at 0x7efd62957ed0>

In [2]:
import sys
import os
# Put the parent directory on the import path so the `src` package imported below resolves
# (presumably the repository root; verify against the project layout).
# This must stay above the `src.*` imports.
sys.path.append(os.path.abspath("..")) 
import pandas as pd
from pathlib import Path
# Generator wrappers; the run loop in this notebook calls `fit_and_generate` on them.
from src.generators.tvae_generator import TVAESynthesizerWrapper
from src.generators.ctgan_generator import CTGANSynthesizerWrapper
from src.generators.ctabgan_generator import CTABGANSynthesizerWrapper
from src.generators.great_generator import GREATSynthesizerWrapper
from src.generators.rtf_generator import RTFGeneratorWrapper
# NOTE(review): match_format is not referenced in the cells visible here; confirm it is still needed.
from src.utils.postprocess import match_format


In [3]:
# Dataset to synthesise. This single name selects the training CSV loaded below and is
# also used by the run loop to look up the matching CTABGAN config.
dataset_name = "stroke"

# Load preprocessed training data; the path is derived from dataset_name so the
# file that is read can never disagree with the name that is logged.
df = pd.read_csv(f"../data/processed/{dataset_name}_train.csv")

In [4]:
# Output log file: one CSV per dataset, named after dataset_name so that switching
# datasets cannot silently overwrite another dataset's log.
log_path = Path("../results/logs") / f"synthetic_generation_log_{dataset_name}.csv"
# Create the log directory (and any missing parents) up front; a no-op if it already exists.
log_path.parent.mkdir(parents=True, exist_ok=True)

In [9]:
# Generators to run, keyed by model name.
# Only CTABGAN is active. It is stored as a zero-argument factory, so the wrapper is
# constructed when the factory is called rather than when this cell executes.
# Disabled entries, kept for reference:
#   "tvae":  TVAESynthesizerWrapper(output_dir="../data/synthetic/tvae")
#   "ctgan": CTGANSynthesizerWrapper(output_dir="../data/synthetic/ctgan")
#   "great": GREATSynthesizerWrapper(output_dir="../data/synthetic/great")
#   "rtf":   RTFGeneratorWrapper(output_dir="../data/synthetic/rtf")
generators = dict(
    ctabgan=lambda: CTABGANSynthesizerWrapper(
        output_dir="../data/synthetic/ctabgan",
        num_experiments=1,
    ),
)

In [10]:
# Per-dataset CTABGAN settings, keyed by lower-case dataset name.
# Each entry holds the training CSV path, the column groupings handed to the CTABGAN
# wrapper, and the target column under "problem_type".
ctabgan_configs = {
    "diabetes": dict(
        raw_csv_path="../data/processed/diabetes_train.csv",
        categorical_columns=[
            "gender", "hypertension", "heart_disease", "smoking_history", "diabetes",
        ],
        log_columns=[],
        mixed_columns={},
        general_columns=["bmi", "HbA1c_level"],
        non_categorical_columns=[],
        integer_columns=["age", "blood_glucose_level"],
        problem_type={"Classification": "diabetes"},
    ),
    "stroke": dict(
        raw_csv_path="../data/processed/stroke_train.csv",
        categorical_columns=[
            "gender", "hypertension", "heart_disease", "ever_married",
            "work_type", "Residence_type", "smoking_status", "stroke",
        ],
        log_columns=[],
        mixed_columns={},
        general_columns=["bmi"],
        non_categorical_columns=[],
        integer_columns=["age", "avg_glucose_level"],
        problem_type={"Classification": "stroke"},
    ),
    "cirrhosis": dict(
        raw_csv_path="../data/processed/cirrhosis_train.csv",
        categorical_columns=[
            "Sex", "Ascites", "Hepatomegaly", "Spiders",
            "Edema", "Drug", "Status", "Stage",
        ],
        log_columns=[],
        mixed_columns={},
        general_columns=[
            "Cholesterol", "Albumin", "Copper", "Alk_Phos",
            "SGOT", "Tryglicerides", "Platelets", "Prothrombin",
        ],
        non_categorical_columns=[],
        integer_columns=["N_Days", "Age"],
        problem_type={"Classification": "Status"},
    ),
}


In [11]:
# Start with an empty accumulator; the run loop appends one stats dict per generator.
log_rows = list()

In [12]:
# Run each generator on the training data and record one log row per model.
for name, generator in generators.items():
    print(f" Generating with {name.upper()}")
    if name == "ctabgan":
        # CTABGAN needs a per-dataset column configuration; skip the model if none is defined.
        config = ctabgan_configs.get(dataset_name.lower())
        if config is None:
            print(f"No CTABGAN config defined for dataset: {dataset_name}")
            continue
        generator_instance = generator()  # the ctabgan entry is a zero-argument factory
        synth_data, stats = generator_instance.fit_and_generate(df, dataset_name, ctabgan_config=config)
    else:
        # Every other entry is used directly as a generator object.
        synth_data, stats = generator.fit_and_generate(df, dataset_name)

    # Drop any row left behind by an earlier execution of this cell for the same model,
    # so re-running the cell replaces that entry instead of appending a duplicate.
    # Slice assignment keeps the same list object that other cells refer to.
    log_rows[:] = [row for row in log_rows if row["model"] != name]

    # Append to log
    log_rows.append({
        "model": name,
        "execution_time_sec": stats["execution_time_sec"],
        "peak_memory_mb": stats["peak_memory_mb"],
        "n_samples": len(synth_data)
    })

 Generating with CTABGAN
Initializing CTABGAN synthesizer...
Running experiment 1/1...


100%|██████████| 150/150 [02:44<00:00,  1.10s/it]

Finished training in 166.99334025382996  seconds.
Saved: ../data/synthetic/ctabgan/stroke_ctabgan_0.csv
Training time: 167.00 seconds
Peak memory: 2.83 MB





In [20]:
# Save log to CSV
log_df = pd.DataFrame(log_rows)
if not log_rows:
    # Nothing was generated (e.g. every generator was skipped): do not overwrite an
    # existing log for this dataset with an empty file.
    print(f" No log rows collected; {log_path} left unchanged")
else:
    log_df.to_csv(log_path, index=False)
    print(f" Log saved to {log_path}")

 Log saved to ../results/logs/synthetic_generation_log.csv
