# Synthetic Data Generation: All Models
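This notebook loads the preprocessed training data, runs each synthetic-data generator (TVAE, CTGAN, CTABGAN, GReaT, REaLTabFormer) on it, saves the resulting synthetic datasets, and logs each run's execution time, peak memory usage, and number of generated samples.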

In [4]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.22.1-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.38.36-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.38.36-py3-none-any.whl.metadata (5.7 kB)
Collecting graphviz>=0.13.2 (from sdv)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting copulas>=0.12.1 (from sdv)
  Using cached copulas-0.12.2-py3-none-any.whl.metadata (9.4 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Using cached ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Using cached deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.17.0 (from sdv)
  Downloading rdt-1.17.0-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.21.0-py3-none-any.whl.metadata (9.4 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Using cached jmespath-1.0.1-py3-none-an

In [None]:
import sys
import os
sys.path.append(os.path.abspath("..")) 
import pandas as pd
from pathlib import Path
from src.generators.tvae_generator import TVAESynthesizerWrapper
from src.generators.ctgan_generator import CTGANSynthesizerWrapper
from src.generators.ctabgan_generator import CTABGANSynthesizerWrapper
from src.generators.great_generator import GREATSynthesizerWrapper
from src.generators.rtf_generator import RTFGeneratorWrapper


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f277ff05f50>>
Traceback (most recent call last):
  File "/home/jovyan/paper-realtab/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [13]:
# Load preprocessed training data.
# The dataset name is defined once and the CSV path is derived from it, so the
# file that is loaded and the name passed on to later cells cannot drift apart.
dataset_name = "cirrhosis"
df = pd.read_csv(f"../data/processed/{dataset_name}_train.csv")

In [14]:
# Location of the CSV run log; its directory (and any missing parents) is
# created up front so that writing the log later cannot fail on a missing folder.
log_dir = Path("../results/logs")
log_dir.mkdir(parents=True, exist_ok=True)
log_path = log_dir / "synthetic_generation_log.csv"

In [16]:
# Registry of generators, keyed by the short model name used in logs.
# Every wrapper is constructed right here except CTABGAN, which is registered
# as a zero-argument factory (lambda) and is only instantiated when called.
generators = dict(
    tvae=TVAESynthesizerWrapper(output_dir="../data/synthetic/tvae"),
    ctgan=CTGANSynthesizerWrapper(output_dir="../data/synthetic/ctgan"),
    ctabgan=lambda: CTABGANSynthesizerWrapper(
        output_dir="../data/synthetic/ctabgan",
        num_experiments=1,
    ),
    great=GREATSynthesizerWrapper(output_dir="../data/synthetic/great"),
    rtf=RTFGeneratorWrapper(output_dir="../data/synthetic/rtf"),
)

In [17]:
# CTABGAN needs an explicit per-dataset column specification, so its settings
# are kept in their own lookup table keyed by dataset name.
def _ctabgan_config(raw_csv_path, categorical, general, integer, target):
    """Assemble the CTABGAN config dict for one dataset.

    Only the fields that differ between datasets are parameters; the log,
    mixed and non-categorical column settings are empty for every dataset
    defined below. Each call returns fresh (unshared) containers.
    """
    return {
        "raw_csv_path": raw_csv_path,
        "categorical_columns": categorical,
        "log_columns": [],
        "mixed_columns": {},
        "general_columns": general,
        "non_categorical_columns": [],
        "integer_columns": integer,
        "problem_type": {"Classification": target},
    }


ctabgan_configs = {
    "diabetes": _ctabgan_config(
        raw_csv_path="../data/processed/diabetes_train.csv",
        categorical=['gender', 'hypertension', 'heart_disease', 'smoking_history', 'diabetes'],
        general=['bmi', 'HbA1c_level'],
        integer=['age', 'blood_glucose_level'],
        target='diabetes',
    ),
    "stroke": _ctabgan_config(
        raw_csv_path="../data/processed/stroke_train.csv",
        categorical=['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'stroke'],
        general=['bmi'],
        integer=['age', 'avg_glucose_level'],
        target='stroke',
    ),
    "cirrhosis": _ctabgan_config(
        raw_csv_path="../data/processed/cirrhosis_train.csv",
        categorical=['Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Drug', 'Status', 'Stage'],
        general=['Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin'],
        integer=['N_Days', 'Age'],
        target='Status',
    ),
}



In [18]:
# Initialize logs: collects one summary dict per generator run (model name,
# execution time, peak memory, sample count); converted to a DataFrame and
# written to CSV when the log is saved.
log_rows = []

In [19]:
# Run each generator on the training data and record its resource statistics.
# NOTE: rows are appended to `log_rows`, so re-running this cell without first
# re-running the cell that initializes `log_rows` duplicates entries in the log.
for name, generator in generators.items():
    print(f" Generating with {name.upper()}")
    if name == "ctabgan":
        # CTABGAN needs a dataset-specific column configuration; skip it
        # (without failing the whole run) when none is defined for this dataset.
        config = ctabgan_configs.get(dataset_name.lower())
        if config is None:
            print(f"No CTABGAN config defined for dataset: {dataset_name}")
            continue
        generator_instance = generator()  # initialize via lambda
        # Pass the dataset_name variable rather than a hard-coded literal so the
        # config lookup above and the generator call always refer to the same dataset.
        synth_data, stats = generator_instance.fit_and_generate(
            df, dataset_name=dataset_name, ctabgan_config=config
        )
    else:
        synth_data, stats = generator.fit_and_generate(df, dataset_name=dataset_name)

    # Append to log
    log_rows.append({
        "model": name,
        "execution_time_sec": stats["execution_time_sec"],
        "peak_memory_mb": stats["peak_memory_mb"],
        "n_samples": len(synth_data)
    })

 Generating with TVAE
Detecting metadata from input dataframe...
 Synthetic data saved to: ../data/synthetic/tvae/cirrhosis_tvae.csv
 Execution Time: 7.86 seconds
 Peak Memory Usage: 1.18 MB
 Generating with CTGAN
Detecting metadata from input dataframe...
Initializing CTGAN synthesizer...
Starting model training...
Training complete.
Generating synthetic data...
Synthetic data saved to: ../data/synthetic/ctgan/cirrhosis_ctgan.csv
Execution time: 13.96 seconds
Peak memory usage: 1.21 MB
 Generating with CTABGAN
Initializing CTABGAN synthesizer...
Running experiment 1/1...


100%|██████████| 150/150 [00:32<00:00,  4.55it/s]


Finished training in 33.427955865859985  seconds.
Saved: ../data/synthetic/ctabgan/cirrhosis_ctabgan_0.csv
Training time: 33.43 seconds
Peak memory: 0.67 MB
 Generating with GREAT
Initializing GReaT synthesizer...
Starting training...


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


Attempting guided sampling...


100%|██████████| 220/220 [07:19<00:00,  2.00s/it]

Saved synthetic data to: ../data/synthetic/great/cirrhosis_great.csv
Execution Time: 158.22 seconds
Peak Memory Usage: 313.21 MB
 Generating with RTF
Initializing REaLTabFormer model...
Starting training...
Computing the sensitivity threshold...
Using parallel computation!!!





Bootstrap round:   0%|          | 0/10 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    10.000000
mean     -0.001944
std       0.015552
min      -0.013889
25%      -0.012500
50%      -0.004167
75%      -0.000694
max       0.038889
dtype: float64
Sensitivity threshold: 0.02263888888888885 qt_max: 0.05


Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Step,Training Loss


  0%|          | 0/108 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Critic round: 5,                     sensitivity_threshold: 0.02263888888888885,                         val_sensitivity: 0.002592592592592591,                             val_sensitivities: [-0.005555555555555557, -0.0027777777777777783, -0.019444444444444445, -0.0027777777777777783, 0.011111111111111106, 0.03611111111111111, -0.008333333333333335, 0.005555555555555553, 0.013888888888888888, 0.0027777777777777766, -0.0027777777777777783, -0.019444444444444445, -0.019444444444444445, 0.03888888888888889, 0.011111111111111106]


Step,Training Loss


  0%|          | 0/108 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Critic round: 10,                     sensitivity_threshold: 0.02263888888888885,                         val_sensitivity: 0.015555555555555555,                             val_sensitivities: [0.02222222222222222, -0.0055555555555555575, -0.008333333333333335, 0.030555555555555558, 0.01111111111111111, 0.027777777777777776, -0.01388888888888889, -0.0027777777777777783, 0.044444444444444446, 0.005555555555555553, 0.01111111111111111, 0.016666666666666663, 0.008333333333333333, 0.07222222222222222, 0.013888888888888888]


Step,Training Loss


  0%|          | 0/108 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Critic round: 15,                     sensitivity_threshold: 0.02263888888888885,                         val_sensitivity: 0.022222222222222223,                             val_sensitivities: [0.019444444444444445, 0.013888888888888886, -0.008333333333333335, 0.025, 0.016666666666666663, 0.025, 0.013888888888888888, 0.041666666666666664, 0.058333333333333334, 0.03333333333333333, -0.008333333333333335, -0.002777777777777779, 0.008333333333333333, 0.06944444444444445, 0.027777777777777776]


Step,Training Loss


  0%|          | 0/108 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 20,                     sensitivity_threshold: 0.02263888888888885,                         val_sensitivity: 0.012592592592592591,                             val_sensitivities: [0.002777777777777775, -0.008333333333333335, -0.01388888888888889, 0.02222222222222222, 0.013888888888888888, 0.047222222222222214, 0.024999999999999998, 0.02222222222222222, 0.02222222222222222, -0.008333333333333335, 0.0027777777777777775, 0.030555555555555558, -0.008333333333333335, 0.03333333333333333, 0.005555555555555553]
Generating synthetic data...


  0%|          | 0/220 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%
Saved synthetic data to: ../data/synthetic/rtf/cirrhosis_rtf.csv
Execution Time: 37.53 seconds
Peak Memory Usage: 167.52 MB


In [20]:
# Turn the collected per-model run statistics into a table and write it to the
# CSV log, leaving out the DataFrame index column.
log_df = pd.DataFrame(data=log_rows)
log_df.to_csv(path_or_buf=log_path, index=False)
print(f" Log saved to {log_path}")

 Log saved to ../results/logs/synthetic_generation_log.csv


In [26]:
# Display the kernel's current working directory; the relative "../..." paths
# used throughout this notebook are resolved against it.
import os
os.getcwd()


'/home/jovyan/DataGen/Github/notebooks'

In [None]:
import shutil
import tempfile

# Folder you want to download: the repository root, i.e. the parent of the
# notebooks directory this notebook runs from. It is derived from the working
# directory instead of being a machine-specific absolute path.
folder_to_zip = os.path.abspath("..")
output_filename = 'github'  # archive base name; this will create github.zip

# Create the zip file. The archive is built in a temporary directory and only
# moved into the working directory afterwards: the working directory lies
# inside folder_to_zip, so writing the archive there directly would let the
# half-written file be picked up and added to itself.
with tempfile.TemporaryDirectory() as tmp_dir:
    tmp_archive = shutil.make_archive(os.path.join(tmp_dir, output_filename), 'zip', folder_to_zip)
    shutil.move(tmp_archive, f"{output_filename}.zip")
print(f"Zipped {folder_to_zip} as {output_filename}.zip")
