In [11]:
import pandas as pd
import os
from sdv.single_table import CopulaGANSynthesizer, CTGANSynthesizer, GaussianCopulaSynthesizer, TVAESynthesizer
from sdv.metadata import Metadata

In [12]:
# Read the original ("real") table that every synthesizer below is fitted on.
# The path is relative to the notebook's working directory.
real_data = pd.read_csv(filepath_or_buffer='../datasets/ben10_master.csv')

In [13]:
# Obtain the SDV metadata describing `real_data`: reuse the JSON cached next
# to the notebook when it exists, otherwise detect it from the dataframe and
# cache it so later runs load the same description.
metadata_path = 'metadata.json'
if not os.path.exists(metadata_path):
    # No cached file yet: infer the table description from the data itself.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=real_data, table_name='ben10_table')
    # Persist the detected metadata so the next run takes the load branch.
    metadata.save_to_json(metadata_path)
    print("Created and saved new metadata to metadata.json")
else:
    # Cached file present: use it as-is (it is not re-validated against real_data here).
    metadata = Metadata.load_from_json(metadata_path)
    print("Loaded metadata from metadata.json")

Loaded metadata from metadata.json


In [14]:
# Configuration for every synthesizer to train. Each entry is a dict with:
#   'name'   - label used in progress messages
#   'class'  - the SDV single-table synthesizer class to instantiate
#   'params' - keyword arguments intended for that class's constructor
#   'output' - CSV path the sampled synthetic data is written to
# All entries share the same min/max and rounding settings; the neural
# models additionally carry their training options (epochs, verbose, cuda).
synthesizers = [
    {
        'name': label,
        'class': synth_class,
        'params': {
            'enforce_min_max_values': True,
            'enforce_rounding': False,
            **training_options,
        },
        'output': output_csv,
    }
    for label, synth_class, training_options, output_csv in (
        # Uses a mix of classic statistical methods and GAN-based deep learning
        # methods to train a model and generate synthetic data.
        (
            'CopulaGAN',
            CopulaGANSynthesizer,
            {'epochs': 2000, 'verbose': True, 'cuda': True},
            '../datasets/ben10_CopulaGAN_synthetic.csv',
        ),
        # Uses GAN-based deep learning methods to train a model and generate
        # synthetic data.
        (
            'CTGAN',
            CTGANSynthesizer,
            {'epochs': 4000, 'verbose': True, 'cuda': True},
            '../datasets/ben10_CTGAN_synthetic.csv',
        ),
        # Uses classic statistical methods to train a model and generate
        # synthetic data; it has no extra training options configured.
        (
            'GaussianCopula',
            GaussianCopulaSynthesizer,
            {},
            '../datasets/ben10_GaussianCopula_synthetic.csv',
        ),
        # Uses variational autoencoder (VAE)-based neural network techniques
        # to train a model and generate synthetic data.
        (
            'TVAE',
            TVAESynthesizer,
            {'epochs': 2000, 'verbose': True, 'cuda': True},
            '../datasets/ben10_TVAE_synthetic.csv',
        ),
    )
]

In [15]:
# Fit each configured synthesizer on the real data, sample a synthetic
# dataset from it, and write that dataset to the entry's 'output' CSV.
NUM_SYNTHETIC_ROWS = 10000  # number of rows sampled from every fitted synthesizer

for synth in synthesizers:
    print(f"\nProcessing {synth['name']}...")
    SynthClass = synth['class']
    # Forward the configured params to every synthesizer. GaussianCopula used
    # to be special-cased and built from metadata alone, which silently
    # discarded its configured 'enforce_min_max_values' / 'enforce_rounding'
    # settings. An entry without a 'params' key falls back to class defaults.
    synthesizer = SynthClass(metadata, **synth.get('params', {}))
    synthesizer.fit(real_data)
    synthetic_data = synthesizer.sample(num_rows=NUM_SYNTHETIC_ROWS)
    # index=False keeps the pandas row index out of the saved CSV.
    synthetic_data.to_csv(synth['output'], index=False)
    print(f"Synthetic dataset saved to {synth['output']}")


Processing CopulaGAN...


Gen. (-0.63) | Discrim. (-0.89): 100%|██████████| 2000/2000 [02:35<00:00, 12.86it/s]


Synthetic dataset saved to ../datasets/ben10_CopulaGAN_synthetic.csv

Processing CTGAN...


Gen. (-0.60) | Discrim. (-0.49): 100%|██████████| 4000/4000 [05:40<00:00, 11.74it/s]


Synthetic dataset saved to ../datasets/ben10_CTGAN_synthetic.csv

Processing GaussianCopula...
Synthetic dataset saved to ../datasets/ben10_GaussianCopula_synthetic.csv

Processing TVAE...


Loss: 10.599: 100%|██████████| 2000/2000 [00:26<00:00, 76.32it/s]


Synthetic dataset saved to ../datasets/ben10_TVAE_synthetic.csv
