# Step 3: SDG

In [1]:
# necessary imports for the section
from sdv.single_table import TVAESynthesizer as TVAE
from sdv.metadata import SingleTableMetadata

import pickle
import pandas as pd 
import os 
import sys
from io import StringIO

sys.path.append('../src')
from utils import (getPicklesFromDir, 
                   getExperimentConfig, 
                   extract_loss_info_from_stdout, 
                   create_loss_plot)

# Load the global experiment configuration (helper imported from ../src/utils)
config = getExperimentConfig()
# Folder settings of the experiment; entries are looked up by key (e.g. 'settings_dir')
folders = config['folders']
# Load the dataset-specific settings from the pickles in the settings directory
# (presumably one settings object per dataset -- verify against getPicklesFromDir)
dataset_settings = getPicklesFromDir(folders['settings_dir'])

In [2]:
def capture_stdout(func):
    """Decorator that captures everything ``func`` prints to ``sys.stdout``.

    While ``func`` runs, ``sys.stdout`` is redirected to an in-memory buffer.
    The original stream is always restored afterwards, even if ``func``
    raises, so a failing call cannot leave the notebook with a silenced
    stdout.

    Parameters
    ----------
    func : callable
        The function whose printed output should be captured.

    Returns
    -------
    callable
        A wrapper that accepts the same arguments as ``func`` and returns a
        tuple ``(func_output, captured_stdout)``, where ``captured_stdout`` is
        the text written to stdout during the call.

    Raises
    ------
    Any exception raised by ``func`` is propagated unchanged (the captured
    text is discarded in that case).
    """
    # Local import: keeps the decorator self-contained without touching the
    # notebook's import cell.
    from functools import wraps

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Save the original stdout so it can be put back afterwards
        original_stdout = sys.stdout

        # Temporarily redirect stdout to an in-memory buffer
        buffer = StringIO()
        sys.stdout = buffer
        try:
            # Call the original function and get its output
            func_output = func(*args, **kwargs)

            # Retrieve the captured text before the buffer is closed
            captured_stdout = buffer.getvalue()
        finally:
            # Restore stdout and release the buffer on success AND on failure
            sys.stdout = original_stdout
            buffer.close()

        # Return both the function output and the captured stdout
        return func_output, captured_stdout

    return wrapper

@capture_stdout
def train_sdg_model(model, data, sdg_name):
    """Fit ``model`` on ``data`` between ``#START#`` / ``#END#`` marker lines.

    The generator name is printed right after the start marker, then
    ``model.fit(data)`` is called, then the end marker is printed.

    Because of the ``capture_stdout`` decorator, callers receive a tuple
    ``(model, captured_stdout)`` instead of just the model.
    """
    # Opening marker and the generator name, one per line
    print("#START#", sdg_name, sep="\n")

    model.fit(data)

    # Closing marker
    print("#END#")
    return model

In [6]:
# Specify datasets by Id, if None, all is run
run_dataset = config['run_dataset']

# TVAE settings from the experiment config:
#   quality_params - mapping of quality key -> keyword arguments passed to the TVAE synthesizer
#   sd_size_factor - factor multiplied with the original dataset size to get the synthetic dataset size
#   num_SD         - number of synthetic datasets sampled from each fitted synthesizer
quality_params = config['tvae_param']['quality_params']
sd_size_factor = config['tvae_param']['sd_size_factor']
num_SD = config['tvae_param']['num_sd']


# run SDG generation
# for each dataset specific settings
for s_index, settings in enumerate(dataset_settings):

    meta = settings['meta']
    dataset_id = meta['id']

    # skip datasets that were not selected for this run
    if run_dataset is not None and dataset_id not in run_dataset:
        continue

    metadata = SingleTableMetadata().load_from_json(meta['meta_filepath'])
    experiment_name = f"{dataset_id}-SDG"

    # load original dataset
    # The optional per-column dtypes must be bound to `cols_dtype`, the name that
    # is handed to read_csv; otherwise they are silently ignored.
    cols_dtype = meta['cols_dtype'] if 'cols_dtype' in meta else None

    original_data = pd.read_csv(f"{folders['real_dir']}{meta['filename']}", dtype=cols_dtype)

    # get the size to generate the synthetic data
    original_data_size = len(original_data)
    # NOTE(review): this is a float when sd_size_factor is a float -- confirm
    # that model.sample() accepts a non-integer num_rows in that case.
    sd_size = original_data_size * sd_size_factor

    logg_tags = {'Source': dataset_id}

    # loop through the different quality parameters for the SDG
    for quality in quality_params:

        display(f"Start: SDG-{dataset_id}{quality}")
        logg_tags['Quality'] = quality

        sdg_name = f"S{dataset_id}{quality}"
        # Get path to save the artifacts, relative to notebooks dir
        # (only referenced by the disabled loss-export code below)
        artifact_path='../data/result/SDG/'

        # creates the model with the quality-specific parameters as keyword arguments
        model = TVAE(metadata=metadata, **quality_params[quality])

        if 'sdg_constraints' in meta:
            model.add_constraints(constraints=meta['sdg_constraints'])

        # train the model; the marker lines delimit the training output of this SDG
        print("#START#")
        print(sdg_name)
        model.fit(original_data)
        print("#END#")

        # Disabled: training through train_sdg_model to capture stdout, then
        # extracting the loss, plotting it and saving plot + loss data.
        #model, stdout_loss = train_sdg_model(model, original_data, sdg_name)
        #loss_dict = extract_loss_info_from_stdout(stdout_loss)
        #create and save loss plot
        #fig = create_loss_plot(sdg_name, loss_dict[sdg_name])
        #fig_path = f"{artifact_path}/{sdg_name}_loss.png"
        #fig.savefig(fig_path)
        #save loss data
        #loss_df_path = f"{artifact_path}/{sdg_name}_loss.csv"
        #loss_dict[sdg_name].to_csv(loss_df_path, index=False)

        # saves the fitted SDG model
        model_path = f"{folders['SDGs_dir']}/{sdg_name}_TVAE.pkl"
        model.save(model_path)

        # sample num_SD synthetic datasets from the fitted model for validating results
        for itr in range(num_SD):

            # creates Synthetic dataset name, using dataset id, quality key, and itr number
            # e.g. SD1Q1_2 means SDG trained on dataset D1 with quality Q1 and copy num 2
            SD_name = f"S{dataset_id}{quality}_{itr}"

            # relative file path for the synthetic dataset
            sd_path = f"{folders['sd_dir']}{SD_name}_TVAE.csv"

            # generate synthetic data
            synthetic_data = model.sample(num_rows=sd_size)

            # save the synthetic dataset
            synthetic_data.to_csv(sd_path, index=False)

'Start: SDG-D305Q1000'

#START#
SD305Q1000
#END#


Sampling rows: 100%|███████████████████████████████████| 3500/3500 [01:16<00:00, 45.82it/s]
Sampling rows: 100%|███████████████████████████████████| 3500/3500 [01:16<00:00, 45.99it/s]
Sampling rows: 100%|███████████████████████████████████| 3500/3500 [01:16<00:00, 45.51it/s]
Sampling rows: 100%|███████████████████████████████████| 3500/3500 [01:18<00:00, 44.60it/s]
Sampling rows: 100%|███████████████████████████████████| 3500/3500 [01:15<00:00, 46.28it/s]
Sampling rows: 100%|███████████████████████████████████| 3500/3500 [01:14<00:00, 47.17it/s]
Sampling rows: 100%|███████████████████████████████████| 3500/3500 [01:17<00:00, 45.15it/s]
Sampling rows: 100%|███████████████████████████████████| 3500/3500 [01:18<00:00, 44.60it/s]
Sampling rows: 100%|███████████████████████████████████| 3500/3500 [01:14<00:00, 47.02it/s]
Sampling rows: 100%|███████████████████████████████████| 3500/3500 [01:14<00:00, 47.22it/s]


---