## SDG
Synthetic data is generated for each settings pickle found in '../pickles/settings';
each settings object points to its original dataset in the folder '../data'.

In [1]:
# necessary imports for the section
from sdv.tabular import CTGAN

import pickle
import pandas as pd 
import os 
import sys 

#TODO, use YML config file
sys.path.append('../src')
from utils import getPicklesFromDir

In [2]:
# Collect the dataset settings stored as pickles in the settings folder.
# getPicklesFromDir is the project helper imported from ../src/utils;
# presumably it returns one settings object per pickle file -- verify in utils.
# TODO: read this location from a YML config file instead of hard-coding it
settings_dir = '../pickles/settings/'
dataset_settings = getPicklesFromDir(settings_dir)

[{'meta_data': {'name': 'D0-Diabetes',
   'id': 'D0',
   'path': '../data/diabetes.csv',
   'target': 'Outcome',
   'ordinal_features': None,
   'numeric_features': ['DiabetesPedigreeFunction',
    'BMI',
    'Insulin',
    'Glucose',
    'Age',
    'SkinThickness',
    'BloodPressure',
    'Pregnancies'],
   'text_features': None,
   'categorical_features': None},
  'setup_param': {'target': 'Outcome',
   'train_size': 0.8,
   'fold_strategy': 'stratifiedkfold',
   'fold': 10,
   'ordinal_features': None,
   'numeric_features': ['DiabetesPedigreeFunction',
    'BMI',
    'Insulin',
    'Glucose',
    'Age',
    'SkinThickness',
    'BloodPressure',
    'Pregnancies'],
   'text_features': None,
   'categorical_features': None,
   'imputation_type': 'simple',
   'numeric_imputation': 'mean',
   'categorical_imputation': 'mode',
   'iterative_imputation_iters': 10,
   'numeric_iterative_imputer': 'lightgbm',
   'categorical_iterative_imputer': 'lightgbm',
   'text_features_method': 'tf-i

In [3]:
# SDG hyper-parameters keyed by quality level; the different levels are meant
# to yield synthetic data of varying quality.

# TODO: further define settings, this is probably not enough
quality_params = dict(
    Q1=dict(epochs=10, batch_size=10),
    Q2=dict(epochs=50, batch_size=50),
#   Q3=dict(epochs=250, batch_size=500),
#   Q4=dict(epochs=1250, batch_size=1000),
)

# TODO: move to experiment_settings
# Integer multiplier for the number of rows to generate for each setting:
#   rows = sd_size_factor * len(original_dataset)
# e.g. 2 means double the number of samples in the original dataset.
sd_size_factor = 1

# TODO: move to experiment_settings
# How many synthetic datasets to generate for each setting.
num_SD = 2

Pseudocode:

```
for each pickle (setting):
    load original_data
    for each quality in quality_params:
        repeat num_SD times:
            create model with sdg_param and the quality parameters
            train model on original_data
            generate one synthetic dataset with:
                num_rows = sd_size_factor * len(original_data)

            save synthetic dataset in the data folder
            save SDG
```

In [6]:
# Scratch cell: preview the names produced by the SDG naming scheme.
# Layout: "S" + dataset id + quality key + "_" + copy number,
# e.g. SD1Q1_2 means the SDG trained on dataset D1 with quality Q1, copy number 2.
dataset_id = dataset_settings[0]['meta_data']['id']  # only the first dataset is previewed
for quality in quality_params:
    for itr in range(num_SD):
        sdg_name = "S{}{}_{}".format(dataset_id, quality, itr)
        print(sdg_name)

SD0Q1_0
SD0Q1_1
SD0Q2_0
SD0Q2_1


The following cell trains the SDG models, generates the synthetic data, and then saves both each SDG model and its generated dataset.

In [7]:
# run SDG generation
#
# For every dataset setting and every quality level, train num_SD CTGAN models,
# sample one synthetic dataset from each, and save both the data and the model.

# make sure the model output folder exists up front, so a missing directory
# cannot make model.save fail after a training run has already completed
os.makedirs("../pickles/SDGs", exist_ok=True)

for settings in dataset_settings:
    # for each dataset specific settings

    # load original dataset
    original_data = pd.read_csv(settings['meta_data']['path'])

    # get the size for generated synthetic data
    original_data_size = len(original_data)
    sd_size = original_data_size * sd_size_factor

    # loop through the different quality parameters for the SDG
    for quality in quality_params:

        # create num_SD SDGs and synthetic datasets for validating results
        for itr in range(num_SD):

            # creates SDG model name, using dataset id, quality key, and itr number
            # e.g. SD1Q1_2 means SDG trained on dataset D1 with quality Q1 and copy num 2
            # The id must come from the dataset currently being processed
            # (`settings`), not from dataset_settings[0]; otherwise every dataset
            # would be saved under the first dataset's name and overwrite it.
            sdg_name = f"S{settings['meta_data']['id']}{quality}_{itr}"

            # creates model with sdg_param and quality_param as parameters
            model = CTGAN(**settings['sdg_param'], **quality_params[quality])
            model.fit(original_data)

            # generate synthetic data
            synthetic_data = model.sample(num_rows=sd_size)

            # save the synthetic dataset
            # NOTE(review): to_csv also writes the DataFrame index as an unnamed
            # first column; confirm downstream readers expect that (otherwise
            # pass index=False here).
            synthetic_data.to_csv(f"../data/{sdg_name}.csv")
            # saves the model using cloudpickle
            model.save(f"../pickles/SDGs/{sdg_name}.pkl")