# Step 1: Data preperation, D3-minst

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from ydata_profiling import ProfileReport

import os
import sys
import yaml

# Display all available datsets
for dirname, _, filenames in os.walk('../data/raw/'):
    for filename in filenames:
        display(os.path.join(dirname, filename))

'../data/raw/.gitkeep'

'../data/raw/bank-full.csv'

'../data/raw/bank-names.txt'

'../data/raw/bank.csv'

'../data/raw/bank.zip'

'../data/raw/dataset_us_diabetes.zip'

'../data/raw/diabetes.csv'

'../data/raw/health-insurance.zip'

'../data/raw/mnist.zip'

'../data/raw/titanic.csv'

'../data/raw/mnist\\mnist_test.csv'

'../data/raw/mnist\\mnist_train.csv'

In [2]:
sys.path.append('../src')
from utils import getExperimentConfig

# Get global experiment settings
config = getExperimentConfig()
folders = config['folders']

In this section the data will be examined for selecting the preprocessing and model of the original dataset. This pipeline of preprocessing will then be save for executing on the respective synthetic dataset.


This section will be done independently for each dataset that will be explored, with the hopes that rest of the steps of the experiment can be automized.

In [3]:
data_filename = "mnist.csv"
data_id = "D3"
data_name = "mnist"

train_data_fp = f"{folders['raw_dir']}{data_name}/{data_name}_train.csv"
test_data_fp = f"{folders['raw_dir']}{data_name}/{data_name}_test.csv"
train_data = pd.read_csv(train_data_fp)

# get column names, to convert to uint8, all values are between 0-255
columns = train_data.columns.tolist()
cols_dtype = {col: 'UInt8' for col in columns}
cols_dtype['label'] = 'UInt8'

# load data and concat to a single table
train_data = pd.read_csv(train_data_fp, dtype=cols_dtype)
test_data =  pd.read_csv(test_data_fp, dtype=cols_dtype)
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
#pd.set_option('display.max_columns', None)
display(data.head())
display(data.info(verbose=False, memory_usage="deep"))

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Columns: 785 entries, label to 28x28
dtypes: UInt8(785)
memory usage: 104.8 MB


None

In [4]:
data.describe().T.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,70000.0,4.45,2.89,0.0,2.0,4.0,7.0,9.0
1x1,70000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1x2,70000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1x3,70000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1x4,70000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
28x24,70000.0,0.0,0.32,0.0,0.0,0.0,0.0,62.0
28x25,70000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28x26,70000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28x27,70000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
#from pycaret.classification import *

# run eda
#s = setup(data, target='label', verbose=True)
#eda()

In [6]:
#ProfileReport(data, minimal=True, explorative=True)

In [7]:
print("=== Null values: ===\n")
display(data.isnull().sum())
print("\n=== Data types: === \n")
display(data.info(verbose=False, memory_usage="deep"))

=== Null values: ===



label    0
1x1      0
1x2      0
1x3      0
1x4      0
        ..
28x24    0
28x25    0
28x26    0
28x27    0
28x28    0
Length: 785, dtype: int64


=== Data types: === 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Columns: 785 entries, label to 28x28
dtypes: UInt8(785)
memory usage: 104.8 MB


None

### Define metadata for the dataset
The following cells in this section is for defining the dataset specific settings that are needed to run the following experiment.

> NOTICE:
*The meta dictionary gets updated in Step 3: SDG, where metadata about each synthetic data that is generated on the respective real data. Data is appended to 'sd_meta_list' key.
This is then saved over the current settings.*

In [8]:
# metadata for the SDG
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

In [9]:
# Display metadata & validate
display(metadata)

display(metadata.validate())


{
    "columns": {
        "label": {
            "sdtype": "categorical"
        },
        "1x1": {
            "sdtype": "categorical"
        },
        "1x2": {
            "sdtype": "categorical"
        },
        "1x3": {
            "sdtype": "categorical"
        },
        "1x4": {
            "sdtype": "categorical"
        },
        "1x5": {
            "sdtype": "categorical"
        },
        "1x6": {
            "sdtype": "categorical"
        },
        "1x7": {
            "sdtype": "categorical"
        },
        "1x8": {
            "sdtype": "categorical"
        },
        "1x9": {
            "sdtype": "categorical"
        },
        "1x10": {
            "sdtype": "categorical"
        },
        "1x11": {
            "sdtype": "categorical"
        },
        "1x12": {
            "sdtype": "categorical"
        },
        "1x13": {
            "sdtype": "categorical"
        },
        "1x14": {
            "sdtype": "categorical"
        },
        "1x15"

None

In [10]:
for col in cols_dtype:
    metadata.update_column(
    column_name=col,
    sdtype='numerical',
    computer_representation= 'UInt8')

metadata.update_column(
column_name='label',
sdtype='categorical')    
    
metadata.validate()
display(metadata)

{
    "columns": {
        "label": {
            "sdtype": "categorical"
        },
        "1x1": {
            "sdtype": "numerical",
            "computer_representation": "UInt8"
        },
        "1x2": {
            "sdtype": "numerical",
            "computer_representation": "UInt8"
        },
        "1x3": {
            "sdtype": "numerical",
            "computer_representation": "UInt8"
        },
        "1x4": {
            "sdtype": "numerical",
            "computer_representation": "UInt8"
        },
        "1x5": {
            "sdtype": "numerical",
            "computer_representation": "UInt8"
        },
        "1x6": {
            "sdtype": "numerical",
            "computer_representation": "UInt8"
        },
        "1x7": {
            "sdtype": "numerical",
            "computer_representation": "UInt8"
        },
        "1x8": {
            "sdtype": "numerical",
            "computer_representation": "UInt8"
        },
        "1x9": {
            "sdtyp

In [11]:
columns[1:]

['1x1',
 '1x2',
 '1x3',
 '1x4',
 '1x5',
 '1x6',
 '1x7',
 '1x8',
 '1x9',
 '1x10',
 '1x11',
 '1x12',
 '1x13',
 '1x14',
 '1x15',
 '1x16',
 '1x17',
 '1x18',
 '1x19',
 '1x20',
 '1x21',
 '1x22',
 '1x23',
 '1x24',
 '1x25',
 '1x26',
 '1x27',
 '1x28',
 '2x1',
 '2x2',
 '2x3',
 '2x4',
 '2x5',
 '2x6',
 '2x7',
 '2x8',
 '2x9',
 '2x10',
 '2x11',
 '2x12',
 '2x13',
 '2x14',
 '2x15',
 '2x16',
 '2x17',
 '2x18',
 '2x19',
 '2x20',
 '2x21',
 '2x22',
 '2x23',
 '2x24',
 '2x25',
 '2x26',
 '2x27',
 '2x28',
 '3x1',
 '3x2',
 '3x3',
 '3x4',
 '3x5',
 '3x6',
 '3x7',
 '3x8',
 '3x9',
 '3x10',
 '3x11',
 '3x12',
 '3x13',
 '3x14',
 '3x15',
 '3x16',
 '3x17',
 '3x18',
 '3x19',
 '3x20',
 '3x21',
 '3x22',
 '3x23',
 '3x24',
 '3x25',
 '3x26',
 '3x27',
 '3x28',
 '4x1',
 '4x2',
 '4x3',
 '4x4',
 '4x5',
 '4x6',
 '4x7',
 '4x8',
 '4x9',
 '4x10',
 '4x11',
 '4x12',
 '4x13',
 '4x14',
 '4x15',
 '4x16',
 '4x17',
 '4x18',
 '4x19',
 '4x20',
 '4x21',
 '4x22',
 '4x23',
 '4x24',
 '4x25',
 '4x26',
 '4x27',
 '4x28',
 '5x1',
 '5x2',
 '5x3',
 '5x

In [12]:
########## Define dataset id and save metadata
meta_filepath = f"{folders['meta_dir']}{data_id}"

try:
    metadata.save_to_json(meta_filepath)

except:
    print(f"File {meta_filepath} already exits and has been replaced.")
    os.remove(meta_filepath)
    metadata.save_to_json(meta_filepath)

File ../data/metadata/D3 already exits and has been replaced.


In [13]:
# Define dataset meta data for the setup parameters in pycaret
# use this to avoid needing to save the whole dataset in a pickle object

# use the parameters to read the data from csv into the setup, e.g.
meta = {
    # Generall
    'name':     data_name,
    'id':       data_id,
    'filename': f"{data_id}-{data_filename}",
    
    'cols_dtype': cols_dtype,  # datatypes for reading from csv to pd.dataframe to save memory
    
    # Pycaret
    'target': 'label',
    
    'categorical_features': None,
    'ordinal_features': None,
    
    'numeric_features': columns[1:],
    'text_features': None,

    'meta_filepath': meta_filepath,
    
}

> Note on Iterative imputation that exists in pycaret:
*Iterative imputation is a imputation method that for each feature, sets up a model to predict the missing values with the rest of the features as predictors, then repeatedly does this for each feature with missing values.*

### Define setup parameters for pycaret
Use these settings to instruct for pycaret how to preprocess the data, handle the model training and evaluation. Basically the ML pipeline.

In [14]:
# Define the setup parameters for pycaret setup function, where the details of preprocessing is defined
# Note: can only contain keywords that exists in the settings of the pycaret.setup()

setup_param = {
    'target': meta['target'],

    ### Sampling settings ###
    'train_size': 0.8,  # (float) default=0.7, the train test split
    # used for training and validation
    'fold_strategy': 'stratifiedkfold',  # (srt), default = 'stratifiedkfold',
    'data_split_stratify': True,
    # selects cross-validation method
    'fold': config['clf']['cv_folds'],  # (int) default=10, the number of folds

    ### Data-preparation settings ###

    #### Define features (use meta) ####
    'ordinal_features': meta['ordinal_features'],
    'numeric_features': meta['numeric_features'],
    'text_features': meta['text_features'],
    'categorical_features': meta['categorical_features'],

    #### Imputation methods #### 
    #Note: imputation will be performed in step 1, instead of in pycaret
    'imputation_type': None,  # ('simple', 'iterative', None) default='simple'
    'numeric_imputation': 'mean',  # (int, float or str) default='mean',
                        # it's ignored if imputation_type='iterative'
                        # alternatives:
                        #   'drop'      : drops rows with missing values
                        #   'mean'      : replace with mean of column
                        #   'median'    : replace with median of column
                        #   'mode'      : replace with mode of column
                        #   'knn'       : replace with KNN approach
                        #   int or float: replace with provided value
    'categorical_imputation': 'mode',  # same as numeric, but only with 'drop', 'mode' and str
                                       # (replace with str)

    # iterative imputation is automatically ignored if imputation_type='simple' or None
    'iterative_imputation_iters': 10,  # (int), default=5, number of iterations
    'numeric_iterative_imputer': 'lightgbm',  # (str or sklearn estimator), default='lightgbm',
                                             # the regression algorithm for numeric imputation
    'categorical_iterative_imputer': 'lightgbm',  # (str or sklearn estimator), default='lightgbm'

    
    #### Text encoding ####
    'text_features_method': 'tf-idf',  # (str), default='tf-idf', alternative 'bow'
    'max_encoding_ohe': 25,  # (int), default=25, cat. columns with less than specified value
                                # will be encoded with OneHotEncoding.
    'encoding_method': None,  # (category-encoders estimator), default=None, 
                              # for cat. cols with more unique values than 'max_encoding_ohe',
                              # if none, then default = leave_one_out.LeaveOneOutEncoder

    
    #### Feature engineering ####
    'low_variance_threshold': None,  # (float or none), default=None, 
                                     # variance threshold for features, features
                                     # with lower variance are discarded -- if none, keep all features.
    'remove_multicollinearity': False, # (bool), default=False, use correlation as threshold for feature selection
    'multicollinearity_threshold': 0.00,  # (float), default=0.9, use if setting above is true
    
    'bin_numeric_features': None, # (string[]), default=None, convert numeric features into categorical.
    'remove_outliers': False,  # (bool), default=False, remove outliers using an isolation forest.
    'outliers_method': 'iforest',  # (string), default='iforest', alternatives:
                                    # 'iforest': sklearn's IsolationForest
                                    # 'ee': sklearn's EllipticEnvelope
                                    # 'lof': sklearn's LocalOutlierFactor
    'outliers_threshold': 0.00,  # (float), default=0.05, the percentage of outliers to be removed,
                                # is ignored when 'remove_outliers'=False.
    'fix_imbalance': False,  # (bool) default=False, use SMOTE to fix imbalance target features,
                                # can specify other method with 'fix_imbalance_method'
    'fix_imbalance_method': 'SMOTE',  # (str), default='SMOTE', estimator to use
    
    'transformation': False,  # (bool) default=False, if true apply power transform
                              # to make the data more Gaussian-like
    'transformation_method': 'yeo-johnson',  # (str), default='yeo-johnson'
    
    'normalize': True,  # (bool) default=False, scale data
    'normalize_method': 'zscore',  # (str) default='zscore', alt: 'minmax'
    
    'pca': False,  # (bool) default=False, use principal component analysis
                   # to reduce dimensionality
    'pca_method': 'linear',  # (str) default='linear', alt: 'kernel', 'incremental'
    'pca_components': None,  # (int,float,str,None) default=None, if:
                             # * None: all components are kept
                             # * int: the absolute number of components
                             # * float: the variance limit for explaination
                             # * "mle": use  Minka's MLE to guess dimension,
                             #          only works with pca_method='linear'
    'feature_selection': False,  # (bool) default=False, select features based on a
                                    # feature importance score defined by following param
    'feature_selection_method': 'classic',  # (str) default='classic', if
                                    # * 'univariate': use sklearn SelectKBest
                                    # * 'classic': use sklearn SelectFromModel
                                    # * 'sequential': use sklearn SequentialFeatureSelector
    'feature_selection_estimator': 'lightbm',  # (str, sklearn estimator) default='lightbm',
                                    # the choice of classifier that decides feature importance,
                                    # where the estimator needs to have 'feature_importances'
                                    # or 'coef_attribute' after the fitting. If none, use
                                    # LGBClassifier
                                    # This param. is ignored when method='univariate'
    'n_features_to_select': 0.2,  # (int,float) default=0.2, The max number of features
                                    # to use with feature_selection, only looks at features
                                    # allowed (i.e. not at 'ignore_features') when counting.

    ###### Backend-settings ######

    ### Logging settings ###
    ### Note: have implmented manual loggning
    'log_experiment': False,  # choose logger, alternatives: default='mlflow', 'wandb'
    'experiment_name': f"{meta['id']}-{meta['name']}",  # The experiment name, set as the id-dataset name
    'system_log': folders['log_dir'] + meta['id'],   # system loggin, for debugging
    
    #'experiment_custom_tags': {'Dataset Type': 'Original', 'Dataset ID': meta['id']},  # will be changed to 'Synthetic' when using synthetic data
    #'log_plots': False,  # (bool) default=False, if true analysis plots are saved as image files
    #'log_data': True,  # (bool) default=Flase, log the train & test datasets as a csv file

    #### Hardware settings ####
    'n_jobs': -1, # number of jobs to run in parallel (-1 means use all available processors)
    'use_gpu': True, # (bool or str) default=False, whether the GPU should be used for training

    ### Output settings ###
    'html': True,  # (bool) default=True, prevents runtime display of the monitor,
                    # disable when the env doesn't support IPYTHON
                    # Todo: for real experiment, set verbose to false, to disable output of grids
    'verbose': True,  # (bool) default=True, print information grid?
    'profile': False,  # (bool) default=False, if true it displays an interactive EDA report
    'preprocess': True,  # (bool) default=True, use preprocessing methods within pycaret?

    # (something wrong with this argument, deprecated?)'silent': False, #(bool) default=False, need to be True when executed in a automated setting
    # might not need following, because I will drop the features not neede in preperation of data
    # ignore_features = None # (string[]) default=None, list of columns to be ignored in preporcessing and training
}

#### Define settings for the Synthetic Data Generator
Extracts the column names, and renames fields to field_types (because of implementation issue).

In [15]:
# NOTICE: is deprecated, as of SDV 1.0.0
#field_names = data.columns.to_list()
# Define the dataset specific parameters for the sdg CTGAN()
# Note: can only contain keywords that are accepted by CTGAN() function in sdv
sdg_param = {
    # Metadata on the dataset
    #"field_names": field_names,
    #"primary_key": "Outcome",
    
    # same data as meta_data, however, 
    #the SDG model method uses a different parameter name
    #"columns": meta['meta_data']['fields'],  
    }

### Save for next steps
In the cell below, the dataset meta-data and the settings for preprocessing and model creation is saved as a pickle object in its respective directory. 

In [16]:
# combine then save the objects to '../pickles/settings' directory 
import pickle

data_settings = {
    "meta": meta,
    "setup_param": setup_param,
    "sdg_param": sdg_param,
}

pickle.dump(
    data_settings, 
    open(f"{folders['settings_dir']}{meta['id']}-settings.pkl", 'wb') 
)

data.to_csv(f"{folders['real_dir']}{meta['filename']}", index=False)