# Step 1: Data preparation, D2-bank

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from ydata_profiling import ProfileReport

import os
import sys
import yaml

# Display all available datasets: every file below the raw-data directory,
# one joined path per output line
raw_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('../data/raw/')
    for name in names
)
for raw_file_path in raw_file_paths:
    display(raw_file_path)

'../data/raw/.gitkeep'

'../data/raw/bank-full.csv'

'../data/raw/bank-names.txt'

'../data/raw/bank.csv'

'../data/raw/bank.zip'

'../data/raw/dataset_us_diabetes.zip'

'../data/raw/diabetes.csv'

'../data/raw/health-insurance.zip'

'../data/raw/mnist.zip'

'../data/raw/titanic.csv'

'../data/raw/mnist\\mnist_test.csv'

'../data/raw/mnist\\mnist_train.csv'

In [2]:
# Make the helper modules in ../src importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
if '../src' not in sys.path:
    sys.path.append('../src')
from utils import getExperimentConfig

# Get global experiment settings
config = getExperimentConfig()
# The 'folders' entry holds the directory paths used for reading and writing files below
folders = config['folders']

In this section the data is examined in order to select the preprocessing and the model for the original dataset. The resulting preprocessing pipeline is then saved so that it can be executed on the respective synthetic dataset.


This section is done independently for each dataset that is explored, with the hope that the remaining steps of the experiment can be automated.

In [3]:
# Identify the dataset: source file name, experiment id and short name
data_filename = "bank-full.csv"
data_id = "D2"
data_name = "bank"

# The raw file uses ';' as field separator
data = pd.read_csv(f"{folders['raw_dir']}{data_filename}", sep=';')
pd.set_option('display.max_columns', None)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appends a stray "None" to the output.
data.info(verbose=True, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 29.2 MB


None

In [4]:
# Compact per-column dtypes handed to read_csv to reduce memory usage
cols_dtype = dict(
    age='uint8',           # numeric, must not exceed 255
    job='category',        # categorical
    marital='category',    # categorical
    education='category',  # categorical
    default='category',    # binary, stored as categorical
    balance='int64',       # numeric, signed (negative balances occur)
    housing='category',    # binary, stored as categorical
    loan='category',       # binary, stored as categorical
    contact='category',    # categorical
    day='uint8',           # numeric, day of month (at most 31)
    month='category',      # categorical
    duration='int64',      # numeric
    campaign='uint16',     # numeric, non-negative
    pdays='int16',         # numeric, signed because -1 occurs
    previous='uint16',     # numeric, non-negative
    poutcome='category',   # categorical
    y='category',          # binary target, stored as categorical
)

# Re-read the raw file with the explicit dtypes and inspect the result
raw_csv_path = f"{folders['raw_dir']}{data_filename}"
data = pd.read_csv(raw_csv_path, sep=';', dtype=cols_dtype)
data.info(verbose=True, memory_usage='deep')
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   age        45211 non-null  uint8   
 1   job        45211 non-null  category
 2   marital    45211 non-null  category
 3   education  45211 non-null  category
 4   default    45211 non-null  category
 5   balance    45211 non-null  int64   
 6   housing    45211 non-null  category
 7   loan       45211 non-null  category
 8   contact    45211 non-null  category
 9   day        45211 non-null  uint8   
 10  month      45211 non-null  category
 11  duration   45211 non-null  int64   
 12  campaign   45211 non-null  uint16  
 13  pdays      45211 non-null  int16   
 14  previous   45211 non-null  uint16  
 15  poutcome   45211 non-null  category
 16  y          45211 non-null  category
dtypes: category(10), int16(1), int64(2), uint16(2), uint8(2)
memory usage: 1.5 MB


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


Saved more than 20 MB by defining `cols_dtype`, while still retaining all information.

In [5]:
# Summary statistics with one row per described column, rounded to two decimals
numeric_summary = data.describe().transpose()
numeric_summary.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,45211.0,40.94,10.62,18.0,33.0,39.0,48.0,95.0
balance,45211.0,1362.27,3044.77,-8019.0,72.0,448.0,1428.0,102127.0
day,45211.0,15.81,8.32,1.0,8.0,16.0,21.0,31.0
duration,45211.0,258.16,257.53,0.0,103.0,180.0,319.0,4918.0
campaign,45211.0,2.76,3.1,1.0,1.0,2.0,3.0,63.0
pdays,45211.0,40.2,100.13,-1.0,-1.0,-1.0,-1.0,871.0
previous,45211.0,0.58,2.3,0.0,0.0,0.0,0.0,275.0


In [6]:
#from pycaret.classification import *

# run eda
#s = setup(data, target='y', verbose=True)
#eda()

In [None]:
# Build the profiling report; as the cell's last expression it is rendered inline
profile_report = ProfileReport(data, minimal=True, explorative=True)
profile_report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Number of missing values per column
print("=== Null values: ===\n")
display(data.isnull().sum())

# Column dtypes and memory footprint. info() prints its own report and returns
# None, so it is not wrapped in display() (that would only append "None").
print("\n=== Data types: === \n")
data.info(memory_usage='deep')

In [None]:
# Replace 'yes' with True and 'no' with False.
# The columns were loaded as categoricals, so their categories are renamed:
# Series.replace on categorical data is deprecated in recent pandas releases,
# and rename_categories is the documented way to change category labels.
# Labels missing from the mapping are passed through unchanged, which keeps
# this cell safe to re-run.
yes_no_to_bool = {'yes': True, 'no': False}
for binary_col in ('default', 'housing', 'loan'):
    data[binary_col] = data[binary_col].cat.rename_categories(yes_no_to_bool)

# The target label is encoded as 1/0 instead of True/False
data['y'] = data['y'].cat.rename_categories({'yes': 1, 'no': 0})

# Update the dtypes that are used when the saved CSV is read back
cols_dtype.update({
    'default': 'boolean',
    'housing': 'boolean',
    'loan': 'boolean',
    'y': 'category',
})

display(data.head())
# info() prints its own report and returns None, so it is not wrapped in display()
data.info(verbose=True, memory_usage='deep')

### Define metadata for the dataset
The following cells in this section define the dataset-specific settings that are needed to run the rest of the experiment.

> NOTICE:
*The meta dictionary is updated in Step 3: SDG, where metadata about each synthetic dataset generated from the respective real data is appended to the 'sd_meta_list' key.
The result is then saved over the current settings.*

In [None]:
# Metadata for the synthetic data generator (SDG)
from sdv.metadata import SingleTableMetadata

# Create the metadata object and let SDV detect it from the DataFrame;
# every column is set explicitly with update_column in a later cell.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

In [None]:
# Display the auto-detected metadata, then validate it
display(metadata)

# NOTE(review): validate() presumably raises on invalid metadata and returns
# None otherwise, in which case display() just shows "None" — confirm.
display(metadata.validate())


In [None]:
# Explicit SDV column settings, applied in the order listed.
# Each entry is (column name, keyword arguments passed to update_column).
# NOTE(review): 'campaign' is declared as Int32 here while cols_dtype reads it
# as uint16 — confirm whether the two are meant to differ.
column_updates = [
    ('age',       {'sdtype': 'numerical', 'computer_representation': 'UInt8'}),
    ('job',       {'sdtype': 'categorical'}),
    ('marital',   {'sdtype': 'categorical'}),
    ('education', {'sdtype': 'categorical'}),
    ('default',   {'sdtype': 'boolean'}),
    ('balance',   {'sdtype': 'numerical', 'computer_representation': 'Int64'}),
    ('housing',   {'sdtype': 'boolean'}),
    ('loan',      {'sdtype': 'boolean'}),
    ('contact',   {'sdtype': 'categorical'}),
    ('day',       {'sdtype': 'numerical', 'computer_representation': 'UInt8'}),
    ('month',     {'sdtype': 'categorical'}),
    ('duration',  {'sdtype': 'numerical', 'computer_representation': 'Int64'}),
    ('campaign',  {'sdtype': 'numerical', 'computer_representation': 'Int32'}),
    ('pdays',     {'sdtype': 'numerical', 'computer_representation': 'Int16'}),
    ('previous',  {'sdtype': 'numerical', 'computer_representation': 'UInt16'}),
    ('poutcome',  {'sdtype': 'categorical'}),
    ('y',         {'sdtype': 'categorical'}),
]

for column_name, column_settings in column_updates:
    metadata.update_column(column_name=column_name, **column_settings)

# Re-validate after the manual updates
display(metadata.validate())

In [None]:
# Show the final metadata followed by a preview of the data
display(metadata, data.head())

In [None]:
########## Define dataset id and save metadata
meta_filepath = f"{folders['meta_dir']}{data_id}"

# Remove a previously saved metadata file explicitly before saving, instead of
# catching every exception from save_to_json: a bare `except` would also treat
# unrelated failures (missing directory, permissions, ...) as "file exists"
# and hide the real error.
replaced_existing = os.path.exists(meta_filepath)
if replaced_existing:
    os.remove(meta_filepath)

metadata.save_to_json(meta_filepath)

# Report the replacement only after the new file has actually been written
if replaced_existing:
    print(f"File {meta_filepath} already exits and has been replaced.")

In [None]:
# Dataset description that is stored alongside the pycaret setup parameters.
# Keeping these parameters (instead of the data itself) avoids pickling the
# whole dataset: later steps re-read the CSV using the values stored here.

# Unordered categorical columns
nominal_columns = ['job', 'marital', 'default', 'housing', 'loan', 'contact', 'poutcome']

# Ordered categorical columns with their levels listed from lowest to highest
ordered_levels = {
    'education': ["unknown", "primary", "secondary", "tertiary"],
    'month': ["jan", "feb", "mar", "apr", "may", "jun",
              "jul", "aug", "sep", "oct", "nov", "dec"],
}

# Numeric columns
numeric_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

meta = {
    # General
    'name': data_name,
    'id': data_id,
    'filename': f"{data_id}-{data_filename}",

    'cols_dtype': cols_dtype,

    # Pycaret
    'target': 'y',
    'categorical_features': nominal_columns,
    'ordinal_features': ordered_levels,
    'numeric_features': numeric_columns,
    'text_features': None,

    'meta_filepath': meta_filepath,
}


> Note on the iterative imputation that exists in pycaret:
*Iterative imputation is an imputation method that, for each feature with missing values, fits a model that predicts the missing values using the remaining features as predictors, and repeats this for every feature with missing values.*

### Define setup parameters for pycaret
Use these settings to tell pycaret how to preprocess the data and how to handle model training and evaluation — in short, the ML pipeline.

In [None]:
# Define the parameters for the pycaret setup() function, where the details of
# the preprocessing are defined.
# Note: may only contain keywords that exist in the signature of pycaret's setup()

setup_param = {
    'target': meta['target'],

    ### Sampling settings ###
    'train_size': 0.8,  # (float) default=0.7, the train test split
    # used for training and validation
    'fold_strategy': 'stratifiedkfold',  # (str), default = 'stratifiedkfold',
    # selects cross-validation method
    'fold': config['clf']['cv_folds'],  # (int) default=10, the number of folds

    ### Data-preparation settings ###

    #### Define features (use meta) ####
    'ordinal_features': meta['ordinal_features'],
    'numeric_features': meta['numeric_features'],
    'text_features': meta['text_features'],
    'categorical_features': meta['categorical_features'],

    #### Imputation methods ####
    # Note: imputation will be performed in step 1, instead of in pycaret
    'imputation_type': None,  # ('simple', 'iterative', None) default='simple'
    'numeric_imputation': 'mean',  # (int, float or str) default='mean',
                        # it's ignored if imputation_type='iterative'
                        # alternatives:
                        #   'drop'      : drops rows with missing values
                        #   'mean'      : replace with mean of column
                        #   'median'    : replace with median of column
                        #   'mode'      : replace with mode of column
                        #   'knn'       : replace with KNN approach
                        #   int or float: replace with provided value
    'categorical_imputation': 'mode',  # same as numeric, but only with 'drop', 'mode' and str
                                       # (replace with str)

    # iterative imputation is automatically ignored if imputation_type='simple' or None
    'iterative_imputation_iters': 10,  # (int), default=5, number of iterations
    'numeric_iterative_imputer': 'lightgbm',  # (str or sklearn estimator), default='lightgbm',
                                             # the regression algorithm for numeric imputation
    'categorical_iterative_imputer': 'lightgbm',  # (str or sklearn estimator), default='lightgbm'

    #### Text encoding ####
    'text_features_method': 'tf-idf',  # (str), default='tf-idf', alternative 'bow'
    'max_encoding_ohe': 20,  # (int), default=25, cat. columns with less than specified value
                                # will be encoded with OneHotEncoding.
    'encoding_method': None,  # (category-encoders estimator), default=None,
                              # for cat. cols with more unique values than 'max_encoding_ohe',
                              # if none, then default = leave_one_out.LeaveOneOutEncoder

    #### Feature engineering ####
    'low_variance_threshold': None,  # (float or none), default=None,
                                     # variance threshold for features, features
                                     # with lower variance are discarded -- if none, keep all features.
    'remove_multicollinearity': False,  # (bool), default=False, use correlation as threshold for feature selection
    'multicollinearity_threshold': 0.01,  # (float), default=0.9, use if setting above is true

    'bin_numeric_features': None,  # (string[]), default=None, convert numeric features into categorical.
    'remove_outliers': False,  # (bool), default=False, remove outliers using an isolation forest.
    'outliers_method': 'iforest',  # (string), default='iforest', alternatives:
                                    # 'iforest': sklearn's IsolationForest
                                    # 'ee': sklearn's EllipticEnvelope
                                    # 'lof': sklearn's LocalOutlierFactor
    'outliers_threshold': 0.05,  # (float), default=0.05, the percentage of outliers to be removed,
                                # is ignored when 'remove_outliers'=False.
    'fix_imbalance': False,  # (bool) default=False, use SMOTE to fix imbalance target features,
                                # can specify other method with 'fix_imbalance_method'
    'fix_imbalance_method': 'SMOTE',  # (str), default='SMOTE', estimator to use

    'transformation': False,  # (bool) default=False, if true apply power transform
                              # to make the data more Gaussian-like
    'transformation_method': 'yeo-johnson',  # (str), default='yeo-johnson'

    'normalize': True,  # (bool) default=False, scale data
    'normalize_method': 'zscore',  # (str) default='zscore', alt: 'minmax'

    'pca': False,  # (bool) default=False, use principal component analysis
                   # to reduce dimensionality
    'pca_method': 'linear',  # (str) default='linear', alt: 'kernel', 'incremental'
    'pca_components': None,  # (int,float,str,None) default=None, if:
                             # * None: all components are kept
                             # * int: the absolute number of components
                             # * float: the variance limit for explanation
                             # * "mle": use Minka's MLE to guess dimension,
                             #          only works with pca_method='linear'
    'feature_selection': False,  # (bool) default=False, select features based on a
                                    # feature importance score defined by following param
    'feature_selection_method': 'classic',  # (str) default='classic', if
                                    # * 'univariate': use sklearn SelectKBest
                                    # * 'classic': use sklearn SelectFromModel
                                    # * 'sequential': use sklearn SequentialFeatureSelector
    # Fixed: the estimator name was misspelled 'lightbm'; it now matches the
    # 'lightgbm' spelling used for the iterative imputers above, so turning on
    # 'feature_selection' does not hand pycaret an unknown estimator name.
    'feature_selection_estimator': 'lightgbm',  # (str, sklearn estimator) default='lightgbm',
                                    # the choice of classifier that decides feature importance,
                                    # where the estimator needs to have 'feature_importances_'
                                    # or 'coef_' attribute after the fitting. If none, use
                                    # LGBClassifier
                                    # This param. is ignored when method='univariate'
    'n_features_to_select': 0.2,  # (int,float) default=0.2, The max number of features
                                    # to use with feature_selection, only looks at features
                                    # allowed (i.e. not at 'ignore_features') when counting.

    ###### Backend-settings ######

    ### Logging settings ###
    ### Note: manual logging has been implemented
    'log_experiment': False,  # choose logger, alternatives: default='mlflow', 'wandb'
    'experiment_name': f"{meta['id']}-{meta['name']}",  # The experiment name, set as the id-dataset name
    'system_log': folders['log_dir'] + meta['id'],   # system logging, for debugging

    #'experiment_custom_tags': {'Dataset Type': 'Original', 'Dataset ID': meta['id']},  # will be changed to 'Synthetic' when using synthetic data
    #'log_plots': False,  # (bool) default=False, if true analysis plots are saved as image files
    #'log_data': True,  # (bool) default=False, log the train & test datasets as a csv file

    #### Hardware settings ####
    'n_jobs': -1,  # number of jobs to run in parallel (-1 means use all available processors)
    'use_gpu': True,  # (bool or str) default=False, whether the GPU should be used for training

    ### Output settings ###
    'html': True,  # (bool) default=True, prevents runtime display of the monitor,
                    # disable when the env doesn't support IPYTHON
                    # Todo: for real experiment, set verbose to false, to disable output of grids
    'verbose': True,  # (bool) default=True, print information grid?
    'profile': False,  # (bool) default=False, if true it displays an interactive EDA report
    'preprocess': True,  # (bool) default=True, use preprocessing methods within pycaret?

    # (something wrong with this argument, deprecated?)'silent': False, #(bool) default=False, need to be True when executed in an automated setting
    # might not need the following, because the features that are not needed are dropped during data preparation
    # ignore_features = None # (string[]) default=None, list of columns to be ignored in preprocessing and training
}

#### Define settings for the Synthetic Data Generator
Extracts the column names, and renames fields to field_types (because of implementation issue).

In [None]:
# NOTICE: the commented-out field_names approach is deprecated as of SDV 1.0.0
#field_names = data.columns.to_list()
# Dataset-specific parameters for the SDG CTGAN().
# Note: may only contain keywords that are accepted by the CTGAN() function in sdv.
# The dictionary is currently empty; the commented-out entries are kept for reference.
sdg_param = {
    # Metadata on the dataset
    #"field_names": field_names,
    #"primary_key": "Outcome",
    
    # Same data as meta_data; however,
    # the SDG model method uses a different parameter name
    #"columns": meta['meta_data']['fields'],  
    }

### Save for next steps
In the cell below, the dataset metadata and the settings for preprocessing and model creation are saved as a pickle object in their respective directory.

In [None]:
# Combine the objects, then save them to the '../pickles/settings' directory
import pickle

data_settings = {
    "meta": meta,
    "setup_param": setup_param,
    "sdg_param": sdg_param,
}

# Use a context manager so the file handle is flushed and closed
# deterministically; passing a bare open() to pickle.dump never closes it.
settings_path = f"{folders['settings_dir']}{meta['id']}-settings.pkl"
with open(settings_path, 'wb') as settings_file:
    pickle.dump(data_settings, settings_file)

# Store the prepared real data under the file name recorded in meta
data.to_csv(f"{folders['real_dir']}{meta['filename']}", index=False)

In [None]:
# Check that the data was saved correctly: read the stored CSV back using the
# dtypes recorded in `meta`, i.e. exactly what later steps have available.
# (The original assigned the looked-up dtypes to misspelled, unused variables
# and then read the file with the notebook-level `cols_dtype` instead.)
saved_dtypes = meta.get('cols_dtype')  # None -> pandas infers the dtypes
d = pd.read_csv(f"{folders['real_dir']}{meta['filename']}", dtype=saved_dtypes)

d.info(verbose=True, memory_usage='deep')
d['y']
