In [1]:
import pandas as pd
import numpy as np
import src.utils as utils

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load Config File

In [2]:
# Load the project configuration through the project's utils helper; the bare
# name on the last line displays the loaded settings as the cell output.
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/data.csv',
 'data_set_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_columns_path': 'data/output/input_columns.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'index_column': 'Unnamed: 0',
 'output_column': 'SeriousDlqin2yrs',
 'seed': 42,
 'test_size': 0.2,
 'clean_late_col': 'NumberOfTimes90DaysLate',
 'clean_late_val': 96,
 'clean_unsecure_col': 'RevolvingUtilizationOfUnsecuredLines',
 'constant_imputer_col': 'NumberOfDependents',
 'constant_imputer_path': 'data/preprocess/constant_imputer.pkl',
 'constant_imputer_val': 0.0,
 'median_imputer_col': 'MonthlyIncome',
 'median_imputer_path': 'data/preprocess/median_imputer.pkl',
 'standardizer_path': 'data/preprocess/standardizer.pkl',
 'preprocessor_path': 'data/preprocess/preprocessor.pkl',
 'train_clean_path': ['data/p

# Plan Preprocessing

**Summary EDA**:
- Rows where `NumberOfTimes90DaysLate == 96` or `NumberOfTimes90DaysLate == 98` are deleted
- Rows where `RevolvingUtilizationOfUnsecuredLines > 1.35` are deleted
- Missing value imputation 
  - `MonthlyIncome` : Median
  - `NumberOfDependents` : 0.0

# Load Dataset

In [3]:
def load_dataset(return_file=True):
    """Load the pickled train/test splits and print their shapes.

    The file locations come from ``CONFIG_DATA['train_set_path']`` and
    ``CONFIG_DATA['test_set_path']``; in each pair, entry 0 is loaded as the
    feature set and entry 1 as the target.

    Parameters
    ----------
    return_file : bool, default True
        When True the loaded objects are returned; otherwise the function
        only prints the shapes.

    Returns
    -------
    tuple or None
        ``(X_train, X_test, y_train, y_test)`` when ``return_file`` is True,
        otherwise None.
    """
    # Train split
    train_paths = CONFIG_DATA['train_set_path']
    X_train = utils.pickle_load(train_paths[0])
    y_train = utils.pickle_load(train_paths[1])

    # Test split
    test_paths = CONFIG_DATA['test_set_path']
    X_test = utils.pickle_load(test_paths[0])
    y_test = utils.pickle_load(test_paths[1])

    # Report every shape so the split sizes are visible in the cell output
    report = (
        ("X_train shape :", X_train),
        ("y_train shape :", y_train),
        ("X_test shape  :", X_test),
        ("y_test shape  :", y_test),
    )
    for label, loaded in report:
        print(label, loaded.shape)

    return (X_train, X_test, y_train, y_test) if return_file else None

In [4]:
# Load the train/test splits; load_dataset also prints the shape of each one.
X_train, X_test, y_train, y_test = load_dataset()

X_train shape : (120000, 10)
y_train shape : (120000,)
X_test shape  : (30000, 10)
y_test shape  : (30000,)


# Preprocess Train

1. Rows where `NumberOfTimes90DaysLate == 96` or `NumberOfTimes90DaysLate == 98` are deleted

In [5]:
def clean_late_data(X, y):
    """Drop the rows whose late-payment count reaches the configured cutoff.

    The column is ``CONFIG_DATA['clean_late_col']`` and the cutoff is
    ``CONFIG_DATA['clean_late_val']``; every row with a value greater than or
    equal to the cutoff is removed from both ``X`` and ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing the configured column.
    y : pandas.Series
        Target; rows are removed using the index labels taken from ``X``.

    Returns
    -------
    tuple
        ``(X_drop, y_drop)`` with the flagged rows removed. The inputs are
        not modified.
    """
    # Flag the rows at or above the cutoff and collect their index labels
    is_late_code = X[CONFIG_DATA['clean_late_col']] >= CONFIG_DATA['clean_late_val']
    labels_to_remove = X.loc[is_late_code].index.tolist()

    # Remove the same labels from features and target to keep them aligned
    X_kept = X.drop(index=labels_to_remove)
    y_kept = y.drop(index=labels_to_remove)

    print("X shape :", X_kept.shape)
    print("y shape :", y_kept.shape)

    return X_kept, y_kept

In [6]:
# Apply the late-payment filter to the training split; X and y are filtered together.
X_train_drop_1, y_train_drop_1 = clean_late_data(X_train, y_train)

X shape : (119785, 10)
y shape : (119785,)


2. Rows where `RevolvingUtilizationOfUnsecuredLines > 1.35` are deleted

In [7]:
def clean_unsecured_data(X, y):
    """Remove IQR outliers of the configured utilization column from X and y.

    The column is ``CONFIG_DATA['clean_unsecure_col']``. Rows are dropped when
    their value lies above ``q3 + 1.5 * iqr`` or below ``q1 - 1.5 * iqr``,
    where ``q1``/``q3`` are the 25% and 75% quantiles of the column in ``X``.

    Missing values are ignored when the quantiles are computed. Rows whose
    value is missing are kept, because a NaN compares False against both
    bounds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing the configured column.
    y : pandas.Series
        Target; rows are removed using the index labels taken from ``X``.

    Returns
    -------
    tuple
        ``(X_drop, y_drop)`` with the outlier rows removed. The inputs are
        not modified.
    """
    values = X[CONFIG_DATA['clean_unsecure_col']]

    # Find upper boundary & lower boundary.
    # np.nanquantile is used instead of np.quantile: with np.quantile a single
    # NaN in the column makes both quartiles NaN, every comparison below
    # evaluates to False, and the filter silently removes nothing.
    q1, q3 = np.nanquantile(values, q=[0.25, 0.75])
    iqr = q3 - q1
    ub = q3 + 1.5 * iqr
    lb = q1 - 1.5 * iqr

    # Filter data: anything outside [lb, ub] is an outlier
    is_outlier = (values > ub) | (values < lb)
    index_to_drop = X[is_outlier].index.tolist()

    # Drop the same labels from features and target to keep them aligned
    X_drop = X.drop(index=index_to_drop)
    y_drop = y.drop(index=index_to_drop)

    # Print
    print("X shape :", X_drop.shape)
    print("y shape :", y_drop.shape)

    return X_drop, y_drop


In [8]:
# Apply the IQR outlier filter on top of the late-payment filter (training split only).
X_train_drop_2, y_train_drop_2 = clean_unsecured_data(X_train_drop_1, y_train_drop_1)

X shape : (119173, 10)
y shape : (119173,)


3. Missing value imputation 
  - `MonthlyIncome` : Median
  - `NumberOfDependents` : 0.0

In [9]:
def fit_imputer(data, return_file=True):
    """Fit and persist the constant and median imputers.

    The constant imputer fills ``CONFIG_DATA['constant_imputer_col']`` with
    ``CONFIG_DATA['constant_imputer_val']``; the median imputer is fitted on
    ``CONFIG_DATA['median_imputer_col']``. Both fitted objects are dumped with
    ``utils.pickle_dump`` to their configured paths.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both configured columns.
    return_file : bool, default True
        When True the fitted imputers are returned.

    Returns
    -------
    tuple or None
        ``(constant_imputer, median_imputer)`` when ``return_file`` is True,
        otherwise None.
    """
    # Build each imputer and fit it on its own single-column frame
    constant_imputer = SimpleImputer(
        missing_values=np.nan,
        strategy="constant",
        fill_value=CONFIG_DATA['constant_imputer_val'],
    ).fit(data[[CONFIG_DATA['constant_imputer_col']]])

    median_imputer = SimpleImputer(
        missing_values=np.nan,
        strategy="median",
    ).fit(data[[CONFIG_DATA['median_imputer_col']]])

    # Persist the fitted imputers
    utils.pickle_dump(constant_imputer, CONFIG_DATA['constant_imputer_path'])
    utils.pickle_dump(median_imputer, CONFIG_DATA['median_imputer_path'])

    return (constant_imputer, median_imputer) if return_file else None
    

In [10]:
# Fit both imputers on the filtered training features only
constant_imputer, median_imputer = fit_imputer(data = X_train_drop_2)

In [11]:
def transform_imputer(data, constant_imputer, median_imputer):
    """Apply the fitted imputers to their configured columns.

    Works on a copy of ``data``, so the caller's frame is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``CONFIG_DATA['constant_imputer_col']`` and
        ``CONFIG_DATA['median_imputer_col']``.
    constant_imputer, median_imputer
        Fitted objects exposing ``transform``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the two columns replaced by the imputed values.
    """
    constant_col = CONFIG_DATA['constant_imputer_col']
    median_col = CONFIG_DATA['median_imputer_col']

    imputed = data.copy()

    # Compute both imputations before writing anything back
    filled_constant = constant_imputer.transform(imputed[[constant_col]])
    filled_median = median_imputer.transform(imputed[[median_col]])

    # Overwrite the original columns with the imputed values
    imputed[constant_col] = filled_constant
    imputed[median_col] = filled_median

    print('data shape :', imputed.shape)

    return imputed

In [12]:
# Impute the filtered training features with the imputers fitted above
X_train_imputed = transform_imputer(data = X_train_drop_2,
                                    constant_imputer = constant_imputer,
                                    median_imputer = median_imputer)

data shape : (119173, 10)


In [13]:
# Sanity check: count the missing values left in each column after imputation
X_train_imputed.isna().sum()

RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

4. Standardize data

In [14]:
def fit_standardize(data, return_file=True):
    """Fit a StandardScaler on ``data`` and persist it.

    The fitted scaler is dumped with ``utils.pickle_dump`` to
    ``CONFIG_DATA['standardizer_path']``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table the scaler is fitted on.
    return_file : bool, default True
        When True the fitted scaler is returned.

    Returns
    -------
    StandardScaler or None
        The fitted scaler when ``return_file`` is True, otherwise None.
    """
    # fit() returns the scaler itself, so creation and fitting can be chained
    standardizer = StandardScaler().fit(data)

    utils.pickle_dump(standardizer, CONFIG_DATA['standardizer_path'])

    return standardizer if return_file else None

In [15]:
# Fit the standardizer on the imputed training features
standardizer = fit_standardize(data=X_train_imputed)

In [16]:
def transform_standardize(data, standardizer):
    """Scale ``data`` with a fitted standardizer, keeping its labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to transform.
    standardizer
        Fitted object exposing ``transform``.

    Returns
    -------
    pandas.DataFrame
        The transformed values, relabelled with the column names and index
        of ``data``.
    """
    scaled_values = standardizer.transform(data)

    # Wrap the result in a frame, then restore the caller's labels
    return (
        pd.DataFrame(scaled_values)
        .set_axis(data.columns, axis=1)
        .set_axis(data.index, axis=0)
    )


In [17]:
# Standardize the imputed training features with the fitted standardizer
X_train_clean = transform_standardize(data = X_train_imputed,
                                      standardizer = standardizer)

In [18]:
# Summary statistics of the standardized training features
X_train_clean.describe()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,119173.0,119173.0,119173.0,119173.0,119173.0,119173.0,119173.0,119173.0,119173.0,119173.0
mean,-3.350801e-17,-1.088116e-16,1.836382e-17,2.7337050000000002e-17,1.529325e-17,1.9496650000000002e-17,7.083188e-17,-7.894058e-17,1.538268e-17,5.747637e-17
std,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004
min,-0.9062235,-2.125699,-0.3512391,-0.1807954,-0.4677485,-1.651978,-0.184746,-0.9033765,-0.1940611,-0.6664416
25%,-0.821027,-0.7693667,-0.3512391,-0.1807048,-0.1829973,-0.6791933,-0.184746,-0.9033765,-0.1940611,-0.6664416
50%,-0.4707942,-0.02338383,-0.3512391,-0.1806066,-0.07508849,-0.09552263,-0.184746,-0.02070164,-0.1940611,-0.6664416
75%,0.6609987,0.7225991,-0.3512391,-0.1803506,0.07114099,0.4881481,-0.184746,0.8619733,-0.1940611,0.2358948
max,2.946689,3.842164,16.86551,167.487,218.3129,9.632322,36.13239,46.76107,34.06331,11.06393


Dump all preprocessors

In [19]:
# Bundle the three fitted transformers under the keys that _preprocess_data
# reads back, then persist the bundle as a single pickle.
preprocessor = {
    'constant_imputer': constant_imputer,
    'median_imputer': median_imputer,
    'standardizer': standardizer
}

utils.pickle_dump(preprocessor, CONFIG_DATA['preprocessor_path'])

# Preprocess all

In [20]:
def clean_data(data, constant_imputer, median_imputer, standardizer):
    """Impute missing values in ``data`` and then standardize the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to clean.
    constant_imputer, median_imputer
        Fitted imputers handed to ``transform_imputer``.
    standardizer
        Fitted scaler handed to ``transform_standardize``.

    Returns
    -------
    pandas.DataFrame
        The imputed and standardized table.
    """
    # Imputation runs first; its output feeds the standardizer
    return transform_standardize(
        transform_imputer(data, constant_imputer, median_imputer),
        standardizer,
    )

In [33]:
def _preprocess_data(data):
    """Clean ``data`` with the preprocessor bundle stored on disk.

    The bundle is loaded from ``CONFIG_DATA['preprocessor_path']`` and must
    provide the keys ``'constant_imputer'``, ``'median_imputer'`` and
    ``'standardizer'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to clean.

    Returns
    -------
    pandas.DataFrame
        The result of ``clean_data`` applied with the loaded transformers.
    """
    fitted = utils.pickle_load(CONFIG_DATA['preprocessor_path'])

    return clean_data(
        data,
        fitted['constant_imputer'],
        fitted['median_imputer'],
        fitted['standardizer'],
    )

# Preprocess all

Generate preprocessor

In [34]:
def generate_preprocessor(return_file=True):
    """Fit every preprocessing object on the training split and persist them.

    Steps: load the training split, drop the rows flagged by
    ``clean_late_data`` and ``clean_unsecured_data``, fit the imputers, fit
    the standardizer on the imputed features, then dump the bundle to
    ``CONFIG_DATA['preprocessor_path']``.

    Parameters
    ----------
    return_file : bool, default True
        When True the bundle is returned.

    Returns
    -------
    dict or None
        Dictionary with the keys ``'constant_imputer'``, ``'median_imputer'``
        and ``'standardizer'`` when ``return_file`` is True, otherwise None.
    """
    # Load the training split
    train_paths = CONFIG_DATA['train_set_path']
    X_raw = utils.pickle_load(train_paths[0])
    y_raw = utils.pickle_load(train_paths[1])

    # Row filters; only the filtered features are needed for fitting
    X_late, y_late = clean_late_data(X_raw, y_raw)
    X_kept, _ = clean_unsecured_data(X_late, y_late)

    # Imputers first, then the standardizer on the imputed features
    constant_imputer, median_imputer = fit_imputer(data=X_kept)
    standardizer = fit_standardize(
        transform_imputer(X_kept, constant_imputer, median_imputer)
    )

    # Bundle and persist
    preprocessor = dict(
        constant_imputer=constant_imputer,
        median_imputer=median_imputer,
        standardizer=standardizer,
    )
    utils.pickle_dump(preprocessor, CONFIG_DATA['preprocessor_path'])

    return preprocessor if return_file else None
    

In [35]:
# Refit and persist the whole preprocessor bundle from the training split
preprocessor = generate_preprocessor()

X shape : (119785, 10)
y shape : (119785,)
X shape : (119173, 10)
y shape : (119173,)
data shape : (119173, 10)


For train and test data

In [36]:
def preprocess_data(type='train', return_file=True):
    """Preprocess one data split and persist the cleaned result.

    The split is loaded from ``CONFIG_DATA[f'{type}_set_path']`` and the
    cleaned features/target are dumped to ``CONFIG_DATA[f'{type}_clean_path']``.
    Row filtering (``clean_late_data`` and ``clean_unsecured_data``) is applied
    only when ``type == 'train'``; any other split goes straight to
    ``_preprocess_data``.

    Parameters
    ----------
    type : str, default 'train'
        Prefix of the config keys to use, e.g. ``'train'`` or ``'test'``.
    return_file : bool, default True
        When True the cleaned data is returned.

    Returns
    -------
    tuple or None
        ``(X_clean, y_clean)`` when ``return_file`` is True, otherwise None.
    """
    # Load the requested split
    source_paths = CONFIG_DATA[f'{type}_set_path']
    X = utils.pickle_load(source_paths[0])
    y = utils.pickle_load(source_paths[1])

    # Unusual rows are removed from the training split only
    if type == 'train':
        X, y = clean_unsecured_data(*clean_late_data(X, y))

    # The target passes through unchanged
    X_clean, y_clean = _preprocess_data(X), y

    for label, cleaned in (("X clean shape:", X_clean), ("y clean shape:", y_clean)):
        print(label, cleaned.shape)

    # Persist features and target to their configured locations
    target_paths = CONFIG_DATA[f'{type}_clean_path']
    utils.pickle_dump(X_clean, target_paths[0])
    utils.pickle_dump(y_clean, target_paths[1])

    return (X_clean, y_clean) if return_file else None

In [38]:
# Filter, transform, and persist the training split
X_train_clean, y_train_clean = preprocess_data(type = 'train')

X shape : (119785, 10)
y shape : (119785,)
X shape : (119173, 10)
y shape : (119173,)
data shape : (119173, 10)
X clean shape: (119173, 10)
y clean shape: (119173,)


In [39]:
# Transform X_test (no rows are dropped for the test split)
X_test_clean, y_test_clean = preprocess_data(type = 'test')

data shape : (30000, 10)
X clean shape: (30000, 10)
y clean shape: (30000,)


In [40]:
# Summary statistics of the training features produced by preprocess_data
X_train_clean.describe()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,119173.0,119173.0,119173.0,119173.0,119173.0,119173.0,119173.0,119173.0,119173.0,119173.0
mean,-3.350801e-17,-1.088116e-16,1.836382e-17,2.7337050000000002e-17,1.529325e-17,1.9496650000000002e-17,7.083188e-17,-7.894058e-17,1.538268e-17,5.747637e-17
std,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004
min,-0.9062235,-2.125699,-0.3512391,-0.1807954,-0.4677485,-1.651978,-0.184746,-0.9033765,-0.1940611,-0.6664416
25%,-0.821027,-0.7693667,-0.3512391,-0.1807048,-0.1829973,-0.6791933,-0.184746,-0.9033765,-0.1940611,-0.6664416
50%,-0.4707942,-0.02338383,-0.3512391,-0.1806066,-0.07508849,-0.09552263,-0.184746,-0.02070164,-0.1940611,-0.6664416
75%,0.6609987,0.7225991,-0.3512391,-0.1803506,0.07114099,0.4881481,-0.184746,0.8619733,-0.1940611,0.2358948
max,2.946689,3.842164,16.86551,167.487,218.3129,9.632322,36.13239,46.76107,34.06331,11.06393
