In [1]:
# import library
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from imblearn.under_sampling import RandomUnderSampler

# Plan preprocessing

**Summary EDA**:
- Data yang `NumberOfTimes90DaysLate == 96` atau `NumberOfTimes90DaysLate == 98` di delete
- Data yang `RevolvingUtilizationOfUnsecuredLines > 1.35` di delete
- Missing value imputation 
  - `MonthlyIncome` : Median
  - `NumberOfDependents` : 0.0

## Load dataset

In [2]:
# load the dataset function
def load_dataset():
  # load train data
  X_train = joblib.load('data/output/X_train.pkl')
  y_train = joblib.load('data/output/y_train.pkl')
  
  # load valid data
  X_valid = joblib.load('data/output/X_valid.pkl')
  y_valid = joblib.load('data/output/y_valid.pkl')
  
  # load test data
  X_test = joblib.load('data/output/X_test.pkl')
  y_test = joblib.load('data/output/y_test.pkl')
  
  # print shape
  print('X_train shape :', X_train.shape)
  print('y_train shape :', y_train.shape)
  print('X_valid shape :', X_valid.shape)
  print('y_valid shape :', y_valid.shape)
  print('X_test shape :', X_test.shape)
  print('y_test shape :', y_test.shape)
  return X_train, X_valid, X_test, y_train, y_valid, y_test

In [3]:
# running load dataset function
X_train, X_valid, X_test, y_train, y_valid, y_test = load_dataset()

X_train shape : (96000, 10)
y_train shape : (96000,)
X_valid shape : (24000, 10)
y_valid shape : (24000,)
X_test shape : (30000, 10)
y_test shape : (30000,)


## Preprocess Train

1. Drop values in `NumberOfTimes90DaysLate` bigger than or equal to 96

In [4]:
# clean late data function
def clean_late_data(X, y):
  """Function to clean NumberOfTimes90DaysLate columns"""
  # find index drop
  drop_condition = X['NumberOfTimes90DaysLate'] >= 96
  index_to_drop = X[drop_condition].index.tolist()
  
  # drop data
  X_drop = X.drop(index = index_to_drop)
  y_drop = y.drop(index = index_to_drop)
  
  # print 
  print('X shape :', X_drop.shape)
  print('y shape :', y_drop.shape)
  
  return X_drop, y_drop

In [5]:
# running the claen date data in data train
X_train_drop_1, y_train_drop_1 = clean_late_data(X_train, y_train)

X shape : (95838, 10)
y shape : (95838,)


2. Data yang `RevolvingUtilizationOfUnsecuredLines > 1.35` di delete

In [6]:
# clean unsecured data function
def clean_unsecured_data(X, y):
  """Function to clean RevolvingUtilizationOfUnsecuredLines columns from outlier"""
  q1, q3 = np.quantile(X['RevolvingUtilizationOfUnsecuredLines'], q = [0.25, 0.75])
  iqr = q3 - q1
  ub = q3 + 1.5*iqr
  lb = q1 - 1.5*iqr
  
  # filter data
  drop_condition_1 = X['RevolvingUtilizationOfUnsecuredLines'] > ub
  drop_condition_2 = X['RevolvingUtilizationOfUnsecuredLines'] < lb
  index_to_drop = X[drop_condition_1 | drop_condition_2].index.tolist()
  
  # drop data 
  X_drop = X.drop(index = index_to_drop)
  y_drop = y.drop(index = index_to_drop)
  
  # print
  print('X shape :', X_drop.shape)
  print('y shape :', y_drop.shape)
  
  return X_drop, y_drop

In [7]:
# running the clean unsecured data function
X_train_drop_2, y_train_drop_2 = clean_unsecured_data(X_train_drop_1, y_train_drop_1)

X shape : (95350, 10)
y shape : (95350,)


3. Missing value imputation 
  - `MonthlyIncome` : Median
  - `NumberOfDependents` : 0.0

In [8]:
# fit imputer function
def fit_imputer(data):
  """Function to fit imputer (constant & median)"""
  # create imputer zero and median
  constant_imputer = SimpleImputer(missing_values = np.nan,
                                   strategy= 'constant',
                                   fill_value= 0.0)
  
  median_imputer = SimpleImputer(missing_values = np.nan,
                                 strategy = 'median')
  
  # fit imputer
  constant_imputer.fit(data[['NumberOfDependents']])
  median_imputer.fit(data[['MonthlyIncome']])
  
  # dump imputer 
  joblib.dump(constant_imputer, 'data/output/constant_imputer.pkl')
  joblib.dump(median_imputer, 'data/output/median_imputer.pkl')
  
  return constant_imputer, median_imputer

In [9]:
# running the fit imputer function
constant_imputer, median_imputer = fit_imputer(data = X_train_drop_2)

In [10]:
# show number of missing value in X_train_drop_2
X_train_drop_2.isna().sum()

RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           18752
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       2466
dtype: int64

In [11]:
# transform imputer function
def transform_imputer(data, constant_imputer, median_imputer):
  """Function to transform imputer"""
  data = data.copy()
  
  # build impute constant and median variable
  impute_constant = constant_imputer.transform(data[['NumberOfDependents']])
  impute_median = median_imputer.transform(data[['MonthlyIncome']])
  
  # applied impute constant and impute median in NumberOfDependents and MonthlyIncome 
  data['NumberOfDependents'] = impute_constant
  data['MonthlyIncome'] = impute_median
  
  # print shape of data
  print('data shape :', data.shape)
  
  return data

In [12]:
# running the tranform imputer function
X_train_imputed = transform_imputer(data=X_train_drop_2,
                                    constant_imputer = constant_imputer,
                                    median_imputer = median_imputer)

data shape : (95350, 10)


In [13]:
# data train don't have missing values
X_train_imputed.isna().sum()

RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

4. Standardize data

In [14]:
# standardize function
def fit_standardize(data):
  """Find standardizer data"""
  standardizer = StandardScaler()
  
  # fit standardizer
  standardizer.fit(data)
  
  # dump standardizer 
  joblib.dump(standardizer, 'data/output/standardizer.pkl')
  
  return standardizer

In [15]:
# fit standardizer 
standardizer = fit_standardize(data=X_train_imputed)

In [16]:
# show standardizer
standardizer

In [17]:
# transform standardize function
def transform_standardize(data, standardizer):
  """Function to standardizer data"""
  data_standard = pd.DataFrame(standardizer.transform(data))
  data_standard.columns = data.columns
  data_standard.index = data.index
  return data_standard

In [18]:
# Transform standardize
X_train_std = transform_standardize(data = X_train_imputed,
                                     standardizer = standardizer)

In [19]:
# show X_train_std 
X_train_std

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
145776,-0.660534,0.516647,-0.351172,-0.170463,-0.433419,0.293983,-0.185559,0.863388,-0.194653,-0.665523
61313,-0.885124,-0.297311,-0.351172,-0.173321,0.234476,1.461602,-0.185559,0.863388,-0.194653,-0.665523
128371,-0.769224,0.652307,-0.351172,2.929988,-0.074690,2.045412,-0.185559,2.629354,-0.194653,-0.665523
48243,-0.760501,-1.314759,-0.351172,2.214684,-0.074690,0.099379,-0.185559,1.746371,-0.194653,-0.665523
54026,-0.465572,-0.161651,-0.351172,-0.173276,-0.177184,0.099379,-0.185559,-0.019595,-0.194653,-0.665523
...,...,...,...,...,...,...,...,...,...,...
49714,1.962636,-1.993058,-0.351172,-0.173322,-0.371556,-1.457447,-0.185559,-0.902578,-0.194653,-0.665523
87940,-0.307005,-0.025992,-0.351172,0.784075,-0.074690,-0.095224,-0.185559,-0.019595,-0.194653,-0.665523
138289,1.408749,-0.907780,-0.351172,0.460108,-0.074690,0.099379,-0.185559,-0.902578,-0.194653,-0.665523
78494,-0.265850,0.380988,-0.351172,-0.173402,0.028024,-0.484431,-0.185559,-0.902578,-0.194653,0.238818


In [20]:
# statistical summary in X_train_std
X_train_std.describe()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,95350.0,95350.0,95350.0,95350.0,95350.0,95350.0,95350.0,95350.0,95350.0,95350.0
mean,8.395545e-17,-5.305783e-17,3.8153950000000005e-17,-4.471166e-18,2.2132270000000003e-17,-1.080532e-16,-3.87501e-18,7.515284e-17,-3.5918360000000006e-17,-6.326699e-17
std,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005
min,-0.9054568,-2.128718,-0.3511723,-0.1734617,-0.4700236,-1.65205,-0.1855588,-0.9025782,-0.1946529,-0.6655234
25%,-0.820495,-0.7721203,-0.3511723,-0.1733744,-0.1844315,-0.6790339,-0.1855588,-0.9025782,-0.1946529,-0.6655234
50%,-0.4718411,-0.02599161,-0.3511723,-0.1732795,-0.07468973,-0.09522398,-0.1855588,-0.01959509,-0.1946529,-0.6655234
75%,0.6590006,0.720137,-0.3511723,-0.173033,0.07173021,0.4885859,-0.1855588,0.863388,-0.1946529,0.2388183
max,2.940152,3.840311,16.93327,161.5339,219.8005,9.634941,31.96411,46.77851,34.04343,11.09092


Balancing data

In [21]:
# check balancing data
y_train_drop_2.value_counts(normalize=True)

0    0.93526
1    0.06474
Name: SeriousDlqin2yrs, dtype: float64

Resampling using downsampling (only for training data)

In [22]:
# random undersampler function
def random_undersampler(X, y):
  """Function to under sample the majority data"""
  # Create resampling object
  ros = RandomUnderSampler(random_state = 42)
  
  # Balancing the set data
  X_resample, y_resample = ros.fit_resample(X, y)
  
  # print
  print('Distribution before resampling :')
  print(y.value_counts())
  print('Distribution after resampling :')
  print(y_resample.value_counts())
  
  return X_resample, y_resample

In [23]:
# running random undersampler function
X_train_clean, y_train_clean = random_undersampler(X_train_std, y_train_drop_2)

Distribution before resampling :
0    89177
1     6173
Name: SeriousDlqin2yrs, dtype: int64
Distribution after resampling :
0    6173
1    6173
Name: SeriousDlqin2yrs, dtype: int64


Drop all preprocessor

In [24]:
preprocessor = {
  'constant_imputer' : constant_imputer,
  'median_imputer' : median_imputer,
  'standardizer' : standardizer
}

joblib.dump(preprocessor, 'data/output/preprocessor.pkl')

['data/output/preprocessor.pkl']

# Preprocess all

In [25]:
# clean data function
def clean_data(data, constant_imputer, median_imputer, standardizer):
  """Function to clean data"""
  # impute missing value
  data_imputed = transform_imputer(data, constant_imputer, median_imputer)
  
  # standardize data
  data_standard = transform_standardize(data_imputed, standardizer)
  
  return data_standard

In [26]:
# _preprocess_data function
def _preprocess_data(data):
  """Function to preprocess data"""
  # load preprocess
  preprocessor = joblib.load('data/output/preprocessor.pkl')
  constant_imputer = preprocessor['constant_imputer']
  median_imputer = preprocessor['median_imputer']
  standardizer = preprocessor['standardizer']
  
  data_clean = clean_data(data,
                          constant_imputer,
                          median_imputer,
                          standardizer)
  
  return data_clean

## preprocess all

Generate preprocessor

In [27]:
# generate preprocessor function
def generate_preprocessor():
  """Function to generate preprocessor"""
  # load data 
  X = joblib.load('data/output/X_train.pkl')
  y = joblib.load('data/output/y_train.pkl')
  
  # drop unusual data
  X, y = clean_late_data(X, y)
  X, y = clean_unsecured_data(X, y)
  
  # Generate preprocessor: imputer
  constant_imputer, median_imputer = fit_imputer(data = X)
  X_imputed = transform_imputer(X, constant_imputer, median_imputer)
  
  # Generate preprocessor: standardizer
  standardizer = fit_standardize(X_imputed)
  
  # dump file = 
  preprocessor = {
    'constant_imputer' : constant_imputer,
    'median_imputer' : median_imputer,
    'standardizer' : standardizer
  }
  
  joblib.dump(preprocessor, 'data/output/preprocessor.pkl')
  
  return preprocessor

In [28]:
# running generate preprocessor function
preprocessor = generate_preprocessor()

X shape : (95838, 10)
y shape : (95838,)
X shape : (95350, 10)
y shape : (95350,)
data shape : (95350, 10)


# for train

In [29]:
# preprocess data function
def preprocess_data(type='train'):
  """Function to preprocess train data"""
  # load data
  X = joblib.load(f'data/output/X_{type}.pkl')
  y = joblib.load(f'data/output/y_{type}.pkl')
  
  # drop unusual data
  X, y = clean_late_data(X, y)
  X, y = clean_unsecured_data(X, y)
  
  # preprocess data = 
  X_clean = _preprocess_data(X)
  y_clean = y
  
  # Undersampling
  X_clean, y_clean = random_undersampler(X_clean, y_clean)
  
  # print shape
  print('X clean shape :', X_clean.shape)
  print('y clean shape :', y_clean.shape)
  
  joblib.dump(X_clean, f'data/output/X_{type}_clean.pkl')
  joblib.dump(y_clean, f'data/output/y_{type}_clean.pkl')
  
  return X_clean, y_clean

In [30]:
# transform X_train
X_train_clean, y_train_clean = preprocess_data(type='train')

X shape : (95838, 10)
y shape : (95838,)
X shape : (95350, 10)
y shape : (95350,)
data shape : (95350, 10)
Distribution before resampling :
0    89177
1     6173
Name: SeriousDlqin2yrs, dtype: int64
Distribution after resampling :
0    6173
1    6173
Name: SeriousDlqin2yrs, dtype: int64
X clean shape : (12346, 10)
y clean shape : (12346,)


In [31]:
# transform X_valid
X_valid_clean, y_valid_clean = preprocess_data(type='valid')

X shape : (23947, 10)
y shape : (23947,)
X shape : (23821, 10)
y shape : (23821,)
data shape : (23821, 10)
Distribution before resampling :
0    22235
1     1586
Name: SeriousDlqin2yrs, dtype: int64
Distribution after resampling :
0    1586
1    1586
Name: SeriousDlqin2yrs, dtype: int64
X clean shape : (3172, 10)
y clean shape : (3172,)


In [32]:
# transform X_test
X_test_clean, y_test_clean = preprocess_data(type='test')

X shape : (29946, 10)
y shape : (29946,)
X shape : (29790, 10)
y shape : (29790,)
data shape : (29790, 10)
Distribution before resampling :
0    27908
1     1882
Name: SeriousDlqin2yrs, dtype: int64
Distribution after resampling :
0    1882
1    1882
Name: SeriousDlqin2yrs, dtype: int64
X clean shape : (3764, 10)
y clean shape : (3764,)


In [33]:
# show list of X_train_clean column
X_train_clean.columns.tolist()

['RevolvingUtilizationOfUnsecuredLines',
 'age',
 'NumberOfTime30-59DaysPastDueNotWorse',
 'DebtRatio',
 'MonthlyIncome',
 'NumberOfOpenCreditLinesAndLoans',
 'NumberOfTimes90DaysLate',
 'NumberRealEstateLoansOrLines',
 'NumberOfTime60-89DaysPastDueNotWorse',
 'NumberOfDependents']