In [49]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np 
import joblib
import os
import yaml

### Data Collection

In [50]:
# Load the dataset from data/gmsc.csv into a DataFrame.
data = pd.read_csv("data/gmsc.csv")

In [51]:
# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [52]:
# Drop every row that has at least one missing value.
# NOTE(review): this overwrites `data`, so the unfiltered frame is no longer
# available after this cell; re-run the read_csv cell to get it back.
data = data.dropna()

In [53]:
# Summary statistics of the numeric columns after dropping missing values.
data.describe()

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0
mean,75026.012514,0.069486,5.899873,51.289792,0.381769,26.598777,6670.221,8.758475,0.211925,1.054519,0.187829,0.851832
std,43286.029117,0.25428,257.040685,14.426684,3.499234,424.446457,14384.67,5.172835,3.465276,1.149273,3.447901,1.148391
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37678.0,0.0,0.035084,40.0,0.0,0.143388,3400.0,5.0,0.0,0.0,0.0,0.0
50%,74969.0,0.0,0.177282,51.0,0.0,0.296023,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112494.0,0.0,0.579428,61.0,0.0,0.482559,8249.0,11.0,0.0,2.0,0.0,2.0
max,150000.0,1.0,50708.0,103.0,98.0,61106.5,3008750.0,58.0,98.0,54.0,98.0,20.0


In [54]:
# Column dtypes and non-null counts after dropping missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120269 entries, 0 to 149999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   ID                                    120269 non-null  int64  
 1   SeriousDlqin2yrs                      120269 non-null  int64  
 2   RevolvingUtilizationOfUnsecuredLines  120269 non-null  float64
 3   Age                                   120269 non-null  int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  120269 non-null  int64  
 5   DebtRatio                             120269 non-null  float64
 6   MonthlyIncome                         120269 non-null  float64
 7   NumberOfOpenCreditLinesAndLoans       120269 non-null  int64  
 8   NumberOfTimes90DaysLate               120269 non-null  int64  
 9   NumberRealEstateLoansOrLines          120269 non-null  int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  120269 non-null  int64  
 11  

In [55]:
# Display the frame (pandas renders a truncated head/tail view).
data

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
149994,149995,0,0.385742,50,0,0.404293,3400.0,7,0,0,0,0.0
149995,149996,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149996,149997,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149998,149999,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


In [56]:
# Save the combined dataset.
# Create the output directory first so the dump also succeeds on a fresh
# checkout where data/processed/ does not exist yet.
os.makedirs("data/processed", exist_ok=True)
joblib.dump(data, "data/processed/dataset.pkl")

['data/processed/dataset.pkl']

### Data Definition

In [57]:
# Define the data type, value range, and a description for each observation (variable).

In [58]:
# Summary statistics used as the reference for the value ranges listed below.
data.describe()

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0
mean,75026.012514,0.069486,5.899873,51.289792,0.381769,26.598777,6670.221,8.758475,0.211925,1.054519,0.187829,0.851832
std,43286.029117,0.25428,257.040685,14.426684,3.499234,424.446457,14384.67,5.172835,3.465276,1.149273,3.447901,1.148391
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37678.0,0.0,0.035084,40.0,0.0,0.143388,3400.0,5.0,0.0,0.0,0.0,0.0
50%,74969.0,0.0,0.177282,51.0,0.0,0.296023,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112494.0,0.0,0.579428,61.0,0.0,0.482559,8249.0,11.0,0.0,2.0,0.0,2.0
max,150000.0,1.0,50708.0,103.0,98.0,61106.5,3008750.0,58.0,98.0,54.0,98.0,20.0


SeriousDlqin2yrs         :
    [integer]
    [0,1]
    Target label used to determine whether a borrower is good:
    whether the person experienced 90 days past due delinquency or worse.
    
RevolvingUtilizationOfUnsecuredLines         :
    [float]
    [0 - 50,708]
    Total balance on credit cards and personal lines of credit (excluding real estate and installment debt such as car loans) divided by the sum of credit limits
  
Age  :
    [integer]
    [0 - 109]
    Age of borrower in years
    
NumberOfTime30-59DaysPastDueNotWorse     :
    [integer]
    [0 - 98]
    Number of times borrower has been 30-59 days past due but no worse in the last 2 years.

DebtRatio   :
    [float] (percentage)
    [0 - 329664]
    Monthly debt payments, alimony, and living costs divided by monthly gross income

MonthlyIncome  :
    [float]
    [0 - 3,008,750]
    monthly income of borrower

NumberOfOpenCreditLinesAndLoans  :
    [integer]
    [0 - 58]
    Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards)

NumberOfTimes90DaysLate  :
    [integer]
    [0 - 98]
    Number of times borrower has been 90 days or more past due.

NumberRealEstateLoansOrLines  :
    [integer]
    [0 - 54]
    Number of mortgage and real estate loans including home equity lines of credit

NumberOfTime60-89DaysPastDueNotWorse  :
    [integer]
    [0 - 98]
    Number of times borrower has been 60-89 days past due but no worse in the last 2 years.

NumberOfDependents  :
    [float] (counts stored as float64)
    [0 - 20]
    Number of dependents in family excluding themselves (spouse, children etc.)

### Data Validation

#### Data Types

In [59]:
# Check the data types.
data.dtypes

ID                                        int64
SeriousDlqin2yrs                          int64
RevolvingUtilizationOfUnsecuredLines    float64
Age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysPastDueNotWorse      int64
NumberOfDependents                      float64
dtype: object

#### Range

In [60]:
# Range checks become unreliable if the data types are not as expected.
data.describe()

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0
mean,75026.012514,0.069486,5.899873,51.289792,0.381769,26.598777,6670.221,8.758475,0.211925,1.054519,0.187829,0.851832
std,43286.029117,0.25428,257.040685,14.426684,3.499234,424.446457,14384.67,5.172835,3.465276,1.149273,3.447901,1.148391
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37678.0,0.0,0.035084,40.0,0.0,0.143388,3400.0,5.0,0.0,0.0,0.0,0.0
50%,74969.0,0.0,0.177282,51.0,0.0,0.296023,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112494.0,0.0,0.579428,61.0,0.0,0.482559,8249.0,11.0,0.0,2.0,0.0,2.0
max,150000.0,1.0,50708.0,103.0,98.0,61106.5,3008750.0,58.0,98.0,54.0,98.0,20.0


#### Data Dimensions

In [61]:
# (rows, columns) of the frame after dropping missing values.
data.shape

(120269, 12)

In [62]:
# Column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120269 entries, 0 to 149999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   ID                                    120269 non-null  int64  
 1   SeriousDlqin2yrs                      120269 non-null  int64  
 2   RevolvingUtilizationOfUnsecuredLines  120269 non-null  float64
 3   Age                                   120269 non-null  int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  120269 non-null  int64  
 5   DebtRatio                             120269 non-null  float64
 6   MonthlyIncome                         120269 non-null  float64
 7   NumberOfOpenCreditLinesAndLoans       120269 non-null  int64  
 8   NumberOfTimes90DaysLate               120269 non-null  int64  
 9   NumberRealEstateLoansOrLines          120269 non-null  int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  120269 non-null  int64  
 11  

In [63]:
# Display the frame (pandas renders a truncated head/tail view).
data

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
149994,149995,0,0.385742,50,0,0.404293,3400.0,7,0,0,0,0.0
149995,149996,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149996,149997,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149998,149999,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


In [64]:
# Save the validated dataset.
# NOTE(review): `data` has not been modified since it was dumped to
# data/processed/dataset.pkl, so both files currently hold the same frame.
joblib.dump(data, "data/processed/dataset_clean.pkl")

['data/processed/dataset_clean.pkl']

In [65]:
# List the column names (reference for the predictor list used when splitting).
data.columns

Index(['ID', 'SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'Age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

### Data Defense

In [66]:
def check_data(input_data, params):
    """Validate the dtypes and value ranges of a DataFrame against a config.

    The range checks are driven entirely by the ``range_<column>`` keys found
    in ``params`` rather than by hard-coded column names, so the function works
    for any dataset, including columns whose names are not valid Python
    identifiers (e.g. ``NumberOfTime30-59DaysPastDueNotWorse``).

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame to validate.
    params : dict
        Validation config. Keys read here:

        * ``"float_columns"``: expected names of the float columns, in the
          same order as they appear in ``input_data``.
        * ``"int32_columns"``: expected names of the integer columns, in the
          same order as they appear in ``input_data``.
        * ``"range_<column>"`` (any number): allowed values for ``<column>``.
          For a numeric column and a two-element value it is read as an
          inclusive ``[min, max]`` interval; in every other case it is read
          as the collection of allowed values.

        Keys that do not start with ``"range_"`` are ignored by the range
        checks. A ``range_<column>`` key whose column is absent from
        ``input_data`` is skipped, so the same config can validate either the
        full frame or a subset of its columns.

    Raises
    ------
    AssertionError
        If the float/int column lists do not match, or if any value of a
        checked column falls outside its configured range.
    KeyError
        If ``"float_columns"`` or ``"int32_columns"`` is missing from ``params``.
    """
    # check data types
    assert input_data.select_dtypes("float").columns.to_list() == params["float_columns"], "an error occurs in float column(s)."
    # NOTE(review): the config key says int32 while the message says int64;
    # both are kept unchanged so existing configs and log matching keep working.
    assert input_data.select_dtypes("int").columns.to_list() == params["int32_columns"], "an error occurs in int64 column(s)."

    # check range of data
    prefix = "range_"
    for key, allowed in params.items():
        if not key.startswith(prefix):
            continue

        column = key[len(prefix):]
        if column not in input_data.columns:
            # Missing numeric columns are already caught by the dtype checks above.
            continue

        values = input_data[column]
        if pd.api.types.is_numeric_dtype(values) and len(allowed) == 2:
            # Inclusive [min, max] interval.
            in_range = values.between(allowed[0], allowed[1]).all()
        else:
            # Explicit collection of allowed values (e.g. category labels).
            in_range = set(values).issubset(set(allowed))

        assert in_range, f"an error occurs in {column} range."

### Data Splitting

In [67]:
# Columns used as features. The ID column and the SeriousDlqin2yrs target are
# left out; the order of this list sets the column order of the feature frame.
predictors = [
    "RevolvingUtilizationOfUnsecuredLines",
    "Age",
    "NumberOfTime30-59DaysPastDueNotWorse",
    "DebtRatio",
    "MonthlyIncome",
    "NumberOfOpenCreditLinesAndLoans",
    "NumberRealEstateLoansOrLines",
    "NumberOfDependents",
    "NumberOfTimes90DaysLate",
    "NumberOfTime60-89DaysPastDueNotWorse",
]

In [68]:
# Separate the features (x) from the label (y).
# Copies are taken so later changes to x / y never touch `data`.
x = data.loc[:, predictors].copy()
y = data["SeriousDlqin2yrs"].copy()

In [69]:
# Confirm the feature frame holds exactly the predictor columns, with no nulls.
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120269 entries, 0 to 149999
Data columns (total 10 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   RevolvingUtilizationOfUnsecuredLines  120269 non-null  float64
 1   Age                                   120269 non-null  int64  
 2   NumberOfTime30-59DaysPastDueNotWorse  120269 non-null  int64  
 3   DebtRatio                             120269 non-null  float64
 4   MonthlyIncome                         120269 non-null  float64
 5   NumberOfOpenCreditLinesAndLoans       120269 non-null  int64  
 6   NumberRealEstateLoansOrLines          120269 non-null  int64  
 7   NumberOfDependents                    120269 non-null  float64
 8   NumberOfTimes90DaysLate               120269 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  120269 non-null  int64  
dtypes: float64(4), int64(6)
memory usage: 10.1 MB


In [70]:
# Class distribution of the label; used to judge how imbalanced the target is.
y.value_counts()

0    111912
1      8357
Name: SeriousDlqin2yrs, dtype: int64

In [71]:
# First split: 60% train, 40% held out (temporarily named x_test / y_test).
# Stratified on y so both parts keep the same class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

In [72]:
# Second split: divide the 40% holdout into validation (80% of it) and test
# (20% of it), i.e. roughly 32% / 8% of all rows.
# NOTE(review): confirm this 80/20 ratio is intended rather than an even split.
# NOTE(review): x_test / y_test are both the input and an output of this cell,
# so re-running it without re-running the previous cell shrinks the test set again.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_test,
    y_test,
    test_size=0.2,
    random_state=42,
    stratify=y_test,
)

In [73]:
# Persist each split as its own pickle under data/processed/.
# The final call is the cell's last expression, so its return value (the list
# of written file names) is what the notebook displays.
joblib.dump(x_train, "data/processed/x_train.pkl")
joblib.dump(y_train, "data/processed/y_train.pkl")
joblib.dump(x_valid, "data/processed/x_valid.pkl")
joblib.dump(y_valid, "data/processed/y_valid.pkl")
joblib.dump(x_test, "data/processed/x_test.pkl")
joblib.dump(y_test, "data/processed/y_test.pkl")

['data/processed/y_test.pkl']