In [101]:
# import needed modules
import sys
from pathlib import Path 

# The project root is the parent of the current working directory.
project_root = Path(".").resolve().parent
# Prepend the root to the module search path so `src.*` can be imported from this notebook.
sys.path[:0] = [str(project_root)]

from src.utils import load_data, split_input_output, split_train_test, serialize_data, deserialize_data

# **1. Load Data**

In [102]:
# Location of the raw credit-risk CSV, built from the project root.
FNAME = str(project_root.joinpath('data', 'raw', 'credit_risk_dataset.csv'))

# Read the raw file through the project helper and preview the first rows.
data = load_data(fname=FNAME)
data.head(n=5)

Data Shape: (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


# **2. Split Input and Output**

In [103]:
# Name of the label column that is separated from the features.
TARGET_COL = 'loan_status'

# X receives the feature columns and y the target column of `data`.
X, y = split_input_output(
    target_col=TARGET_COL,
    data=data,
)

Original data shape: (32581, 12)
X data shape: (32581, 11)
y data shape: (32581,)


In [104]:
# Preview the first rows of the feature frame to confirm the target column is gone.
X.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4


In [105]:
# Preview the first target values to confirm y carries the `loan_status` labels.
y.head(n=5)

0    1
1    0
2    1
3    1
4    1
Name: loan_status, dtype: int64

# **3. Split Train, Validation, and Test**

In [106]:
# First split: keep 80% of the rows for training and set 20% aside as a hold-out
# (`*_not_train`) that is divided into validation and test sets in the next step.
# The fixed random_state keeps the split reproducible.
X_train, X_not_train, y_train, y_not_train = split_train_test(
    X=X, y=y, random_state=42, test_size=0.2
)

X train shape: (26064, 11)
X test shape: (6517, 11)
y train shape: (26064,)
y test shape: (6517,)


In [107]:
# Second split: halve the 20% hold-out into validation and test sets,
# i.e. roughly 10% of the original rows each.
X_valid, X_test, y_valid, y_test = split_train_test(
    X=X_not_train, y=y_not_train, random_state=42, test_size=0.5
)

X train shape: (3258, 11)
X test shape: (3259, 11)
y train shape: (3258,)
y test shape: (3259,)


# **4. Serialize data**

In [108]:
# Persist every split to data/interim, one file per split named `<key>.pkl`.
data_dict = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
    'X_valid': X_valid,
    'y_valid': y_valid
}

# Make sure the target folder exists before writing (e.g. on a fresh clone);
# this is a no-op when the folder is already there.
interim_dir = project_root/'data'/'interim'
interim_dir.mkdir(parents=True, exist_ok=True)

# NOTE: the loop variable is deliberately not called `data`. That name holds the raw
# DataFrame loaded in the first step, and rebinding it here would leave `data` pointing
# at the last split, breaking any re-run of the earlier cells without a kernel restart.
for name, split in data_dict.items():
    serialize_data(data=split, path=str(interim_dir/f'{name}.pkl'))

Saving object. . .
Your object has been successfully saved and stored into: /home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/X_train.pkl

Saving object. . .
Your object has been successfully saved and stored into: /home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/y_train.pkl

Saving object. . .
Your object has been successfully saved and stored into: /home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/X_test.pkl

Saving object. . .
Your object has been successfully saved and stored into: /home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/y_test.pkl

Saving object. . .
Your object has been successfully saved and stored into: /home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/X_valid.pkl

Saving object. . .
Your object has been successfully saved and stored into: /home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/y_valid.pkl



# **5. Deserialize data**

In [109]:
# Reload the serialized test features to verify the save/load round trip.
# A dedicated name is used instead of `data` so the raw DataFrame loaded in the
# first step is not overwritten (re-running earlier cells would otherwise fail).
X_test_loaded = deserialize_data(path=str(project_root/'data'/'interim'/'X_test.pkl'))

# The reloaded frame must match the in-memory split it was written from.
assert X_test_loaded.equals(X_test), "X_test.pkl does not match the in-memory X_test"

X_test_loaded.head()

Load object. . .
/home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/X_test.pkl has been successfully loaded!.


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
20439,30,31000,RENT,7.0,MEDICAL,D,4750,14.84,0.15,N,9
30809,36,60000,RENT,3.0,EDUCATION,A,9500,7.14,0.16,N,12
22978,29,33000,RENT,5.0,HOMEIMPROVEMENT,A,8000,5.79,0.24,N,8
19435,27,40000,RENT,3.0,PERSONAL,A,3000,7.05,0.07,N,8
31011,39,56000,RENT,16.0,DEBTCONSOLIDATION,C,10000,13.06,0.18,N,11
