# **1. Data Preparation**
---

## **1.1 Read Data**
---

In [27]:
# Import library
import pandas as pd

# Load configuration
import src.utils as utils

Create a config file that stores the paths used to load and dump data, then load it.

In [28]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/credit_risk_dataset.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.3,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_cred_hist_length'],
 'cat_columns': ['person_home_ownership',
  'loan_intent',
  'loan_g

In [29]:
def read_data():
    """Read the raw CSV, report its shape, and persist it as a pickle.

    Paths come from the module-level ``CONFIG_DATA`` mapping:
    ``raw_dataset_path`` is the CSV that is read and ``dataset_path`` is
    where the loaded frame is dumped via ``utils.pickle_dump``.

    Returns
    -------
    pandas.DataFrame
        The frame parsed from the raw CSV.
    """
    # Parse the raw CSV straight from the configured location
    raw_frame = pd.read_csv(CONFIG_DATA['raw_dataset_path'])

    # Show the dimensions as a quick sanity check
    print("Data shape       :", raw_frame.shape)

    # Persist the loaded frame so later steps can reload it from disk
    utils.pickle_dump(raw_frame, CONFIG_DATA['dataset_path'])

    return raw_frame

In [30]:
# Check the function
data = read_data()
data.head()

Data shape       : (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


## **1.2 Sample Splitting**
---

- Split the input & output data and dump them
- Update the config file to contain
    - The input & output data paths
    - The output variable name
    - The input column names

In [31]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/credit_risk_dataset.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.3,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_cred_hist_length'],
 'cat_columns': ['person_home_ownership',
  'loan_intent',
  'loan_g

In [32]:
def split_input_output():
    """Split the dataset into predictors (X) and response (y), then dump both.

    The dataset is loaded from ``CONFIG_DATA['dataset_path']`` and the name
    of the response column is read from ``CONFIG_DATA['response_variable']``.
    X is pickled to ``CONFIG_DATA['predictors_set_path']`` and y to
    ``CONFIG_DATA['response_set_path']``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``y`` is the response column of the loaded data and
        ``X`` is the loaded data without that column.
    """
    # Load data
    dataset_path = CONFIG_DATA['dataset_path']
    data = utils.pickle_load(dataset_path)

    # Define y
    response_variable = CONFIG_DATA['response_variable']
    y = data[response_variable]

    # Define X; `columns=` already targets the column axis, so passing
    # `axis` as well is redundant and has been dropped.
    X = data.drop(columns=[response_variable])

    # Validate the splitting
    print('y shape :', y.shape)
    print('X shape :', X.shape)

    # Dumping
    dump_path_predictors = CONFIG_DATA['predictors_set_path']
    utils.pickle_dump(X, dump_path_predictors)

    dump_path_response = CONFIG_DATA['response_set_path']
    utils.pickle_dump(y, dump_path_response)

    return X, y

In [33]:
# Check the function
X, y = split_input_output()

y shape : (32581,)
X shape : (32581, 11)


In [34]:
X.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4


In [35]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: loan_status, dtype: int64

Next, split the predictors (X) and the response (y) into training and testing sets.
- Set `stratify = y` so that the split preserves the proportion of each class in the response y.
- Set `test_size = 0.3` to hold out 30% of the sample as the testing set.
- Set `random_state = 42` for reproducibility.

In [36]:
# Import library 
from sklearn.model_selection import train_test_split

Update the config file to include the train & test data paths and the test size.

In [37]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/credit_risk_dataset.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.3,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_cred_hist_length'],
 'cat_columns': ['person_home_ownership',
  'loan_intent',
  'loan_g

In [38]:
def split_train_test(random_state=42):
    """Split predictors and response into train and test sets, then dump them.

    X and y are loaded from ``CONFIG_DATA['predictors_set_path']`` and
    ``CONFIG_DATA['response_set_path']``. The split is stratified on y and
    the test fraction is read from ``CONFIG_DATA['test_size']``. The four
    resulting objects are pickled to the paths listed in
    ``CONFIG_DATA['train_path']`` (X, y) and ``CONFIG_DATA['test_path']``
    (X, y).

    Parameters
    ----------
    random_state : int, default 42
        Seed forwarded to ``train_test_split``. The default keeps the
        previously hard-coded value, so existing calls without arguments
        produce the same split as before.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Load the X and y
    X = utils.pickle_load(CONFIG_DATA['predictors_set_path'])
    y = utils.pickle_load(CONFIG_DATA['response_set_path'])

    # Split the data; stratifying on y keeps the response proportions
    # the same in both subsets.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y,
        test_size=CONFIG_DATA['test_size'],
        random_state=random_state,
    )

    # Validate splitting
    print('X_train shape :', X_train.shape)
    print('y_train shape :', y_train.shape)
    print('X_test shape  :', X_test.shape)
    print('y_test shape  :', y_test.shape)

    # Dump data (index 0 of each path list is X, index 1 is y)
    utils.pickle_dump(X_train, CONFIG_DATA['train_path'][0])
    utils.pickle_dump(y_train, CONFIG_DATA['train_path'][1])
    utils.pickle_dump(X_test, CONFIG_DATA['test_path'][0])
    utils.pickle_dump(y_test, CONFIG_DATA['test_path'][1])

    return X_train, X_test, y_train, y_test

In [39]:
# Check the function
X_train, X_test, y_train, y_test = split_train_test()

X_train shape : (22806, 11)
y_train shape : (22806,)
X_test shape  : (9775, 11)
y_test shape  : (9775,)
