## **4.1 Pre-processing Training Set**
---

- In this part, we preprocess the training set by replacing each value with the WOE of the bin (or category) it falls into.
- We will save the WOE-transformed training set to the path defined in the config file.

In [1]:
# Import library
import pandas as pd
import numpy as np

# Load configuration
import src.utils as utils

Update the config file to have `WOE_map_dict_path`.

In [4]:
# Load the project configuration and display it to confirm that
# `WOE_map_dict_path` is present
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/credit_risk_dataset.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.3,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_cred_hist_length'],
 'cat_columns': ['person_home_ownership',
  'loan_intent',
  'loan_grade',
  'cb_person_default_on_file'],
 'missing_col

In [7]:
# Function to generate the WOE mapping dictionary
def get_woe_map_dict():
    """Build, persist and return the WOE mapping dictionary.

    The WOE table is loaded from ``CONFIG_DATA['WOE_table_path']`` (the
    notebook-level ``CONFIG_DATA``). Its ``Characteristic``, ``Attribute``
    and ``WOE`` columns are used to build the mapping, which is then dumped
    to ``CONFIG_DATA['WOE_map_dict_path']``.

    Returns
    -------
    dict
        ``{characteristic: {attribute: woe}}`` for every characteristic in
        the table, plus the special key ``'Missing'`` holding
        ``{characteristic: woe}``, i.e. the WOE of the ``'Missing'``
        attribute of each characteristic, or ``NaN`` when the
        characteristic has no ``'Missing'`` attribute.
    """
    # Load the WOE table
    WOE_table = utils.pickle_load(CONFIG_DATA['WOE_table_path'])

    # Initialize the dictionary
    WOE_map_dict = {'Missing': {}}

    # sort=False keeps the characteristics in table order, so the resulting
    # key order is deterministic from run to run
    for char, char_rows in WOE_table.groupby('Characteristic', sort=False):
        WOE_map_dict[char] = {}

        # Set the NaN default exactly once per characteristic. Doing it on
        # every non-Missing row would overwrite a real Missing WOE whenever
        # the 'Missing' row is not the last row of the characteristic.
        WOE_map_dict['Missing'][char] = np.nan

        # Iterate over values rather than index labels so that a
        # non-unique index in the WOE table cannot break the lookup
        for attribute, woe in zip(char_rows['Attribute'], char_rows['WOE']):
            if attribute == 'Missing':
                WOE_map_dict['Missing'][char] = woe
            else:
                WOE_map_dict[char][attribute] = woe

    # Validate data
    print('Number of key : ', len(WOE_map_dict.keys()))

    # Dump
    utils.pickle_dump(WOE_map_dict, CONFIG_DATA['WOE_map_dict_path'])

    return WOE_map_dict
    

In [8]:
# Build the WOE mapping dictionary (the function also dumps it to
# `WOE_map_dict_path`) and display it for inspection
WOE_map_dict = get_woe_map_dict()
WOE_map_dict

Number of key :  12


{'Missing': {'loan_grade': nan,
  'cb_person_default_on_file': nan,
  'person_emp_length_bin': -0.4610580341267352,
  'loan_amnt_bin': nan,
  'person_age_bin': nan,
  'cb_person_cred_hist_length_bin': nan,
  'loan_percent_income_bin': nan,
  'loan_int_rate_bin': -0.005541156447753005,
  'person_income_bin': nan,
  'loan_intent': nan,
  'person_home_ownership': nan},
 'loan_grade': {'A': 0.9045844152327798,
  'B': 0.34313734184050115,
  'C': 0.09211531308112622,
  'D': -1.6427314469269856,
  'E': -1.860619952885072,
  'F': -2.1967177761366448,
  'G': -4.940074791071},
 'cb_person_default_on_file': {'N': 0.21135854292649753,
  'Y': -0.7667414008671611},
 'person_emp_length_bin': {Interval(-0.001, 2.0, closed='right'): -0.2841104601156244,
  Interval(2.0, 4.0, closed='right'): 0.06575069926209322,
  Interval(4.0, 7.0, closed='right'): 0.19589613412112988,
  Interval(7.0, 123.0, closed='right'): 0.3188670280025152},
 'loan_amnt_bin': {Interval(499.999, 5000.0, closed='right'): 0.0696975576

In [9]:
# Inspect the WOE assigned to the 'Missing' attribute of each characteristic
WOE_map_dict['Missing']

{'loan_grade': nan,
 'cb_person_default_on_file': nan,
 'person_emp_length_bin': -0.4610580341267352,
 'loan_amnt_bin': nan,
 'person_age_bin': nan,
 'cb_person_cred_hist_length_bin': nan,
 'loan_percent_income_bin': nan,
 'loan_int_rate_bin': -0.005541156447753005,
 'person_income_bin': nan,
 'loan_intent': nan,
 'person_home_ownership': nan}

- Next, transform the input data based on the map dictionary above.
- Update the config file with the path (`X_train_woe_path`) for the new data that contains the WOE values.

In [10]:
# Reload the configuration so the newly added `X_train_woe_path` entry
# is available, and display it to confirm
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/credit_risk_dataset.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.3,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_cred_hist_length'],
 'cat_columns': ['person_home_ownership',
  'loan_intent',
  'loan_g

In [16]:
# Function to replace the raw data in the train set with WOE values
def transform_woe(raw_data=None, type=None, CONFIG_DATA=None):
    """Replace every value of a predictor table with its WOE.

    Parameters
    ----------
    raw_data : pandas.DataFrame, optional
        Predictors to transform. Ignored when ``type`` is given, because
        the data is then loaded from disk instead.
    type : str, optional
        Name of a saved data set (e.g. ``'train'``). When given, the
        predictors are loaded from ``CONFIG_DATA[f'{type}_path'][0]`` and
        the transformed data is dumped to
        ``CONFIG_DATA[f'X_{type}_woe_path']``.
    CONFIG_DATA : dict, optional
        Project configuration. Loaded with ``utils.config_load()`` when
        omitted.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with the same shape, each column mapped to WOE.

    Raises
    ------
    ValueError
        If neither ``raw_data`` nor ``type`` is provided.
    """
    # Fall back to the configuration on disk instead of failing on None
    if CONFIG_DATA is None:
        CONFIG_DATA = utils.config_load()

    # Load the numerical columns
    num_cols = CONFIG_DATA['num_columns']

    # Load the WOE_map_dict
    WOE_map_dict = utils.pickle_load(CONFIG_DATA['WOE_map_dict_path'])

    # Load the saved data if type is not None
    if type is not None:
        raw_data = utils.pickle_load(CONFIG_DATA[f'{type}_path'][0])
    elif raw_data is None:
        raise ValueError("Provide either `raw_data` or `type`.")

    # Map the data
    woe_data = raw_data.copy()
    for col in woe_data.columns:
        # Numerical columns are stored in the map under their binned name
        map_col = col + '_bin' if col in num_cols else col

        mapped = woe_data[col].map(WOE_map_dict[map_col])

        # Values that found no match (missing or out-of-range) come back as
        # NaN from the mapping; give them the characteristic's Missing WOE
        woe_data[col] = mapped.fillna(value=WOE_map_dict['Missing'][map_col])

    # Validate
    print('Raw data shape : ', raw_data.shape)
    print('WOE data shape : ', woe_data.shape)

    # Dump data
    if type is not None:
        utils.pickle_dump(woe_data, CONFIG_DATA[f'X_{type}_woe_path'])

    return woe_data

In [17]:
# Transform the train set: with type='train' the predictors are loaded from
# the first entry of `train_path`, and the result is dumped to `X_train_woe_path`
X_train_woe = transform_woe(type='train', CONFIG_DATA=CONFIG_DATA)

Raw data shape :  (22806, 11)
WOE data shape :  (22806, 11)


In [18]:
# Preview the first 10 rows of the WOE-transformed train set
X_train_woe.head(10)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
11491,0.026104,0.31118,-0.506161,-0.28411,-0.323869,0.343137,0.121353,0.161476,0.184567,0.211359,-0.070043
3890,-0.119722,0.024109,0.665394,0.065751,0.281765,0.092115,0.069698,0.161476,0.663435,0.211359,0.000751
17344,0.026104,-0.862418,-0.506161,-0.28411,-0.323869,0.904584,0.121353,0.965882,-1.137536,0.211359,-0.070043
13023,0.026104,1.026844,-0.506161,-0.28411,-0.227074,0.343137,0.121353,0.45406,0.663435,0.211359,-0.070043
29565,0.068969,-0.862418,-0.506161,0.065751,-0.278307,0.343137,-0.419296,-0.005541,-1.137536,0.211359,0.028436
22677,0.047449,0.024109,-0.506161,-0.28411,0.281765,0.904584,0.340733,0.965882,0.184567,0.211359,0.028436
25029,0.047449,1.026844,0.665394,0.318867,0.281765,0.343137,0.340733,0.161476,0.743839,0.211359,0.028436
23700,0.047449,0.31118,-0.506161,0.065751,-0.278307,0.343137,0.121353,0.45406,0.663435,0.211359,0.028436
4299,-0.119722,0.024109,-0.506161,0.065751,0.281765,-1.642731,0.069698,-0.005541,0.663435,-0.766741,0.000751
29314,0.068969,0.31118,-0.506161,0.318867,0.45623,0.343137,-0.419296,0.45406,-1.137536,0.211359,0.028436
