### **3.1 Pre-processing Training Set**
---

- In this section we preprocess the train set by replacing each raw value with the WOE of the bin (or category) it falls into.
- The resulting train WOE dataset is saved to the path specified in the configuration file.

In [301]:
# Import library
import pandas as pd
import numpy as np

# Load configuration
import src.utils as utils

Before running the cells below, update the configuration file so that it contains the `WOE_map_dict_path` entry.

In [302]:
# Load the project configuration (file paths, target variable, column lists)
# and display it to confirm the entries used below are present.
CONFIG_DATA = utils.load_config()
CONFIG_DATA

{'raw_data_path': 'data/raw/credit_dataset.csv',
 'data_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'target_set_path': 'data/output/target.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'target_variable': 'Credit_Score',
 'test_size': 0.3,
 'num_columns': ['Age',
  'Annual_Income',
  'Num_of_Loan',
  'Num_of_Delayed_Payment',
  'Outstanding_Debt',
  'Monthly_Inhand_Salary',
  'Num_Credit_Inquiries',
  'Credit_Utilization_Ratio',
  'Total_EMI_per_month',
  'Num_Bank_Accounts',
  'Num_C

In [303]:
# Mechanism for producing the WOE mapping dictionary
def get_woe_map_dict():
    """Build, persist and return the WOE mapping dictionary.

    The WOE table is read from ``CONFIG_DATA['WOE_table_path']`` and reshaped
    into a nested dictionary:

    - ``WOE_map_dict[characteristic][attribute]`` is the WOE of every
      attribute other than ``'Missing'``;
    - ``WOE_map_dict['Missing'][characteristic]`` is the WOE of that
      characteristic's ``'Missing'`` attribute, or ``np.nan`` when the
      characteristic has no ``'Missing'`` row.

    The dictionary is pickled to ``CONFIG_DATA['WOE_map_dict_path']``.

    Returns
    -------
    dict
        The WOE mapping dictionary described above.
    """
    # Load the WOE table
    WOE_table = utils.load_pickle(CONFIG_DATA['WOE_table_path'])

    # 'Missing' collects the per-characteristic WOE of the 'Missing' attribute.
    WOE_map_dict = {'Missing': {}}

    # Single pass over the rows. Iterating the column values (instead of
    # looking rows up by index label) keeps this correct even when the table
    # index contains duplicate labels, and yields a deterministic key order.
    rows = zip(WOE_table['Characteristic'],
               WOE_table['Attribute'],
               WOE_table['WOE'])

    for char, attribute, woe in rows:
        WOE_map_dict.setdefault(char, {})

        # Default the 'Missing' WOE to NaN exactly once per characteristic.
        # Resetting it on every non-missing row would wipe out a real
        # 'Missing' WOE whenever the 'Missing' row is not the last one seen.
        WOE_map_dict['Missing'].setdefault(char, np.nan)

        if attribute == 'Missing':
            WOE_map_dict['Missing'][char] = woe
        else:
            WOE_map_dict[char][attribute] = woe

    # Validation of data
    print('Number of key : ', len(WOE_map_dict.keys()))

    # Dump
    utils.dump_pickle(WOE_map_dict, CONFIG_DATA['WOE_map_dict_path'])

    return WOE_map_dict

In [304]:
# Build the WOE mapping dictionary (the function also pickles it to
# CONFIG_DATA['WOE_map_dict_path']) and display it for inspection.
WOE_map_dict = get_woe_map_dict()
WOE_map_dict

Number of key :  23


{'Missing': {'Delay_from_due_date_bin': nan,
  'Type_of_Loan': nan,
  'Annual_Income_bin': nan,
  'Num_Credit_Card_bin': 0.013484856124403253,
  'Num_of_Loan_bin': -0.19956724176059132,
  'Occupation': nan,
  'Changed_Credit_Limit_bin': nan,
  'Num_Bank_Accounts_bin': 0.6116690188345607,
  'Total_EMI_per_month_bin': nan,
  'Credit_Utilization_Ratio_bin': nan,
  'Num_of_Delayed_Payment_bin': 0.5292980214014331,
  'Credit_History_Age_bin': nan,
  'Monthly_Inhand_Salary_bin': nan,
  'Amount_invested_monthly_bin': nan,
  'Monthly_Balance_bin': nan,
  'Payment_Behaviour': nan,
  'Outstanding_Debt_bin': nan,
  'Num_Credit_Inquiries_bin': 0.0846122001399875,
  'Payment_of_Min_Amount': nan,
  'Age_bin': 0.12383291329326879,
  'Interest_Rate_bin': 0.1481256058623133,
  'Credit_Mix': nan},
 'Delay_from_due_date_bin': {Interval(-4.001, 10.0, closed='right'): 1.268629139623019,
  Interval(10.0, 19.0, closed='right'): 0.3544004036787608,
  Interval(19.0, 29.0, closed='right'): -0.4250815615923199,


In [305]:
# Reload the configuration and display it again.
# NOTE(review): identical to the earlier config-loading cell; presumably re-run
# to pick up edits made to the configuration file in between — confirm whether
# this second load is still needed.
CONFIG_DATA = utils.load_config()
CONFIG_DATA

{'raw_data_path': 'data/raw/credit_dataset.csv',
 'data_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'target_set_path': 'data/output/target.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'target_variable': 'Credit_Score',
 'test_size': 0.3,
 'num_columns': ['Age',
  'Annual_Income',
  'Num_of_Loan',
  'Num_of_Delayed_Payment',
  'Outstanding_Debt',
  'Monthly_Inhand_Salary',
  'Num_Credit_Inquiries',
  'Credit_Utilization_Ratio',
  'Total_EMI_per_month',
  'Num_Bank_Accounts',
  'Num_C

In [306]:
# function to insert WOE values in place of the train set's raw data
def transform_woe(raw_data=None, type=None, CONFIG_DATA=None):
    """Replace the raw predictor values with their WOE values.

    Parameters
    ----------
    raw_data : pandas.DataFrame, optional
        Data to transform. Ignored when `type` is given, because the stored
        data set is loaded instead.
    type : str, optional
        Name of a stored data set (e.g. ``'train'``). When given, the
        predictors are loaded from the first path in
        ``CONFIG_DATA[f'{type}_path']`` and the transformed data is pickled to
        ``CONFIG_DATA[f'X_{type}_woe_path']``.
    CONFIG_DATA : dict, optional
        Loaded configuration. When omitted it is loaded with
        ``utils.load_config()``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input in which every value is replaced by its WOE.

    Raises
    ------
    ValueError
        If neither `raw_data` nor `type` is provided.
    KeyError
        If a column has no entry in the loaded WOE mapping, or a required
        configuration key is absent.
    """
    if CONFIG_DATA is None:
        CONFIG_DATA = utils.load_config()

    # Load the numerical columns
    num_cols = CONFIG_DATA['num_columns']

    # Load the WOE_map_dict
    WOE_map_dict = utils.load_pickle(CONFIG_DATA['WOE_map_dict_path'])

    # In case type is not None, load the stored data.
    if type is not None:
        raw_data = utils.load_pickle(CONFIG_DATA[f'{type}_path'][0])
    elif raw_data is None:
        raise ValueError("Provide either `raw_data` or `type`.")

    woe_data = raw_data.copy()
    for col in woe_data.columns:
        # Numerical columns are keyed by their binned name in the WOE mapping.
        map_col = col + '_bin' if col in num_cols else col

        # Map every value to its WOE; values that are missing or fall outside
        # every known bin/category come back as NaN and receive the
        # characteristic's 'Missing' WOE (which may itself be NaN).
        woe_data[col] = (woe_data[col]
                            .map(WOE_map_dict[map_col])
                            .fillna(value=WOE_map_dict['Missing'][map_col]))

    # Monthly_Balance values that are still unmapped are filled with 0.
    # This runs once, after the 'Missing' WOE has had its chance to apply,
    # and only when the column exists so other frames do not raise KeyError.
    if 'Monthly_Balance' in woe_data.columns:
        woe_data['Monthly_Balance'] = woe_data['Monthly_Balance'].fillna(value=0)

    # Validate
    print('Raw data shape : ', raw_data.shape)
    print('WOE data shape : ', woe_data.shape)

    # Dump data
    if type is not None:
        utils.dump_pickle(woe_data, CONFIG_DATA[f'X_{type}_woe_path'])

    return woe_data

In [307]:
# Transform the stored training predictors to WOE values; because `type` is
# given, the result is also pickled to CONFIG_DATA['X_train_woe_path'].
X_train_woe = transform_woe(type='train', CONFIG_DATA=CONFIG_DATA)

Raw data shape :  (5129, 22)
WOE data shape :  (5129, 22)


In [308]:
# Preview the first 10 rows of the WOE-transformed training set.
X_train_woe.head(10)

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
4025,0.594385,0.047131,0.674363,0.712941,-0.787591,-0.058489,1.212605,0.570645,-0.029259,-0.425082,...,1.121693,2.077306,0.900469,0.009969,1.156643,1.320621,-0.077376,0.322014,-0.00081,-0.556256
386,-0.157248,-0.047678,-0.701343,0.035592,0.982208,-0.058489,-2.824109,0.570645,-0.384238,-0.425082,...,-0.752544,-0.063333,-2.474733,0.009969,-0.307136,-0.842997,-0.038378,-0.470435,-0.062497,0.140316
5446,0.594385,-0.002208,-0.149729,-0.169678,0.982208,0.920094,0.621486,0.676677,0.535771,1.268629,...,1.121693,2.077306,0.900469,-0.041616,0.525575,1.320621,-0.038378,0.13085,-0.062497,-0.364951
5391,-0.294877,0.00938,-0.701343,-0.649268,-1.289674,-2.922224,-0.649659,-1.478077,-0.384238,-0.425082,...,-0.752544,-1.764292,-2.474733,-0.088458,-1.453686,-0.842997,-0.074114,-0.015764,-0.009698,-0.556256
587,-0.294877,0.027206,-0.149729,-0.169678,-0.787591,-2.922224,-0.649659,-1.327419,0.033956,-0.425082,...,-0.752544,-1.764292,-2.474733,-0.041616,-1.453686,-0.842997,-0.077376,-0.015764,0.164655,0.140316
856,0.594385,0.033319,0.674363,0.712941,0.982208,-0.058489,1.212605,0.676677,0.937176,1.268629,...,0.940445,2.077306,0.900469,-0.041616,1.156643,1.320621,0.186069,0.322014,-0.009698,0.712941
3665,0.594385,0.033319,0.096058,0.035592,-1.289674,0.920094,0.621486,0.676677,0.535771,1.268629,...,0.940445,-0.063333,0.97992,0.009969,1.156643,1.320621,0.186069,-0.015764,-0.281009,0.140316
2522,-0.294877,0.033319,0.674363,0.712941,0.982208,0.920094,1.212605,0.676677,0.033956,0.3544,...,1.121693,2.077306,0.97992,0.009969,0.525575,1.320621,-0.074114,0.13085,0.247298,0.712941
627,-0.120788,0.064208,-0.701343,-0.649268,-1.289674,-0.058489,-2.824109,0.676677,0.033956,-1.982853,...,-0.752544,-1.764292,-2.474733,-0.088458,-0.307136,-0.842997,0.186069,-0.015764,-0.281009,-0.364951
3743,-0.157248,-0.045684,-0.701343,-0.649268,-1.289674,-2.922224,-0.649659,-1.327419,-0.029259,-0.425082,...,-0.752544,-1.764292,-2.474733,0.118215,-1.453686,-0.842997,0.186069,-0.470435,-0.009698,-0.364951
