# **3. Initial Characteristic Analysis**
---

Our objectives:
- Bin each predictor
- Obtain WOE for each bin (attribute) in characteristic
- Obtain IV for each characteristic

### **3.1 Characteristic Binning**
---

In [68]:
# Import library
import pandas as pd
import numpy as np

# Load configuration
import src.utils as utils

- We concat the predictors (X) & response (y) data for train set first.
- Update the config file to have those concated data path.

In [69]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/credit_risk_dataset.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.3,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_cred_hist_length'],
 'cat_columns': ['person_home_ownership',
  'loan_intent',
  'loan_g

In [70]:
def concat_data(type):
    """Concat the input (X) & output (y) data"""
    X = utils.pickle_load(CONFIG_DATA[f'{type}_path'][0])
    y = utils.pickle_load(CONFIG_DATA[f'{type}_path'][1])
    
    # Concatenate X and y
    data = pd.concat((X, y),
                     axis = 1)

    # Validate data
    print(f'Data shape:', data.shape)

    # Dump concatenated data
    utils.pickle_dump(data, CONFIG_DATA[f'data_{type}_path'])
   
    return data

In [71]:
# Check the function for train data
data_train = concat_data(type='train')
data_train.head()

Data shape: (22806, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
11491,26,62000,RENT,1.0,DEBTCONSOLIDATION,B,10000,11.26,0.16,N,2,0
3890,23,39000,MORTGAGE,3.0,EDUCATION,C,5000,12.98,0.13,N,4,0
17344,24,35000,RENT,1.0,DEBTCONSOLIDATION,A,12000,6.54,0.34,N,2,1
13023,24,86000,RENT,1.0,HOMEIMPROVEMENT,B,12000,10.65,0.14,N,3,0
29565,42,38400,RENT,4.0,MEDICAL,B,13000,,0.34,N,11,1


- Then we bin the concatenated data.
- Categorical columns are already binned, thus we only create binning function for numerical columns.
- Update the config file to have:
    - The numerical column names
    - The categorical column names
    - The missing column names
    - The number of bins
    - The path for binned train set

In [72]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/credit_risk_dataset.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.3,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_cred_hist_length'],
 'cat_columns': ['person_home_ownership',
  'loan_intent',
  'loan_g

In [73]:
# Create a function for binning the numerical predictor
def create_num_binning(data, predictor_label, num_of_bins):
    """Bin the numerical predictor"""
    # Create a new column containing the binned predictor
    data[predictor_label + "_bin"] = pd.qcut(data[predictor_label],
                                             q = num_of_bins)

    return data

In [74]:
def bin_data(type):
    """Bin the numerical and missing data"""
    # Load the concatenated data
    data = utils.pickle_load(CONFIG_DATA[f'data_{type}_path'])

    # Bin the numerical columns
    num_columns = CONFIG_DATA['num_columns']
    num_of_bins = CONFIG_DATA['num_of_bins']

    for column in num_columns:
        data_binned = create_num_binning(data = data,
                                         predictor_label = column,
                                         num_of_bins = num_of_bins)

    # Bin missing values
    missing_columns = CONFIG_DATA['missing_columns']

    for column in missing_columns:
        # Add category 'Missing' to replace the missing values
        data_binned[column] = (data_binned[column]
                                    .cat
                                    .add_categories('Missing'))

        # Replace missing values with category 'Missing'
        data_binned[column].fillna(value = 'Missing',
                                   inplace = True)

    # Validate
    print(f"Original data shape : ", data.shape)
    print(f"Binned data shape  : ", data_binned.shape)

    # Dump binned data
    utils.pickle_dump(data_binned, CONFIG_DATA[f'data_{type}_binned_path'])
        
    return data_binned

In [75]:
type(binned_train['person_emp_length_bin'].unique())

pandas.core.arrays.categorical.Categorical

In [76]:
# Check the function
binned_train = bin_data(type='train')
binned_train.head()

Original data shape :  (22806, 19)
Binned data shape  :  (22806, 19)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,person_age_bin,person_income_bin,person_emp_length_bin,loan_amnt_bin,loan_int_rate_bin,loan_percent_income_bin,cb_person_cred_hist_length_bin
11491,26,62000,RENT,1.0,DEBTCONSOLIDATION,B,10000,11.26,0.16,N,2,0,"(23.0, 26.0]","(55000.0, 79635.0]","(-0.001, 2.0]","(8000.0, 12250.0]","(10.99, 13.47]","(0.15, 0.23]","(1.999, 3.0]"
3890,23,39000,MORTGAGE,3.0,EDUCATION,C,5000,12.98,0.13,N,4,0,"(19.999, 23.0]","(38524.75, 55000.0]","(2.0, 4.0]","(499.999, 5000.0]","(10.99, 13.47]","(0.09, 0.15]","(3.0, 4.0]"
17344,24,35000,RENT,1.0,DEBTCONSOLIDATION,A,12000,6.54,0.34,N,2,1,"(23.0, 26.0]","(3999.999, 38524.75]","(-0.001, 2.0]","(8000.0, 12250.0]","(5.419, 7.9]","(0.23, 0.83]","(1.999, 3.0]"
13023,24,86000,RENT,1.0,HOMEIMPROVEMENT,B,12000,10.65,0.14,N,3,0,"(23.0, 26.0]","(79635.0, 6000000.0]","(-0.001, 2.0]","(8000.0, 12250.0]","(7.9, 10.99]","(0.09, 0.15]","(1.999, 3.0]"
29565,42,38400,RENT,4.0,MEDICAL,B,13000,,0.34,N,11,1,"(30.0, 144.0]","(3999.999, 38524.75]","(2.0, 4.0]","(12250.0, 35000.0]",Missing,"(0.23, 0.83]","(8.0, 30.0]"


### **3.2 WoE and IV**
---  

- To assess the strength of each characteristic individually as a predictor of the credit performance.
- Update the config file to have
    - crosstab list path
    - WOE table path
    - IV table path

In [77]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/credit_risk_dataset.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.3,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_cred_hist_length'],
 'cat_columns': ['person_home_ownership',
  'loan_intent',
  'loan_g

In [78]:
def create_crosstab_list():
    """Generate the crosstab list (contingency table) for WOE and IV calculation. Only in training data"""
    # load the binned train data
    data_train_binned = utils.pickle_load(CONFIG_DATA['data_train_binned_path'])

    # load the response variable (we will summarize based on the response variable)
    response_variable = CONFIG_DATA['response_variable']

    # iterate over numercial columns
    crosstab_num = []
    num_columns = CONFIG_DATA['num_columns']
    for column in num_columns:
        # Create a contingency table
        crosstab = pd.crosstab(data_train_binned[column + "_bin"],
                               data_train_binned[response_variable],
                               margins = True)

        # Append to the list
        crosstab_num.append(crosstab)

    # iterate over categorical columns
    crosstab_cat = []
    cat_columns = CONFIG_DATA['cat_columns']
    for column in cat_columns:
        # Create a contingency table
        crosstab = pd.crosstab(data_train_binned[column],
                               data_train_binned[response_variable],
                               margins = True)

        # Append to the list
        crosstab_cat.append(crosstab)

    # Put all two in a crosstab_list
    crosstab_list = crosstab_num + crosstab_cat

    # Validate the crosstab_list
    print('number of num bin : ', [bin.shape for bin in crosstab_num])
    print('number of cat bin : ', [bin.shape for bin in crosstab_cat])

    # Dump the result
    utils.pickle_dump(crosstab_list, CONFIG_DATA['crosstab_list_path'])

    return crosstab_list


In [79]:
# Check the function
crosstab_list = create_crosstab_list()
crosstab_list[0]

number of num bin :  [(5, 3), (5, 3), (6, 3), (5, 3), (6, 3), (5, 3), (5, 3)]
number of cat bin :  [(5, 3), (7, 3), (8, 3), (3, 3)]


loan_status,0,1,All
person_age_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(19.999, 23.0]",4671,1469,6140
"(23.0, 26.0]",5007,1361,6368
"(26.0, 30.0]",3856,1026,4882
"(30.0, 144.0]",4297,1119,5416
All,17831,4975,22806


In [80]:
crosstab_list[10]

loan_status,0,1,All
cb_person_default_on_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N,15302,3456,18758
Y,2529,1519,4048
All,17831,4975,22806


In [81]:
def WOE_and_IV():
    """Get the WoE and IV"""
    # Load the crosstab list
    crosstab_list = utils.pickle_load(CONFIG_DATA['crosstab_list_path'])

    # Create initial storage for WoE and IV
    WOE_list, IV_list = [], []
    
    # Perform the calculation for all crosstab list
    for crosstab in crosstab_list:
        # Calcualte the WoE and IV
        crosstab['p_good'] = crosstab[0]/crosstab[0]['All']                                 # Calculate % Good
        crosstab['p_bad'] = crosstab[1]/crosstab[1]['All']                                  # Calculate % Bad
        crosstab['WOE'] = np.log(crosstab['p_good']/crosstab['p_bad'])                      # Calculate the WOE
        crosstab['contribution'] = (crosstab['p_good']-crosstab['p_bad'])*crosstab['WOE']   # Calculate the contribution value for IV
        IV = crosstab['contribution'][:-1].sum()                                            # Calculate the IV
        
        # Append to list
        WOE_list.append(crosstab)

        add_IV = {'Characteristic': crosstab.index.name, 
                  'Information Value': IV}
        IV_list.append(add_IV)


    # CREATE WOE TABLE
    # Create initial table to summarize the WOE values
    WOE_table = pd.DataFrame({'Characteristic': [],
                              'Attribute': [],
                              'WOE': []})
    for i in range(len(crosstab_list)):
        # Define crosstab and reset index
        crosstab = crosstab_list[i].reset_index()

        # Save the characteristic name
        char_name = crosstab.columns[0]

        # Only use two columns (Attribute name and its WOE value)
        # Drop the last row (average/total WOE)
        crosstab = crosstab.iloc[:-1, [0,-2]]
        crosstab.columns = ['Attribute', 'WOE']

        # Add the characteristic name in a column
        crosstab['Characteristic'] = char_name

        WOE_table = pd.concat((WOE_table, crosstab), 
                                axis = 0)

        # Reorder the column
        WOE_table.columns = ['Characteristic',
                            'Attribute',
                            'WOE']
    

    # CREATE IV TABLE
    # Create the initial table for IV
    IV_table = pd.DataFrame({'Characteristic': [],
                             'Information Value' : []})
    IV_table = pd.DataFrame(IV_list)

    # Define the predictive power of each characteristic
    strength = []

    # Assign the rule of thumb regarding IV
    for iv in IV_table['Information Value']:
        if iv < 0.02:
            strength.append('Unpredictive')
        elif iv >= 0.02 and iv < 0.1:
            strength.append('Weak')
        elif iv >= 0.1 and iv < 0.3:
            strength.append('Medium')
        else:
            strength.append('Strong')

    # Assign the strength to each characteristic
    IV_table = IV_table.assign(Strength = strength)

    # Sort the table by the IV values
    IV_table = IV_table.sort_values(by='Information Value')
    
    # Validate
    print('WOE table shape : ', WOE_table.shape)
    print('IV table shape  : ', IV_table.shape)

    # Dump data
    utils.pickle_dump(WOE_table, CONFIG_DATA['WOE_table_path'])
    utils.pickle_dump(IV_table, CONFIG_DATA['IV_table_path']) 

    return WOE_table, IV_table

In [82]:
# Check the function
WOE_table, IV_table = WOE_and_IV()

WOE table shape :  (49, 3)
IV table shape  :  (11, 3)


In [83]:
crosstab_list[0]

loan_status,0,1,All
person_age_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(19.999, 23.0]",4671,1469,6140
"(23.0, 26.0]",5007,1361,6368
"(26.0, 30.0]",3856,1026,4882
"(30.0, 144.0]",4297,1119,5416
All,17831,4975,22806


In [84]:
WOE_table.head(10)

Unnamed: 0,Characteristic,Attribute,WOE
0,person_age_bin,"(19.999, 23.0]",-0.119722
1,person_age_bin,"(23.0, 26.0]",0.026104
2,person_age_bin,"(26.0, 30.0]",0.047449
3,person_age_bin,"(30.0, 144.0]",0.068969
0,person_income_bin,"(3999.999, 38524.75]",-0.862418
1,person_income_bin,"(38524.75, 55000.0]",0.024109
2,person_income_bin,"(55000.0, 79635.0]",0.31118
3,person_income_bin,"(79635.0, 6000000.0]",1.026844
0,person_emp_length_bin,"(-0.001, 2.0]",-0.28411
1,person_emp_length_bin,"(2.0, 4.0]",0.065751


In [85]:
pd.set_option('display.max_rows', None)

In [86]:
WOE_table['Attribute'].dtypes

dtype('O')

In [87]:
IV_table

Unnamed: 0,Characteristic,Information Value,Strength
6,cb_person_cred_hist_length_bin,0.003737,Unpredictive
0,person_age_bin,0.005761,Unpredictive
2,person_emp_length_bin,0.064186,Weak
3,loan_amnt_bin,0.076986,Weak
8,loan_intent,0.088282,Weak
10,cb_person_default_on_file,0.159914,Medium
7,person_home_ownership,0.386431,Strong
1,person_income_bin,0.441132,Strong
4,loan_int_rate_bin,0.507604,Strong
5,loan_percent_income_bin,0.60665,Strong
