# **Initial Characteristic Analysis**
---

### **Characteristic Binning**

Create a function for binning the numerical predictors.

In [1]:
#import library
import pandas as pd
import numpy as np
import sys
sys.path.append("../src")
#load configuration
import utils

In [2]:
config_data = utils.config_load()
config_data

{'raw_dataset_path': '../dataset/raw/credit_risk.csv',
 'dataset_path': '../dataset/output/data.pkl',
 'predictors_set_path': '../dataset/output/predictors.pkl',
 'response_set_path': '../dataset/output/response.pkl',
 'train_path': ['../dataset/output/X_train.pkl',
  '../dataset/output/y_train.pkl'],
 'test_path': ['../dataset/output/X_test.pkl', '../dataset/output/y_test.pkl'],
 'data_train_path': '../dataset/output/training_data.pkl',
 'data_train_binned_path': '../dataset/output/bin_training_data.pkl',
 'crosstab_list_path': '../dataset/output/list_crosstab.pkl',
 'WOE_table_path': '../dataset/output/WOE_table.pkl',
 'IV_table_path': '../dataset/output/IV_table.pkl',
 'WOE_map_dict_path': '../dataset/output/WOE_map_dict.pkl',
 'X_train_woe_path': '../dataset/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.2,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_c

In [3]:
#load the training data from a pickled file using the configuration data
data_train = utils.pickle_load(config_data['data_train_path'])

In [4]:
#display information about the training data
data_train.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
15884,25,241875,MORTGAGE,4.0,EDUCATION,16000,7.05,0.07,N,4,0
15138,21,18000,RENT,5.0,PERSONAL,1500,12.18,0.08,N,4,1
7474,25,53000,MORTGAGE,10.0,MEDICAL,16000,12.53,0.3,N,2,0
18212,28,16800,OWN,,MEDICAL,5000,13.98,0.3,N,8,1
6493,25,50000,MORTGAGE,2.0,VENTURE,10000,7.9,0.2,N,2,0


Create a function for binning the numerical predictors 

In [5]:
def create_binning(data, predictor_label, num_of_bins):
    """
    Bin a numerical predictor into the specified number of bins

    Parameters
    ----------
    data : DataFrame
        The DataFrame containing the data
    predictor_label : Str
        The label of the numerical predictor column
    num_of_bins : Int
        The number of bins to create
    
    Returns
    -------
    pd.DataFrame : The DataFrame with a new column containing the binned predictor values
    """
    # create a new column containing the binned predictor
    data[predictor_label + "_bin"] = pd.qcut(data[predictor_label],
                                             q = num_of_bins,
                                             duplicates='drop')
    
    return data


In [6]:
def bin_data(type):
    """Bin the numerical and missing data"""
    # Load the concatenated data
    data = utils.pickle_load(config_data[f'data_{type}_path'])

    # Bin the numerical columns
    num_columns = config_data['num_columns']
    num_of_bins = config_data['num_of_bins']

    for column in num_columns:
        data_binned = create_binning(data = data,
                                         predictor_label = column,
                                         num_of_bins = num_of_bins)

    # Bin missing values
    missing_columns = config_data['missing_columns']

    for column in missing_columns:
        # Add category 'Missing' to replace the missing values
        data_binned[column] = (data_binned[column]
                                    .cat
                                    .add_categories('Missing'))

        # Replace missing values with category 'Missing'
        data_binned[column].fillna(value = 'Missing',
                                   inplace = True)

    # Validate
    print(f"Original data shape : ", data.shape)
    print(f"Binned data shape  : ", data_binned.shape)

    # Dump binned data
    utils.pickle_dump(data_binned, config_data[f'data_{type}_binned_path'])
        
    return data_binned

In [7]:
# Check the function
binned_train = bin_data(type='train')
binned_train.T

Original data shape :  (26064, 18)
Binned data shape  :  (26064, 18)


Unnamed: 0,15884,15138,7474,18212,6493,22938,13030,20835,5115,1971,...,24380,30751,31368,1376,22095,14621,18736,1663,18257,17068
person_age,25,21,25,28,25,31,23,29,23,22,...,27,43,37,22,32,25,30,22,29,23
person_income,241875,18000,53000,16800,50000,60000,87000,42000,57500,24000,...,72000,54000,44196,35100,38000,98000,65000,20000,110000,30000
person_home_ownership,MORTGAGE,RENT,MORTGAGE,OWN,MORTGAGE,MORTGAGE,MORTGAGE,RENT,RENT,RENT,...,MORTGAGE,RENT,RENT,RENT,RENT,MORTGAGE,RENT,RENT,RENT,RENT
person_emp_length,4.0,5.0,10.0,,2.0,2.0,0.0,0.0,5.0,0.0,...,6.0,0.0,0.0,0.0,3.0,9.0,0.0,7.0,4.0,8.0
loan_intent,EDUCATION,PERSONAL,MEDICAL,MEDICAL,VENTURE,HOMEIMPROVEMENT,EDUCATION,HOMEIMPROVEMENT,EDUCATION,PERSONAL,...,EDUCATION,EDUCATION,MEDICAL,DEBTCONSOLIDATION,VENTURE,HOMEIMPROVEMENT,MEDICAL,EDUCATION,HOMEIMPROVEMENT,EDUCATION
loan_amnt,16000,1500,16000,5000,10000,9000,12000,5000,9000,2100,...,6000,8725,12400,1000,6350,25000,16000,1675,24000,10625
loan_int_rate,7.05,12.18,12.53,13.98,7.9,12.69,12.73,14.96,6.91,12.21,...,5.42,7.29,11.26,13.98,5.79,16.45,13.06,7.74,18.39,6.92
loan_percent_income,0.07,0.08,0.3,0.3,0.2,0.15,0.14,0.12,0.16,0.09,...,0.08,0.16,0.28,0.03,0.17,0.26,0.25,0.08,0.22,0.35
cb_person_default_on_file,N,N,N,N,N,N,Y,N,N,N,...,N,N,N,N,N,N,N,N,Y,N
cb_person_cred_hist_length,4,4,2,8,2,8,2,9,2,2,...,10,14,14,2,6,4,8,3,8,4


### **WoE and IV**

To assess the strength of each characteristic individually as a predictor of the credit performance. Create a contingency table/crosstab for all predictors: numerical and categorical predictors.

In [8]:
def create_crosstab_list():
    """Generate the crosstab list (contingency table) for WOE and IV calculation. Only in training data"""
    # load the binned train data
    data_train_binned = utils.pickle_load(config_data['data_train_binned_path'])

    # load the response variable (we will summarize based on the response variable)
    response_variable = config_data['response_variable']

    # iterate over numercial columns
    crosstab_num = []
    num_columns = config_data['num_columns']
    for column in num_columns:
        # Create a contingency table
        crosstab = pd.crosstab(data_train_binned[column + "_bin"],
                               data_train_binned[response_variable],
                               margins = True)

        # Append to the list
        crosstab_num.append(crosstab)

    # iterate over categorical columns
    crosstab_cat = []
    cat_columns = config_data['cat_columns']
    for column in cat_columns:
        # Create a contingency table
        crosstab = pd.crosstab(data_train_binned[column],
                               data_train_binned[response_variable],
                               margins = True)

        # Append to the list
        crosstab_cat.append(crosstab)

    # Put all two in a crosstab_list
    crosstab_list = crosstab_num + crosstab_cat

    # Validate the crosstab_list
    print('number of num bin : ', [bin.shape for bin in crosstab_num])
    print('number of cat bin : ', [bin.shape for bin in crosstab_cat])

    # Dump the result
    utils.pickle_dump(crosstab_list, config_data['crosstab_list_path'])

    return crosstab_list

In [9]:
# Check the function
crosstab_list = create_crosstab_list()
crosstab_list[0]

number of num bin :  [(6, 3), (6, 3), (7, 3), (6, 3), (7, 3), (6, 3), (6, 3)]
number of cat bin :  [(5, 3), (7, 3), (3, 3)]


loan_status,0,1,All
person_age_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(19.999, 23.0]",5351,1686,7037
"(23.0, 25.0]",4134,1146,5280
"(25.0, 27.0]",2914,774,3688
"(27.0, 32.0]",4425,1141,5566
"(32.0, 144.0]",3554,939,4493
All,20378,5686,26064


In [10]:
crosstab_list[9]

loan_status,0,1,All
cb_person_default_on_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N,17505,3948,21453
Y,2873,1738,4611
All,20378,5686,26064


In [11]:
def WOE_and_IV():
    """Get the WoE and IV"""
    # Load the crosstab list
    crosstab_list = utils.pickle_load(config_data['crosstab_list_path'])

    # Create initial storage for WoE and IV
    WOE_list, IV_list = [], []
    
    # Perform the calculation for all crosstab list
    for crosstab in crosstab_list:
        # Calcualte the WoE and IV
        crosstab['p_good'] = crosstab[0]/crosstab[0]['All']                                 # Calculate % Good
        crosstab['p_bad'] = crosstab[1]/crosstab[1]['All']                                  # Calculate % Bad
        crosstab['WOE'] = np.log(crosstab['p_good']/crosstab['p_bad'])                      # Calculate the WOE
        crosstab['contribution'] = (crosstab['p_good']-crosstab['p_bad'])*crosstab['WOE']   # Calculate the contribution value for IV
        IV = crosstab['contribution'][:-1].sum()                                            # Calculate the IV
        
        # Append to list
        WOE_list.append(crosstab)

        add_IV = {'Characteristic': crosstab.index.name, 
                  'Information Value': IV}
        IV_list.append(add_IV)


    # CREATE WOE TABLE
    # Create initial table to summarize the WOE values
    WOE_table = pd.DataFrame({'Characteristic': [],
                              'Attribute': [],
                              'WOE': []})
    for i in range(len(crosstab_list)):
        # Define crosstab and reset index
        crosstab = crosstab_list[i].reset_index()

        # Save the characteristic name
        char_name = crosstab.columns[0]

        # Only use two columns (Attribute name and its WOE value)
        # Drop the last row (average/total WOE)
        crosstab = crosstab.iloc[:-1, [0,-2]]
        crosstab.columns = ['Attribute', 'WOE']

        # Add the characteristic name in a column
        crosstab['Characteristic'] = char_name

        WOE_table = pd.concat((WOE_table, crosstab), 
                                axis = 0)

        # Reorder the column
        WOE_table.columns = ['Characteristic',
                            'Attribute',
                            'WOE']
    

    # CREATE IV TABLE
    # Create the initial table for IV
    IV_table = pd.DataFrame({'Characteristic': [],
                             'Information Value' : []})
    IV_table = pd.DataFrame(IV_list)

    # Define the predictive power of each characteristic
    strength = []

    # Assign the rule of thumb regarding IV
    for iv in IV_table['Information Value']:
        if iv < 0.02:
            strength.append('Unpredictive')
        elif iv >= 0.02 and iv < 0.1:
            strength.append('Weak')
        elif iv >= 0.1 and iv < 0.3:
            strength.append('Medium')
        else:
            strength.append('Strong')

    # Assign the strength to each characteristic
    IV_table = IV_table.assign(Strength = strength)

    # Sort the table by the IV values
    IV_table = IV_table.sort_values(by='Information Value')
    
    # Validate
    print('WOE table shape : ', WOE_table.shape)
    print('IV table shape  : ', IV_table.shape)

    # Dump data
    utils.pickle_dump(WOE_table, config_data['WOE_table_path'])
    utils.pickle_dump(IV_table, config_data['IV_table_path']) 

    return WOE_table, IV_table

In [12]:
# Check the function
WOE_table, IV_table = WOE_and_IV()

WOE table shape :  (49, 3)
IV table shape  :  (10, 3)


In [13]:
#display WOE table
WOE_table

Unnamed: 0,Characteristic,Attribute,WOE
0,person_age_bin,"(19.999, 23.0]",-0.121524
1,person_age_bin,"(23.0, 25.0]",0.006519
2,person_age_bin,"(25.0, 27.0]",0.049261
3,person_age_bin,"(27.0, 32.0]",0.078916
4,person_age_bin,"(32.0, 144.0]",0.054565
0,person_income_bin,"(3999.999, 35000.0]",-1.007237
1,person_income_bin,"(35000.0, 48996.0]",-0.052875
2,person_income_bin,"(48996.0, 63000.0]",0.175419
3,person_income_bin,"(63000.0, 86000.0]",0.492046
4,person_income_bin,"(86000.0, 6000000.0]",1.030836


In [14]:
#display IV table
IV_table

Unnamed: 0,Characteristic,Information Value,Strength
6,cb_person_cred_hist_length_bin,0.003927,Unpredictive
0,person_age_bin,0.006276,Unpredictive
2,person_emp_length_bin,0.059921,Weak
3,loan_amnt_bin,0.077772,Weak
8,loan_intent,0.088686,Weak
9,cb_person_default_on_file,0.16248,Medium
7,person_home_ownership,0.389711,Strong
1,person_income_bin,0.459059,Strong
4,loan_int_rate_bin,0.600325,Strong
5,loan_percent_income_bin,0.709054,Strong
