## **3. Initial Characteristic Analysis**
---

Our goals are:
- Bin every predictor 
- Find the weighted average of each bin (attribute) in the characteristic
- Acquire IV for every attribute

#### **3.1 Characteristic Binning**
---

In [322]:
# Import library
import pandas as pd
import numpy as np

# Load the data configuration
import src.utils as utils

- For the train set, we first concat the predictors (X) & target (y) data.
- Modify the configuration file to include those concatenated data paths.

In [323]:
CONFIG_DATA = utils.load_config()
CONFIG_DATA

{'raw_data_path': 'data/raw/credit_dataset.csv',
 'data_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'target_set_path': 'data/output/target.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'target_variable': 'Credit_Score',
 'test_size': 0.3,
 'num_columns': ['Age',
  'Annual_Income',
  'Num_of_Loan',
  'Num_of_Delayed_Payment',
  'Outstanding_Debt',
  'Monthly_Inhand_Salary',
  'Num_Credit_Inquiries',
  'Credit_Utilization_Ratio',
  'Total_EMI_per_month',
  'Num_Bank_Accounts',
  'Num_C

In [324]:
def data_concat(type):
    """Function to concat the input (X) & output (y) data"""
    X = utils.load_pickle(CONFIG_DATA[f'{type}_path'][0])
    y = utils.load_pickle(CONFIG_DATA[f'{type}_path'][1])
    
    # Concatenate data X and y
    data = pd.concat((X, y),
                     axis = 1)

    # Validation dataset
    print(f'Shape of data:', data.shape)

    # Dump concatenated data
    utils.dump_pickle(data, CONFIG_DATA[f'data_{type}_path'])
   
    return data

In [325]:
data_train = data_concat(type='train')
data_train.head()

Shape of data: (5129, 23)


Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
4025,54.0,Doctor,114597.04,9366.753333,7.0,6.0,4.0,4.0,Student Loan,28,6.0,10.54,0.0,Good,926.18,28.277461,33,No,225.923762,774.659073,Low_spent_Large_value_payments,206.092498,0
386,37.0,Engineer,18721.3,1684.108333,4.0,5.0,28.0,3.0,Mortgage Loan,21,11.0,7.26,5.0,Standard,2646.59,29.393327,16,Yes,46.593546,47.968644,High_spent_Small_value_payments,333.848643,0
5446,53.0,Manager,41675.64,3451.97,4.0,3.0,12.0,2.0,Personal Loan,6,10.0,4.9,2.0,Good,1181.64,32.50945,28,No,53.796807,239.400348,High_spent_Small_value_payments,311.999845,1
5391,17.0,Teacher,17291.27,1226.939167,10.0,9.0,20.0,8.0,Mortgage Loan,26,15.0,-0.14,8.0,Poor,4231.86,23.815207,15,Yes,105.151792,82.676235,Low_spent_Medium_value_payments,214.865889,0
587,16.0,Architect,63149.2,5121.433333,7.0,10.0,19.0,6.0,Auto Loan,20,16.0,27.02,7.0,Poor,3656.46,34.456617,8,Yes,233.602177,119.261482,!@9#%8,419.279674,0


- After that, we bin the combined data.
- Since numerical columns already have a binning function, categorical columns do not require one.
- Modify the configuration file so that:
    - The names of the numerical columns 
    - The names of the categorical columns 
    - The names of the missing columns
    - The quantity of bins
    - The binned train set's path

In [326]:
CONFIG_DATA = utils.load_config()
CONFIG_DATA

{'raw_data_path': 'data/raw/credit_dataset.csv',
 'data_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'target_set_path': 'data/output/target.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'target_variable': 'Credit_Score',
 'test_size': 0.3,
 'num_columns': ['Age',
  'Annual_Income',
  'Num_of_Loan',
  'Num_of_Delayed_Payment',
  'Outstanding_Debt',
  'Monthly_Inhand_Salary',
  'Num_Credit_Inquiries',
  'Credit_Utilization_Ratio',
  'Total_EMI_per_month',
  'Num_Bank_Accounts',
  'Num_C

In [327]:
# Make a function that allows the numerical predictor to be binned.
def define_num_binning(data, predictor_label, num_of_bins):
    """Create function to binning the numerical predictor"""
    # Make a new column with the binned predictor in it.
    data[predictor_label + "_bin"] = pd.qcut(data[predictor_label],
                                             q = num_of_bins)

    return data

In [328]:
def data_bin(type):
    """Create function to binning the numerical and missing data"""
    # Load the concatenated data
    data = utils.load_pickle(CONFIG_DATA[f'data_{type}_path'])

    # Bin the numerical columns
    num_columns = CONFIG_DATA['num_columns']
    num_of_bins = CONFIG_DATA['num_of_bins']

    for column in num_columns:
        data_binned = define_num_binning(data = data,
                                         predictor_label = column,
                                         num_of_bins = num_of_bins)

    # Bin missing values
    missing_columns = CONFIG_DATA['missing_columns']

    for column in missing_columns:
        # Incorporate the 'Missing' category to substitute the absent values.
        data_binned[column] = (data_binned[column]
                                    .cat
                                    .add_categories('Missing'))

        # Replace missing values with category 'Missing'
        data_binned[column].fillna(value = 'Missing',
                                   inplace = True)

    # Validate
    print(f"The original data shape : ", data.shape)
    print(f"The binned data shape  : ", data_binned.shape)

    # Dump binned data
    utils.dump_pickle(data_binned, CONFIG_DATA[f'data_{type}_binned_path'])
        
    return data_binned

In [329]:
train_binned = data_bin(type='train')
train_binned.head()

The original data shape :  (5129, 40)
The binned data shape  :  (5129, 40)


Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score,Age_bin,Annual_Income_bin,Num_of_Loan_bin,Num_of_Delayed_Payment_bin,Outstanding_Debt_bin,Monthly_Inhand_Salary_bin,Num_Credit_Inquiries_bin,Credit_Utilization_Ratio_bin,Total_EMI_per_month_bin,Num_Bank_Accounts_bin,Num_Credit_Card_bin,Interest_Rate_bin,Delay_from_due_date_bin,Amount_invested_monthly_bin,Monthly_Balance_bin,Changed_Credit_Limit_bin,Credit_History_Age_bin
4025,54.0,Doctor,114597.04,9366.753333,7.0,6.0,4.0,4.0,Student Loan,28,6.0,10.54,0.0,Good,926.18,28.277461,33,No,225.923762,774.659073,Low_spent_Large_value_payments,206.092498,0,"(41.0, 55.0]","(70112.78, 23658189.0]","(2.0, 4.0]","(-3.001, 9.0]","(603.87, 1237.12]","(5796.608, 15167.18]","(-0.001, 2.0]","(28.142, 32.225]","(161.488, 82122.0]","(6.0, 7.0]","(4.0, 6.0]","(0.999, 8.0]","(19.0, 29.0]","(247.885, 10000.0]","(0.907, 267.848]","(9.74, 15.44]","(28.0, 41.0]"
386,37.0,Engineer,18721.3,1684.108333,4.0,5.0,28.0,3.0,Mortgage Loan,21,11.0,7.26,5.0,Standard,2646.59,29.393327,16,Yes,46.593546,47.968644,High_spent_Small_value_payments,333.848643,0,"(33.0, 41.0]","(7005.929, 18921.78]","(2.0, 4.0]","(9.0, 14.0]","(2094.49, 4998.07]","(1587.629, 3012.665]","(4.0, 8.0]","(28.142, 32.225]","(39.269, 75.585]","(-0.001, 4.0]","(4.0, 6.0]","(21.0, 34.0]","(19.0, 29.0]","(-0.001, 71.415]","(330.162, 454.617]","(5.78, 9.74]","(15.0, 21.0]"
5446,53.0,Manager,41675.64,3451.97,4.0,3.0,12.0,2.0,Personal Loan,6,10.0,4.9,2.0,Good,1181.64,32.50945,28,No,53.796807,239.400348,High_spent_Small_value_payments,311.999845,1,"(41.0, 55.0]","(36346.13, 70112.78]","(0.999, 2.0]","(9.0, 14.0]","(603.87, 1237.12]","(3012.665, 5796.608]","(-0.001, 2.0]","(32.225, 36.298]","(39.269, 75.585]","(-0.001, 4.0]","(0.999, 4.0]","(8.0, 15.0]","(-4.001, 10.0]","(129.42, 247.885]","(267.848, 330.162]","(-6.391, 5.78]","(21.0, 28.0]"
5391,17.0,Teacher,17291.27,1226.939167,10.0,9.0,20.0,8.0,Mortgage Loan,26,15.0,-0.14,8.0,Poor,4231.86,23.815207,15,Yes,105.151792,82.676235,Low_spent_Medium_value_payments,214.865889,0,"(13.999, 24.0]","(7005.929, 18921.78]","(6.0, 10.0]","(14.0, 18.0]","(2094.49, 4998.07]","(319.555, 1587.629]","(4.0, 8.0]","(21.705, 28.142]","(75.585, 161.488]","(7.0, 10.0]","(7.0, 10.0]","(15.0, 21.0]","(19.0, 29.0]","(71.415, 129.42]","(0.907, 267.848]","(-6.391, 5.78]","(-0.001, 15.0]"
587,16.0,Architect,63149.2,5121.433333,7.0,10.0,19.0,6.0,Auto Loan,20,16.0,27.02,7.0,Poor,3656.46,34.456617,8,Yes,233.602177,119.261482,!@9#%8,419.279674,0,"(13.999, 24.0]","(36346.13, 70112.78]","(4.0, 6.0]","(14.0, 18.0]","(2094.49, 4998.07]","(3012.665, 5796.608]","(4.0, 8.0]","(32.225, 36.298]","(161.488, 82122.0]","(6.0, 7.0]","(7.0, 10.0]","(15.0, 21.0]","(19.0, 29.0]","(71.415, 129.42]","(330.162, 454.617]","(15.44, 34.81]","(-0.001, 15.0]"


In [330]:
train_binned.isnull().sum()

Age                             108
Occupation                        0
Annual_Income                     0
Monthly_Inhand_Salary             0
Num_Bank_Accounts                76
Num_Credit_Card                 115
Interest_Rate                   111
Num_of_Loan                      30
Type_of_Loan                      0
Delay_from_due_date               0
Num_of_Delayed_Payment           33
Changed_Credit_Limit              0
Num_Credit_Inquiries             69
Credit_Mix                        0
Outstanding_Debt                  0
Credit_Utilization_Ratio          0
Credit_History_Age                0
Payment_of_Min_Amount             0
Total_EMI_per_month               0
Amount_invested_monthly           0
Payment_Behaviour                 0
Monthly_Balance                   0
Credit_Score                      0
Age_bin                           0
Annual_Income_bin                 0
Num_of_Loan_bin                   0
Num_of_Delayed_Payment_bin        0
Outstanding_Debt_bin        

### **3.2 WoE and IV**
---  

- To evaluate each trait's strength separately as a predictor of credit performance.
    - Modify the configuration file so that
    - Path for a crosstab list
    - Table path for WOE - Table path for IV

In [331]:
CONFIG_DATA = utils.load_config()
CONFIG_DATA

{'raw_data_path': 'data/raw/credit_dataset.csv',
 'data_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'target_set_path': 'data/output/target.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'target_variable': 'Credit_Score',
 'test_size': 0.3,
 'num_columns': ['Age',
  'Annual_Income',
  'Num_of_Loan',
  'Num_of_Delayed_Payment',
  'Outstanding_Debt',
  'Monthly_Inhand_Salary',
  'Num_Credit_Inquiries',
  'Credit_Utilization_Ratio',
  'Total_EMI_per_month',
  'Num_Bank_Accounts',
  'Num_C

In [332]:
def define_crosstab_list():
    """Create the crosstab list (contingency table) needed for the computation of WOE and IV. In training data only"""
    # load the binned train data
    data_train_binned = utils.load_pickle(CONFIG_DATA['data_train_binned_path'])

    # the target variable is loaded (The target variable will be used to summarize.)
    target_variable = CONFIG_DATA['target_variable']

    # Repeat over the numerical columns.
    crosstab_num = []
    num_columns = CONFIG_DATA['num_columns']
    for column in num_columns:
        # Establish a contigency table.
        crosstab = pd.crosstab(data_train_binned[column + "_bin"],
                               data_train_binned[target_variable],
                               margins = True)

        # Append to the list
        crosstab_num.append(crosstab)

    # Repeat with the category columns.
    crosstab_cat = []
    cat_columns = CONFIG_DATA['cat_columns']
    for column in cat_columns:
        # Establish a contigency table.
        crosstab = pd.crosstab(data_train_binned[column],
                               data_train_binned[target_variable],
                               margins = True)

        # Append to the list
        crosstab_cat.append(crosstab)

    # Put all two in a crosstab_list
    crosstab_list = crosstab_num + crosstab_cat

    # Validate the crosstab_list
    print('Count of num bin : ', [bin.shape for bin in crosstab_num])
    print('Count of cat bin : ', [bin.shape for bin in crosstab_cat])

    # Dump the result
    utils.dump_pickle(crosstab_list, CONFIG_DATA['crosstab_list_path'])

    return crosstab_list

In [333]:
crosstab_list = define_crosstab_list()
crosstab_list[0]

Count of num bin :  [(6, 3), (5, 3), (6, 3), (6, 3), (5, 3), (5, 3), (6, 3), (5, 3), (5, 3), (6, 3), (6, 3), (6, 3), (5, 3), (5, 3), (5, 3), (5, 3), (5, 3)]
Count of cat bin :  [(17, 3), (9, 3), (4, 3), (4, 3), (8, 3)]


Credit_Score,0,1,All
Age_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(13.999, 24.0]",874,460,1334
"(24.0, 33.0]",803,503,1306
"(33.0, 41.0]",755,456,1211
"(41.0, 55.0]",513,657,1170
Missing,60,48,108
All,3005,2124,5129


In [334]:
crosstab_list[10]

Credit_Score,0,1,All
Num_Credit_Card_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(0.999, 4.0]",557,988,1545
"(4.0, 6.0]",1059,706,1765
"(6.0, 7.0]",507,351,858
"(7.0, 10.0]",815,31,846
Missing,67,48,115
All,3005,2124,5129


In [337]:
def WOE_and_IV():
    """Obtain the IV and WoE."""
    # Load the crosstab list
    crosstab_list = utils.load_pickle(CONFIG_DATA['crosstab_list_path'])

    # Establish initial IV and WoE storage
    WOE_list, IV_list = [], []
    
    # Execute the computation for every crosstab list.
    for crosstab in crosstab_list:
        # Determine the WoE and IV.
        crosstab['p_good'] = crosstab[1]/crosstab[1]['All']                                 # Calculate % Good
        crosstab['p_bad'] = crosstab[0]/crosstab[0]['All']                                  # Calculate % Bad
        crosstab['WOE'] = np.log(crosstab['p_good']/crosstab['p_bad'])                      # Calculate the WOE
        crosstab['contribution'] = (crosstab['p_good']-crosstab['p_bad'])*crosstab['WOE']   # Calculate the contribution value for IV
        IV = crosstab['contribution'][:-1].sum()                                            # Calculate the IV
        
        # Append to list
        WOE_list.append(crosstab)

        add_IV = {'Characteristic': crosstab.index.name, 
                  'Information Value': IV}
        IV_list.append(add_IV)


    # CREATE WOE TABLE
    # Make the first table outlining the WOE values.
    WOE_table = pd.DataFrame({'Characteristic': [],
                              'Attribute': [],
                              'WOE': []})
    for i in range(len(crosstab_list)):
        # Crosstab definition and index reset
        crosstab = crosstab_list[i].reset_index()

        # Keep the trait name saved.
        char_name = crosstab.columns[0]

        # Utilize just the attribute name and its WOE value in two columns.
        # Remove the final row (average/total WOE).
        crosstab = crosstab.iloc[:-1, [0,-2]]
        crosstab.columns = ['Attribute', 'WOE']

        # Put the name of the characteristic in a column.
        crosstab['Characteristic'] = char_name

        WOE_table = pd.concat((WOE_table, crosstab), 
                                axis = 0)

        # Reposition the column.
        WOE_table.columns = ['Characteristic',
                            'Attribute',
                            'WOE']
    

    # CREATE IV TABLE
    # Make the IV initial table.
    IV_table = pd.DataFrame({'Characteristic': [],
                             'Information Value' : []})
    IV_table = pd.DataFrame(IV_list)

    # Describe each characteristic's capacity for prediction.
    strength = []

    # Assign the rule of thumb regarding IV
    for iv in IV_table['Information Value']:
        if iv < 0.02:
            strength.append('Unpredictive')
        elif iv >= 0.02 and iv < 0.1:
            strength.append('Weak')
        elif iv >= 0.1 and iv < 0.3:
            strength.append('Medium')
        else:
            strength.append('Strong')

    # Assign each characteristic a strength.
    IV_table = IV_table.assign(Strength = strength)

    # Table sorted according to IV values
    IV_table = IV_table.sort_values(by='Information Value')
    
    # Validation
    print('WOE table shape : ', WOE_table.shape)
    print('IV table shape  : ', IV_table.shape)

    # Dump data
    utils.dump_pickle(WOE_table, CONFIG_DATA['WOE_table_path'])
    utils.dump_pickle(IV_table, CONFIG_DATA['IV_table_path']) 

    return WOE_table, IV_table

In [338]:
WOE_table, IV_table = WOE_and_IV()

WOE table shape :  (112, 3)
IV table shape  :  (22, 3)


In [None]:
WOE_table

Unnamed: 0,Characteristic,Attribute,WOE
0,Age_bin,"(13.999, 24.0]",-0.291272
1,Age_bin,"(24.0, 33.0]",-0.160496
2,Age_bin,"(33.0, 41.0]",-0.189201
3,Age_bin,"(41.0, 57.0]",0.658328
4,Age_bin,Missing,0.084612
0,Annual_Income_bin,"(7005.929, 18976.46]",-0.758924
1,Annual_Income_bin,"(18976.46, 35991.91]",0.08337
2,Annual_Income_bin,"(35991.91, 70256.94]",-0.074114
3,Annual_Income_bin,"(70256.94, 23658189.0]",0.658361
0,Num_of_Loan_bin,"(0.999, 2.0]",0.656399


In [339]:
IV_table

Unnamed: 0,Characteristic,Information Value,Strength
7,Credit_Utilization_Ratio_bin,0.005922,Unpredictive
17,Occupation,0.007241,Unpredictive
8,Total_EMI_per_month_bin,0.011992,Unpredictive
21,Payment_Behaviour,0.033472,Weak
13,Amount_invested_monthly_bin,0.083092,Weak
0,Age_bin,0.114035,Medium
18,Type_of_Loan,0.116137,Medium
5,Monthly_Inhand_Salary_bin,0.233616,Medium
1,Annual_Income_bin,0.235637,Medium
14,Monthly_Balance_bin,0.238552,Medium
