In [2]:
import pickle
import numpy as np
import pandas as pd
import woe.feature_process as fp
from sklearn.linear_model import LogisticRegression

# Read the development sample. MostRecentApp_No_Grp is forced to str so its
# group labels (e.g. '0', '1', '10 Plus') are kept as text.
df = pd.read_excel(
    "../data/dev.xlsx",
    converters={"MostRecentApp_No_Grp": str},
)
df.head()

Unnamed: 0,AccountSerno,RemainingLoanPct,Rem2Month_flag_MonthsSince,OverCycle_amt_Min6mth,MostRecentApp_No,UC_NoInqMain,UC_LandVal,target,UC_NoInqMain_Grp,UC_NoInqMain_Grp_WoE,...,MostRecentApp_No_Grp,MostRecentApp_No_Grp_WoE,UC_LandVal_Grp,UC_LandVal_Grp_WoE,RemainingLoanPct_Grp,RemainingLoanPct_Grp_WoE,Rem2Month_flag_MonthsSince_Grp,Rem2Month_flag_MonthsSince_WoE,Prob,PD_Final
0,4363933,0.116151,,0.0,,0,,0,0,-0.733046,...,0,0.618054,Missing,-0.179594,<= 64,0.904385,6 Plus,0.129384,0.028847,0.028847
1,4364072,0.55832,,0.0,3.0,19,,0,12 to 20,-0.33578,...,3,-0.469498,Missing,-0.179594,<= 64,0.904385,6 Plus,0.129384,0.049521,0.049521
2,4367063,0.531817,,-1067.0,3.0,0,,0,0,-0.733046,...,3,-0.469498,Missing,-0.179594,<= 64,0.904385,6 Plus,0.129384,0.123856,0.123856
3,4367309,0.831918,,0.0,1.0,2,3911000.0,0,1 to 2,1.648371,...,1,0.303434,1841000 Plus,1.236862,65 to 93,0.234548,6 Plus,0.129384,0.009036,0.009036
4,4367814,0.745868,,0.0,1.0,13,,0,12 to 20,-0.33578,...,1,0.303434,Missing,-0.179594,65 to 93,0.234548,6 Plus,0.129384,0.048499,0.048499


In [5]:
# Define the UC inquiry group. Note: this is an example of an inadequate implementation, which will be corrected later.
#
# In this scenario we create a grouped version of the UC number of inquiries, called UC_inq_group_demo,
# which will later be replaced by the original group variable UC_NoInqMain_Grp, which gives better
# risk separation.
def inq_main_to_group(array):
    """Bucket UC inquiry counts into the coarse demo group labels.

    Parameters
    ----------
    array : scalar or array-like supporting elementwise comparison
        Number of UC inquiries.

    Returns
    -------
    numpy.ndarray
        One label per element: '0 - missing' for 0, '1-9' for 1..9,
        '10-20' for 10..20 and '21 and +' above 20. Values that fall in no
        band (NaN, negatives, non-integers between bands) get 'null'.
    """
    bands = [
        ('0 - missing', array == 0),
        ('1-9', (array >= 1) & (array <= 9)),
        ('10-20', (array >= 10) & (array <= 20)),
        ('21 and +', array > 20),
    ]
    labels = [label for label, _ in bands]
    masks = [mask for _, mask in bands]
    return np.select(masks, labels, default='null')

# np.select already works elementwise on a whole array, so a single vectorised
# call replaces the per-row Series.apply and yields the same labels.
df['UC_inq_group_demo'] = inq_main_to_group(df['UC_NoInqMain'].to_numpy())

In [13]:
# Grouped input columns and, position by position, the name of the WoE column
# computed from each of them with the woe package (both lists must keep the
# same length and order).
var_list_to_woe = [
    'UC_inq_group_demo',
    'OverCycle_amt_Min6mth_Grp',
    'MostRecentApp_No_Grp',
    'UC_LandVal_Grp',
    'RemainingLoanPct_Grp',
    'Rem2Month_flag_MonthsSince_Grp',
]
var_list_woe = [
    'uc_woe',
    'pmt_woe',
    'cust_app_woe',
    'land_woe',
    'remain_pctg_woe',
    'rem2_past_woe',
]

def group_to_woe(df, group_var, global_bt, global_gt, min_sample=100, alpha=0.01):
    """
    Compute weight-of-evidence values for one grouped column via the woe package.

    Calls fp.binning_data_split to obtain the split points of `group_var`, then
    fp.format_iv_split to obtain the WoE value for each split, and pairs the two
    sequences up in order to form a lookup.

    Returns a tuple (mapped, woe_dict): `df[group_var]` mapped through the
    lookup (values absent from the lookup become NaN) and the lookup itself.
    """
    binning = fp.binning_data_split(df, group_var, global_bt, global_gt, min_sample, alpha)
    split_points = binning.split_point
    iv_summary = fp.format_iv_split(df, group_var, split_points, global_bt, global_gt)

    woe_dict = {point: woe for point, woe in zip(split_points, iv_summary.woe_list)}
    mapped = df[group_var].map(woe_dict)
    return mapped, woe_dict

    
# define the total bad and good volume
total_bad = np.sum(df.target == 1)
total_good = np.sum(df.target == 0)

# The two name lists are walked in parallel, so they must line up one-to-one.
assert len(var_list_to_woe) == len(var_list_woe), "group/WoE name lists differ in length"

# For every grouped variable: add its WoE column to df, show the lookup, and
# pickle the lookup under ../app/data.
for var, woe_name in zip(var_list_to_woe, var_list_woe):
    df[woe_name], woe_dict = group_to_woe(df, var, total_bad, total_good)
    print(woe_dict)
    # The context manager closes the file handle; the bare open() used
    # previously was never closed explicitly.
    with open('../app/data/dict_group2woe_' + woe_name + '.pickle', 'wb') as fh:
        pickle.dump(woe_dict, fh)

# check the woe calculation results
# for var in var_list_woe:
#     print(var)
#     print(df[var].unique())

{'0 - missing': 0.6961268728213668, '1-9': -0.6127859220020055, '10-20': 0.17368930607887662, '21 and +': 1.195972799034486}
{'-3816 to -993': 0.6744710631496393, '-993 to 0': 0.24140274050746496, '0': -0.4822625070131883, '<=-3816': 0.9704982607905294, '>0': -1.1568928959934086}
{'0': -0.6097767226308587, '1': -0.21019428068251236, '10 Plus': 1.7185817089837794, '2': 0.21681821274781463, '3': 0.4344220930634186, '4': 0.6179012922284162, '5': 0.7727896241684215, '6 to 9': 1.21483268516264}
{'0 to 441000': -0.33392519106425816, '1841000 Plus': -1.092201646087494, '441000 to 1841000': -0.8576286525741218, 'Missing': 0.17320345592851655}
{'65 to 93': -0.20863463252527298, '93 to 95': 0.005465474868573022, '95 to 96': 0.13694115189824885, '96 to 99': 0.23945013212357408, '<= 64': -0.9875748989745701, '>= 99': 0.5383245589807821}
{'0 to 2': 1.6552616908419517, '3 to 5': 0.8875655886133103, '6 Plus': -0.13418834281852918}


In [12]:
# store features: the list of feature column names is pickled under ../app/data
features = ['UC_NoInqMain', 'OverCycle_amt_Min6mth', 'MostRecentApp_No_Grp', 
     'UC_LandVal_Grp', 'RemainingLoanPct_Grp', 'Rem2Month_flag_MonthsSince_Grp']
# Use a context manager so the file handle is closed (and the data flushed)
# as soon as the dump finishes.
with open('../app/data/list_features.pickle', 'wb') as fh:
    pickle.dump(features, fh)

In [103]:
# Fit and save model: logistic regression on the WoE columns against `target`
X, y = df[var_list_woe], df.loc[:, 'target']
lr = LogisticRegression(solver='liblinear').fit(X, y)
# Use a context manager so the file handle is closed (and the data flushed)
# as soon as the dump finishes.
with open('../app/data/model.pickle', 'wb') as fh:
    pickle.dump(lr, fh)

In [111]:
# Score the training sample and assemble a validation frame: the WoE inputs,
# the target, and the second column of predict_proba (class lr.classes_[1]).
prob = lr.predict_proba(X)

df_to_val = pd.concat([X, y], axis=1, sort=False).assign(prob=prob[:, 1])
# df_to_val.head()