In [1]:
# import packages
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string
import xgboost as xgb


# Show up to 200 columns/rows when a DataFrame is rendered in the notebook.
pd.set_option('display.max_columns',200)

pd.set_option('display.max_rows',200)

# Load the training and test sets; paths are relative to the notebook's working directory.
train = pd.read_csv(r'train_s3TEQDk.csv')
test  = pd.read_csv(r'test_mSzZ8RL.csv')
# NOTE(review): leftover experiment kept disabled; 'City_Code' is not among the
# columns shown by train.head(), so enabling it would presumably raise KeyError.
# train = train[train['City_Code'].isin(['C1'])]

# Preview the first rows of the training data.
train.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


# Calculating WOE and IV score

In [2]:
max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n = max_bin):
    """Bin a numeric feature into quantile buckets and compute WOE / IV per bucket.

    Starting from ``n`` quantile buckets, the bucket count is reduced until the
    Spearman correlation between the per-bucket mean of ``X`` and the per-bucket
    mean of ``Y`` reaches +/-1 (or becomes NaN). If that search ends with a single
    bucket, or never succeeds, ``force_bin`` quantile edges are used instead.

    Parameters
    ----------
    Y : sequence or Series of 0/1 event flags (summed to count events).
    X : sequence or Series of numeric values; nulls get their own final row.
    n : int, initial number of quantile buckets to try (default ``max_bin``).

    Returns
    -------
    DataFrame with one row per bucket (plus one trailing row for missing ``X``)
    and columns VAR_NAME, MIN_VALUE, MAX_VALUE, COUNT, EVENT, EVENT_RATE,
    NONEVENT, NON_EVENT_RATE, DIST_EVENT, DIST_NON_EVENT, WOE, IV.
    Infinite values are replaced by 0 and IV holds the same total on every row.
    """
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[df1.X.isnull()]
    notmiss = df1[df1.X.notnull()]

    d2 = None
    r = 0
    # The ``n >= 1`` guard stops the search when qcut fails for every bucket
    # count (e.g. constant X); without it the loop would never terminate.
    while np.abs(r) < 1 and n >= 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True, observed=True)
            bucket_means = d2.mean()
            r, p = stats.spearmanr(bucket_means.X, bucket_means.Y)
        except Exception:
            # qcut raises when quantile edges are not unique; retry with fewer buckets.
            pass
        n = n - 1
    # True when the loop ended because |r| reached 1 or r became NaN.
    converged = not (np.abs(r) < 1)

    if notmiss.empty:
        # Every X is missing: only the missing-value row below will be reported.
        d3 = pd.DataFrame({c: pd.Series(dtype=float)
                           for c in ("MIN_VALUE", "MAX_VALUE", "COUNT", "EVENT", "NONEVENT")})
    else:
        if d2 is None or not converged or len(d2) == 1:
            bins = np.quantile(notmiss.X, np.linspace(0, 1, force_bin))
            if len(np.unique(bins)) == 2:
                # Only two distinct edges would give a single bucket; add extra
                # lower edges so that pd.cut can produce more than one.
                bins = np.insert(bins, 0, 1)
                bins[1] = bins[1]-(bins[1]/2)
            edges = np.unique(bins)
            lowest = notmiss.X.min()
            if edges[0] > lowest:
                # The adjustment above can move the lowest edge past the minimum;
                # re-add it so no row falls outside the cut range.
                edges = np.insert(edges, 0, lowest)
            if len(edges) < 2:
                # Constant feature: a single bucket holding every non-missing row.
                bucket = pd.Series(0, index=notmiss.index)
            else:
                bucket = pd.cut(notmiss.X, edges, include_lowest=True)
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": bucket})
            # observed=True drops empty buckets, which would otherwise yield NaN rows.
            d2 = d1.groupby('Bucket', as_index=True, observed=True)

        d3 = pd.DataFrame({
            "MIN_VALUE": d2["X"].min(),
            "MAX_VALUE": d2["X"].max(),
            "COUNT": d2["Y"].count(),
            "EVENT": d2["Y"].sum(),
        })
        d3["NONEVENT"] = d3["COUNT"] - d3["EVENT"]
        d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan,
                           'MAX_VALUE': np.nan,
                           'COUNT': justmiss.Y.count(),
                           'EVENT': justmiss.Y.sum()}, index=[0])
        d4["NONEVENT"] = d4["COUNT"] - d4["EVENT"]
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        d3 = d4 if d3.empty else pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.EVENT.sum()
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.NONEVENT.sum()
    # Buckets with no events or no non-events give log(0) or a division by zero;
    # the resulting infinities are zeroed below, so silence the warning here.
    with np.errstate(divide='ignore', invalid='ignore'):
        d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*d3.WOE
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    # Every row carries the feature's total information value.
    d3.IV = d3.IV.sum()

    return(d3)

def char_bin(Y, X):
    """Compute WOE / IV for a categorical feature, one bucket per distinct value.

    Parameters
    ----------
    Y : sequence or Series of 0/1 event flags (summed to count events).
    X : sequence or Series of category values; nulls get their own final row.

    Returns
    -------
    DataFrame with one row per category (plus one trailing row for missing ``X``)
    and columns VAR_NAME, MIN_VALUE, MAX_VALUE, COUNT, EVENT, EVENT_RATE,
    NONEVENT, NON_EVENT_RATE, DIST_EVENT, DIST_NON_EVENT, WOE, IV.
    MIN_VALUE and MAX_VALUE both hold the category value. Infinite values are
    replaced by 0 and IV holds the same total on every row.
    """
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[df1.X.isnull()]
    notmiss = df1[df1.X.notnull()]
    per_category = notmiss.groupby('X')['Y']

    d3 = pd.DataFrame({"COUNT": per_category.count(), "EVENT": per_category.sum()})
    d3["MIN_VALUE"] = d3.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["NONEVENT"] = d3["COUNT"] - d3["EVENT"]
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan,
                           'MAX_VALUE': np.nan,
                           'COUNT': justmiss.Y.count(),
                           'EVENT': justmiss.Y.sum()}, index=[0])
        d4["NONEVENT"] = d4["COUNT"] - d4["EVENT"]
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        d3 = d4 if d3.empty else pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    # Sum only the numeric column: a whole-frame sum would also try to add up
    # the category labels in MIN_VALUE/MAX_VALUE.
    d3["DIST_EVENT"] = d3.EVENT/d3.EVENT.sum()
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.NONEVENT.sum()
    # Categories with no events or no non-events give log(0) or a division by
    # zero; the resulting infinities are zeroed below, so silence the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*d3.WOE
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    # Every row carries the feature's total information value.
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)

    return(d3)

In [3]:

def data_vars(df1,target,test,cat_threshold):
    """Compute WOE / IV tables for every feature and merge similar categories.

    Numeric columns with more than two distinct values are binned with
    ``mono_bin``. Every other column goes through ``char_bin``; its categories
    whose WOE values lie within ``cat_threshold`` of each other are then merged
    (via ``comb_category``) repeatedly until no further merge is found.

    Parameters
    ----------
    df1 : DataFrame of training features. Category merges are written into it
        IN PLACE.
    target : Series of 0/1 event flags aligned with ``df1``. If it has a
        ``name`` matching a column of ``df1``, that column is skipped.
    test : DataFrame that receives the same category merges, IN PLACE, for
        every merged column it contains.
    cat_threshold : float, maximum WOE gap for two categories to be merged.

    Returns
    -------
    tuple ``(iv_df, iv, df1, test, replace)`` where ``iv_df`` is the stacked
    per-bucket table, ``iv`` has one row per feature (VAR_NAME, IV), ``df1`` and
    ``test`` are the (mutated) inputs, and ``replace`` maps each categorical
    column to the list of merge dictionaries applied to it.
    """
    # Identify the target column from the Series itself. The previous approach
    # parsed the caller's source line, which picked up the wrong token once
    # extra arguments were added and failed on multi-line calls.
    target_name = getattr(target, "name", None)

    replace = {}
    tables = []

    for column in df1.columns:
        if target_name is not None and column == target_name:
            continue

        values = df1[column]
        if pd.api.types.is_numeric_dtype(values) and values.nunique(dropna=False) > 2:
            conv = mono_bin(target, values)
        else:
            merge_steps = []
            conv = char_bin(target, df1[column]).sort_values('WOE')
            similar_col, var_count = comb_category(conv, cat_threshold)
            merge_steps.append(similar_col)

            # Keep merging until comb_category reports no group of similar categories.
            while var_count > 0:
                for kept, absorbed in similar_col.items():
                    df1.loc[df1[column].isin(absorbed), column] = kept
                    if column in test.columns:
                        test.loc[test[column].isin(absorbed), column] = kept
                conv = char_bin(target, df1[column]).sort_values('WOE')
                similar_col, var_count = comb_category(conv, cat_threshold)
                merge_steps.append(similar_col)
            replace[column] = merge_steps

        conv["VAR_NAME"] = column
        tables.append(conv)

    if tables:
        # Concatenate once instead of growing the frame inside the loop
        # (DataFrame.append was removed in pandas 2.0).
        iv_df = pd.concat(tables, ignore_index=True)
        iv = iv_df.groupby('VAR_NAME').IV.max().reset_index()
    else:
        iv_df = pd.DataFrame(columns=['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV'])
        iv = pd.DataFrame(columns=['VAR_NAME', 'IV'])
    return(iv_df,iv,df1,test,replace)

In [4]:

def comb_category(woe,threshold):
    """Group categories whose WOE lies just above another category's WOE.

    Categories are visited in the row order of ``woe``. For each category not
    yet absorbed into a group, every other unassigned category whose WOE exceeds
    it by more than 0 and less than ``threshold`` is attached to it.

    Parameters
    ----------
    woe : DataFrame with at least the columns ``MIN_VALUE`` (category value) and
        ``WOE``. Rows whose ``MIN_VALUE`` is null (the missing-value bucket) are
        ignored, since they cannot be matched by equality.
    threshold : float, exclusive upper bound on the WOE gap.

    Returns
    -------
    tuple ``(similar_col, count)``: ``similar_col`` maps each group's anchor
    category to the list of categories merged into it (possibly empty), and
    ``count`` is the number of anchors with a non-empty list.
    """
    # One pass to build a category -> WOE lookup; the first row wins on duplicates.
    woe_by_category = {}
    for category, value in zip(woe['MIN_VALUE'], woe['WOE']):
        if pd.isna(category):
            continue
        woe_by_category.setdefault(category, float(value))

    similar_col = dict()
    assigned = set()
    count = 0

    for anchor, anchor_woe in woe_by_category.items():
        if anchor in assigned:
            continue

        merged = []
        for candidate, candidate_woe in woe_by_category.items():
            if candidate == anchor or candidate in assigned:
                continue
            gap = candidate_woe - anchor_woe
            if gap > 0.0 and gap < threshold:
                merged.append(candidate)
                assigned.add(candidate)
        assigned.add(anchor)

        similar_col[anchor] = merged
        if merged:
            count += 1

    return(similar_col,count)


In [5]:
# Work on copies so the frames loaded from CSV stay untouched by later in-place edits.
train_copy = train.copy()
test_copy  = test.copy()

In [6]:
# Drop the identifier and treat a missing Credit_Product as its own 'NA' category.
# Assigning the result back (instead of chained ``inplace=True``) also works under
# pandas copy-on-write, and errors='ignore' lets the cell be re-run safely.
train_copy = train_copy.drop(columns='ID', errors='ignore')
train_copy['Credit_Product'] = train_copy['Credit_Product'].fillna('NA')

test_copy = test_copy.drop(columns='ID', errors='ignore')
test_copy['Credit_Product'] = test_copy['Credit_Product'].fillna('NA')

# Build the WOE / IV tables; data_vars also merges similar categories inside
# train_copy and test_copy and returns them as new_train / new_test.
final_iv, IV,new_train,new_test,cat_replace = data_vars(train_copy,train_copy.Is_Lead,test_copy,cat_threshold = 0.1)

  result = getattr(ufunc, method)(*inputs, **kwargs)


# Replacing features with WOE values

In [7]:
def woe_replacement(train,transform_vars_list,transform_prefix,iv_table=None):
    """Add a WOE-encoded copy of each listed column to ``train``.

    For every variable found in the bucket table, a new column named
    ``transform_prefix + var`` is written into ``train`` (IN PLACE):

    * numeric column with numeric bucket bounds: each value receives the WOE of
      the first bucket (ascending MAX_VALUE) whose MAX_VALUE is >= the value;
    * otherwise: each value receives the WOE of the bucket whose MAX_VALUE
      equals it.

    Values matching no bucket get 0. Null values get the WOE of the bucket whose
    MAX_VALUE is null when one exists, else 0. Variables absent from the table
    are skipped without creating a column.

    Parameters
    ----------
    train : DataFrame to encode.
    transform_vars_list : iterable of column names to encode.
    transform_prefix : str prepended to each new column name.
    iv_table : optional DataFrame with columns VAR_NAME, MAX_VALUE and WOE;
        defaults to the notebook-level ``final_iv``.

    Returns
    -------
    The same ``train`` object, with the new columns added.
    """
    if iv_table is None:
        iv_table = final_iv

    for var in transform_vars_list:
        print(var)

        buckets = iv_table[iv_table['VAR_NAME'] == var]
        if buckets.empty:
            continue

        unlabeled = buckets[buckets.MAX_VALUE.isnull()]
        labeled = buckets[buckets.MAX_VALUE.notnull()]
        missing_woe = unlabeled.WOE.iloc[-1] if len(unlabeled) else 0.0
        woe_by_bound = dict(zip(labeled.MAX_VALUE, labeled.WOE))

        values = train[var]
        is_null = values.isnull()
        bounds_are_numeric = pd.to_numeric(labeled.MAX_VALUE, errors='coerce').notnull().all()

        if pd.api.types.is_numeric_dtype(values) and bounds_are_numeric:
            bounds = sorted(woe_by_bound)
            uppers = np.array(bounds, dtype=float)
            # Trailing 0.0 is the value for anything above the largest bound.
            woes = np.array([woe_by_bound[b] for b in bounds] + [0.0], dtype=float)
            # side='left' gives the first bound that is >= the value.
            position = np.searchsorted(uppers, values.astype(float).to_numpy(), side='left')
            encoded = woes[position]
            encoded[is_null.to_numpy()] = missing_woe
        else:
            # Direct lookup replaces the former per-row eval of generated code.
            encoded = values.map(woe_by_bound).astype(float).fillna(0.0)
            encoded[is_null] = missing_woe

        train[transform_prefix + var] = encoded

    return(train)

In [8]:
# Encode every training column except the target; encoded columns get the 'new_' prefix.
transform_vars_list = new_train.columns.difference(['Is_Lead'])
transform_prefix = 'new_'

In [9]:
# Adds one 'new_<column>' WOE column per listed variable; new_train is modified in place and returned.
new_train = woe_replacement(new_train,transform_vars_list,transform_prefix)

Age
Avg_Account_Balance
Channel_Code
Credit_Product
Gender
Is_Active
Occupation
Region_Code
Vintage


In [10]:
# Reuse the same name for the test-set column list.
# NOTE(review): presumably the test set has no 'Is_Lead' column, so nothing is excluded here — confirm.
transform_vars_list = new_test.columns

In [11]:
# Apply the WOE values to the test set with the same prefix (woe_replacement reads them from final_iv).
new_test = woe_replacement(new_test,transform_vars_list,transform_prefix)

Gender
Age
Region_Code
Occupation
Channel_Code
Vintage
Credit_Product
Avg_Account_Balance
Is_Active


In [12]:
# List the columns of new_train after encoding.
new_train.columns

Index(['Gender', 'Age', 'Region_Code', 'Occupation', 'Channel_Code', 'Vintage',
       'Credit_Product', 'Avg_Account_Balance', 'Is_Active', 'Is_Lead',
       'new_Age', 'new_Avg_Account_Balance', 'new_Channel_Code',
       'new_Credit_Product', 'new_Gender', 'new_Is_Active', 'new_Occupation',
       'new_Region_Code', 'new_Vintage'],
      dtype='object')

In [13]:
# Preview the first rows of new_train, raw and 'new_'-prefixed columns side by side.
new_train.head()

Unnamed: 0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead,new_Age,new_Avg_Account_Balance,new_Channel_Code,new_Credit_Product,new_Gender,new_Is_Active,new_Occupation,new_Region_Code,new_Vintage
0,Female,73,RG268,Other,X3,43,No,1045696,No,0,0.498777,0.121814,0.623044,-1.364785,-0.196014,-0.168061,0.042326,0.324851,-0.542668
1,Female,30,RG254,Salaried,X1,32,No,581988,No,0,-1.356528,-0.176053,-1.12836,-1.364785,-0.196014,-0.168061,-0.492647,-0.074837,-0.542668
2,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0,0.498777,0.187896,0.623044,-1.364785,-0.196014,0.238257,0.203481,0.324851,-0.542668
3,Male,34,RG264,Salaried,X1,19,No,470454,No,0,0.332437,-0.327351,-1.12836,-1.364785,0.148701,-0.168061,-0.492647,-0.634452,-0.583654
4,Female,30,RG262,Salaried,X1,33,No,886787,No,0,-1.356528,-0.012826,-1.12836,-1.364785,-0.196014,-0.168061,-0.492647,-0.275504,-0.542668


# Modelling

In [64]:
# Features fed to the model: a mix of raw columns and 'new_'-prefixed WOE columns.
# 'Credit_Product' is still a raw categorical here and must be encoded before fitting.
predictors = ['Age','Avg_Account_Balance', 'new_Channel_Code',
       'new_Credit_Product', 'new_Is_Active', 'new_Occupation',
        'new_Vintage','Vintage','Credit_Product']

target = ['Is_Lead']


X = new_train[predictors]
# Select the target as a 1-D Series: scikit-learn warns when y is passed as an
# (n, 1) column vector, which is what indexing with the list would give.
y = new_train[target[0]]

X_test = new_test[predictors]

In [65]:
# One-hot encode the raw Credit_Product column. Both frames are rebuilt from
# their sources so the cell can be re-run, and the test frame is aligned to the
# training columns so a category missing from one side cannot shift or drop features.
X = pd.get_dummies(new_train[predictors], columns = ['Credit_Product'])
X_test = pd.get_dummies(new_test[predictors], columns = ['Credit_Product'])
X_test = X_test.reindex(columns = X.columns, fill_value = 0)

In [51]:

from sklearn.ensemble import RandomForestClassifier

# Only non-default settings are listed. The former ``min_impurity_split`` argument
# and ``max_features='auto'`` are no longer accepted by current scikit-learn;
# 'sqrt' is what 'auto' meant for classifiers.
# NOTE(review): the stored output of this cell shows n_estimators=1000, so it
# appears to come from an earlier run — re-execute to refresh it.
rdf = RandomForestClassifier(
    class_weight='balanced_subsample',
    criterion='entropy',
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=12,
    min_samples_split=5,
    n_estimators=2000,
    n_jobs=-1,
    random_state=10,
)
# ravel() hands scikit-learn a 1-D target whether y is a Series or a one-column frame.
rdf.fit(X, np.ravel(y))

  rdf.fit(X,y)


RandomForestClassifier(class_weight='balanced_subsample', criterion='entropy',
                       max_depth=10, min_samples_leaf=12, min_samples_split=5,
                       n_estimators=1000, n_jobs=-1, random_state=10)

In [53]:
# Pair each test ID with its predicted probability of the positive class.
# The columns are combined by position, so the outcome does not depend on the
# index of ``test`` matching a fresh 0..n-1 index.
result = pd.DataFrame({
    'ID': test['ID'].to_numpy(),
    'Is_Lead': rdf.predict_proba(X_test)[:, 1],
})


In [110]:
# Importance of each fitted feature. Names come from X.columns because one-hot
# encoding expands 'Credit_Product', so ``predictors`` no longer matches the
# number of fitted columns.
# NOTE(review): the stored output of this cell lists Region_Code_* features that
# are not in the current predictors, so it appears to come from an earlier run.
feature_importances = list(rdf.feature_importances_)
feature_importance_matrix = pd.DataFrame({'Columns': X.columns, 'f_imp' : feature_importances})

feature_importance_matrix.sort_values('f_imp')

Unnamed: 0,Columns,f_imp
15,Region_Code_RG265,0.000347
13,Region_Code_RG262,0.000473
14,Region_Code_RG264,0.000573
12,Region_Code_RG254,0.000745
11,Region_Code_RG250,0.000981
16,Region_Code_RG268,0.002411
1,Avg_Account_Balance,0.009778
4,new_Is_Active,0.017117
6,new_Vintage,0.035701
5,new_Occupation,0.03865
