In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import functools as F

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
#from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Imputer as SimpleImputer
from sklearn import metrics   #Additional scklearn functions
#from sklearn.grid_search import GridSearchCV #Perforing grid search
import xgboost as xgb
import shap
#Bayesian optimization for model parameters
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


# Any results you write to the current directory are saved as output.

**Standard Util Functions**

In [None]:
def dataImputation(orig_df):
    """Return an imputed copy of ``orig_df`` with no missing values.

    The frame is run through a default-configured ``SimpleImputer`` and the
    result is wrapped back into a DataFrame that keeps the original row index,
    so the output stays aligned with ``orig_df``.

    Parameters
    ----------
        orig_df     (pandas DataFrame)  numeric feature table; it is not modified

    Returns
    -------
        a new pandas DataFrame holding the imputed values.  Columns for which
        the imputer had no observed value at all are absent from the result,
        because the imputer discards them.
    """
    df = orig_df.copy()

    my_imputer = SimpleImputer()
    imputed = my_imputer.fit_transform(df)

    columns = df.columns
    if imputed.shape[1] != len(columns):
        # The imputer drops features whose fill statistic is NaN (nothing was
        # observed in that column); keep only the labels of surviving columns
        # so that labels and data still line up.
        columns = columns[~np.isnan(my_imputer.statistics_)]

    # index=df.index: without it the result would be renumbered 0..n-1 and
    # would no longer align with the source frame.
    df = pd.DataFrame(imputed, columns=columns, index=df.index)

    print(df.columns)
    return df

#dataImputation(application_train)


In [None]:
def plot_feature_importances(clf, X_train, y_train=None, 
                             top_n=10, figsize=(8,8), print_table=False, title="Feature Importances"):
    '''
    Plot the feature importances of a tree-based sklearn-style estimator.
    
    If ``clf`` does not expose ``feature_importances_`` yet (i.e. it has not been
    fitted), it is fitted on ``X_train`` / ``y_train`` first.  The ``top_n``
    most important features are drawn as a horizontal bar chart.
    
    Note: X_train and y_train are pandas objects (``.values`` and ``.columns``
          are used on them)
    
    Note: Scikit-plot is a lovely package but I sometimes have issues
              1. flexibility/extendibility
              2. complicated models/datasets
          But for many situations Scikit-plot is the way to go
          see https://scikit-plot.readthedocs.io/en/latest/Quickstart.html
    
    Parameters
    ----------
        clf         (sklearn estimator) if not fitted, this routine will fit it
        
        X_train     (pandas DataFrame)  its columns provide the feature names
        
        y_train     (pandas DataFrame)  optional
                                        required only if clf has not already been fitted 
        
        top_n       (int)               Plot the top_n most-important features
                                        Default: 10
                                        
        figsize     ((int,int))         The physical size of the plot
                                        Default: (8,8)
        
        print_table (boolean)           If True, print out the table of feature importances
                                        Default: False
        
        title       (str)               Title of the plot
                                        Default: "Feature Importances"
        
    Returns
    -------
        the pandas dataframe with the top_n features (as index) and their
        importance, sorted by ascending importance
        
    Raises
    ------
        ValueError      clf has to be fitted but y_train was not supplied
        AttributeError  clf has no feature_importances_ even after fitting
        
    Author
    ------
        George Fisher
    '''
    
    import pandas as pd
    
    # xgboost / lightgbm are optional: their "not fitted" exceptions are only
    # relevant (and only importable) when the library is installed.
    not_fitted_errors = [ValueError]
    try:
        from xgboost.core import XGBoostError
        not_fitted_errors.append(XGBoostError)
    except ImportError:
        pass
    try:
        from lightgbm.sklearn import LightGBMError
        not_fitted_errors.append(LightGBMError)
    except ImportError:
        pass
    not_fitted_errors = tuple(not_fitted_errors)
    
    # hasattr() only swallows AttributeError; some estimators raise one of the
    # errors above from the feature_importances_ property when unfitted.
    try:
        has_importances = hasattr(clf, 'feature_importances_')
    except not_fitted_errors:
        has_importances = False
    
    if not has_importances:
        if y_train is None:
            raise ValueError("{} is not fitted; y_train is required to fit it".
                             format(clf.__class__.__name__))
        clf.fit(X_train.values, y_train.values.ravel())
        
        if not hasattr(clf, 'feature_importances_'):
            raise AttributeError("{} does not have feature_importances_ attribute".
                                format(clf.__class__.__name__))
    
    # Imported late so that the input checks above surface before the plotting
    # backend is loaded.
    import matplotlib.pyplot as plt
    
    feat_imp = pd.DataFrame({'importance':clf.feature_importances_})
    feat_imp['feature'] = X_train.columns
    # keep the top_n largest ...
    feat_imp = feat_imp.sort_values(by='importance', ascending=False).iloc[:top_n]
    # ... then order them ascending so the biggest bar is drawn at the top of the barh chart
    feat_imp = feat_imp.sort_values(by='importance').set_index('feature', drop=True)
    
    feat_imp.plot.barh(title=title, figsize=figsize)
    plt.xlabel('Feature Importance Score')
    plt.show()
    
    if print_table:
        from IPython.display import display
        print("Top {} features in descending order of importance".format(top_n))
        display(feat_imp.sort_values(by='importance', ascending=False))
        
    return feat_imp

**Features Input**

In [None]:
# Load the prepared training and test feature tables from the Kaggle input directory.
train_final, test_final = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train_final.csv', '../input/test_final.csv')
)

**The cell below runs Bayesian optimization for model hyperparameter tuning.
Note: it is very resource- and time-consuming.**

In [None]:
def kFoldValidation(train, features, xgbParams, numRounds, nFolds, target='TARGET',
                    randomState=None, earlyStoppingRounds=100):
    """Cross-validate an xgboost booster and return the mean best validation score.

    For every fold a booster is trained on the training part with early
    stopping monitored on the held-out part; the booster's ``best_score``
    (in the metric configured through ``xgbParams``) is collected.

    Parameters
    ----------
        train               (pandas DataFrame)  table holding features and target
        features            (list of str)       feature column names
        xgbParams           (dict)              parameters passed to ``xgb.train``
        numRounds           (int)               maximum number of boosting rounds
        nFolds              (int)               number of shuffled K-fold splits
        target              (str)               target column name. Default: 'TARGET'
        randomState         (int or None)       seed for the fold shuffling;
                                                None (default) keeps it unseeded
        earlyStoppingRounds (int)               early-stopping patience. Default: 100

    Returns
    -------
        the mean of the per-fold ``best_score`` values
    """
    # Materialise the arrays once; DataFrame.as_matrix() no longer exists in
    # current pandas, .values works on old and new versions alike.
    X = train[features].values
    y = train[target].values

    folds = KFold(n_splits=nFolds, shuffle=True, random_state=randomState)
    fold_score = []
    for train_index, cv_index in folds.split(X):
        dtrain = xgb.DMatrix(X[train_index], y[train_index])
        dvalid = xgb.DMatrix(X[cv_index], y[cv_index])
        # the held-out fold is listed last in the watchlist
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        gbm = xgb.train(xgbParams, dtrain, numRounds, evals=watchlist,
                        early_stopping_rounds=earlyStoppingRounds)
        fold_score.append(gbm.best_score)
    return np.mean(fold_score)

def xgbCv(train, features, numRounds, eta, gamma, maxDepth, minChildWeight, subsample, colSample,scale_pos_weight,n_estimators):
    """Score one hyper-parameter combination with 3-fold cross-validation.

    Builds the xgboost parameter dictionary from the arguments, runs
    ``kFoldValidation`` and returns the mean cross-validated AUC.

    The score is returned as-is (higher is better).  The optimizer in
    ``bayesOpt`` *maximizes* its objective, so negating an AUC here would make
    the search converge on the worst parameters.

    Parameters
    ----------
        train, features     data frame and feature column names
        numRounds           maximum boosting rounds (truncated to int)
        maxDepth            tree depth (truncated to int)
        eta, gamma, minChildWeight, subsample, colSample, scale_pos_weight,
        n_estimators        forwarded to xgboost under their xgboost names

    Returns
    -------
        mean cross-validated AUC
    """
    # prepare xgb parameters
    params = {
        # NOTE(review): xgb.train receives the number of rounds through its own
        # argument (numRounds); "n_estimators" is presumably ignored here - confirm.
        "n_estimators": n_estimators,
        "objective": "binary:logistic",
        "booster": "gbtree",
        "eval_metric": "auc",
        "tree_method": 'auto',
        "silent": 1,
        "eta": eta,
        "max_depth": int(maxDepth),
        "min_child_weight": minChildWeight,
        "subsample": subsample,
        "colsample_bytree": colSample,
        "gamma": gamma,
        "scale_pos_weight": scale_pos_weight
    }
    cvScore = kFoldValidation(train, features, params, int(numRounds), nFolds = 3)
    print('CV score: {:.6f}'.format(cvScore))
    return cvScore   # AUC: larger is better, which is what the optimizer maximizes

def bayesOpt(train, features):
    """Search xgboost hyper-parameters with Bayesian optimization and print the best AUC.

    Each candidate is evaluated by ``xgbCv``; the best cross-validated AUC and
    the parameters that produced it are printed.

    Parameters
    ----------
        train       (pandas DataFrame)  training table including the target column
        features    (list of str)       feature column names
    """
    # search interval (lower, upper) for every tuned parameter
    ranges = {
        'numRounds': (1000, 5000),
        'eta': (0.001, 0.3),
        'gamma': (0, 25),
        'maxDepth': (1, 10),
        'minChildWeight': (0, 10),
        'subsample': (0, 1),
        'colSample': (0, 1),
        'scale_pos_weight': (1, 50),
        'n_estimators': (100, 400)
    }

    # proxy through a closure to be able to pass train and features
    def optFunc(numRounds, eta, gamma, maxDepth, minChildWeight, subsample, colSample, scale_pos_weight, n_estimators):
        return xgbCv(train, features, numRounds, eta, gamma, maxDepth, minChildWeight,
                     subsample, colSample, scale_pos_weight, n_estimators)

    bo = BayesianOptimization(optFunc, ranges)
    bo.maximize(init_points = 50, n_iter = 5, kappa = 2, acq = "ei", xi = 0.0)
    # the objective is the AUC itself, so the optimizer's maximum is the best AUC
    bestAuc = round(bo.res['max']['max_val'], 6)
    print("\n Best auc found: %f" % bestAuc)
    print("\n Parameters: %s" % bo.res['max']['max_params'])

# Label column of the training table.
target = "TARGET"

# Model inputs: every training column except the label and the customer id.
predictors = [
    column
    for column in train_final.columns.values
    if column != target and column != 'SK_ID_CURR'
]
print(predictors)

# Run the hyper-parameter search on the full feature set.
bayesOpt(train_final, predictors)

**Feature Selection**

In [None]:
# NOTE: this cell is slow; it is meant to be commented out before the notebook
# is committed so that the committed run finishes faster.
from sklearn.feature_selection import SelectFromModel

# Same hyper-parameters as the final model trained further down.
# early_stopping_rounds is intentionally not passed: no validation set is
# supplied to fit(), so early stopping cannot take place.
model_selection = xgb.XGBClassifier(silent=False,
                      scale_pos_weight=15,
                      learning_rate=0.01,
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='binary:logistic',
                      n_estimators=300,
                      reg_alpha = 1.2,
                      min_child_weight=1,
                      max_delta_step=1,
                      max_depth=5,
                      eval_metric='auc',
                      gamma=1)

# NOTE(review): this cell used to impute `application_train`, which is never
# defined in this notebook (NameError on a fresh kernel).  `train_final` is the
# only training table loaded here, so it is used instead - confirm.
train_selection = dataImputation(train_final)
predictors_selection = [x for x in train_selection.columns.values if x not in ['TARGET','SK_ID_CURR']]

# Keep the features whose importance exceeds 1.25 x the median importance.
embeded_lgb_selector = SelectFromModel(model_selection, threshold='1.25*median')
embeded_lgb_selector.fit(train_selection[predictors_selection], train_selection['TARGET'])
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = train_selection[predictors_selection].loc[:, embeded_lgb_support].columns.tolist()
print(embeded_lgb_feature)

**Model Training**

In [None]:
# Tried with multiple models, but selected XGBOOST as final model. Selected model parameters from above tuning.
# early_stopping_rounds is not passed: fit() gets no validation set, so early
# stopping cannot take place (and recent xgboost versions refuse to fit when it
# is requested without one).
gmb0 = xgb.XGBClassifier(silent=False,
                      scale_pos_weight=15,
                      learning_rate=0.01,
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='binary:logistic',
                      n_estimators=300,
                      reg_alpha = 1.2,
                      min_child_weight=1,
                      max_delta_step=1,
                      max_depth=5,
                      eval_metric='auc',
                      gamma=1).fit(train_final[predictors], train_final[target])

**Evaluating Model  With Cross Validation**

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Same hyper-parameters as the final model.  early_stopping_rounds is not
# passed because cross_val_score supplies no validation set to fit().
model = xgb.XGBClassifier(silent=True,
                      scale_pos_weight=15,
                      learning_rate=0.01,
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='binary:logistic',
                      n_estimators=300,
                      reg_alpha = 1.2,
                      min_child_weight=1,
                      max_delta_step=1,
                      max_depth=5,
                      eval_metric='auc',
                      gamma=1)

# random_state only has an effect when shuffle=True (newer scikit-learn rejects
# the combination otherwise); shuffling with a fixed seed keeps the folds
# reproducible.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

# No `scoring` is given, so the estimator's default score is reported.
results = cross_val_score(model, train_final[predictors], train_final[target], cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

**Risk Score**: Assigning a risk score to each customer who has applied for a loan. The score ranges between 0 and 1; a score closer to 1 means a higher risk of default.

In [None]:

# Columns of test_final that are fed to the model when scoring.
# The active list below is hard-coded; the commented-out line right after the
# note switches to every column except the customer id, and the commented-out
# list at the end is an alternative hard-coded selection.
'''Below Feature are selected from Feature selection module, 
if want to train on whole set of features then uncomment 1st line'''
#test_predictors = [x for x in test_final.columns.values if x not in ['SK_ID_CURR']]
test_predictors = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'OCCUPATION_TYPE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'LIVINGAREA_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BEGINEXPLUATATION_MEDI', 'TOTALAREA_MODE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_3', 'NEW_CREDIT_TO_ANNUITY_RATIO', 'NEW_CREDIT_TO_GOODS_RATIO', 'ANNUITY_TO_INCOME', 'CREDIT_PER_PERS', 'CREDIT_PER_CHILD', 'CREDIT_PER_NO_CHILD', 'NEW_SOURCES_PROD', 'NEW_EXT_SOURCES_MEAN', 'NEW_SCORES_STD', 'NEW_CAR_TO_BIRTH_RATIO', 'NEW_CAR_TO_EMPLOY_RATIO', 'NEW_PHONE_TO_BIRTH_RATIO', 'NEW_PHONE_TO_BIRTH_RATIO_EMPLOYER', 'NEW_CREDIT_TO_INCOME_RATIO', 'INCOME_BY_DAYS_EMP', 'AVG_EXT_SOURCE', 'DAYS_EMP_TO_BIRTH_RATIO', 'AMT_ANNUITY_TO_DAYS_EMP_RATIO', 'AMT_ANNUITY_TO_DAYS_BIRTH_RATIO', 'AMT_INC_TO_DAYS_BIRTH_RATIO', 'AMT_GOODS_PER_PERS', 'AMT_ANNUITY_PER_PERS', 'BUREAU_NET_SUM_POST_PROP', 'BUREAU_NET_PROD_POST_PROP', 'BUREAU_NET_SUM_AMT_POST_PROP', 'BUREAU_NET_PROD_AMT_POST_PROP', 'DAYS_DIFF_MEAN', 'BUREAU_DAY_CREDIT_BY_CNT_PROLONG_min', 'BUREAU_DAY_CREDIT_BY_CNT_PROLONG_max', 'BUREAU_DAY_CREDIT_BY_CNT_PROLONG_mean', 'BUREAU_DAY_CREDIT_BY_CNT_PROLONG_count', 'CNT_CREDIT_PROLONG_count', 'PREV_REFUSED_RATIO_CREDIT_APPLICATION', 'PREV_NOT_REFUSED_RATIO_CREDIT_APPLICATION', 'PREV_REFUSED_RATIO_ANNUITY_APPLICATION', 'PREV_NOT_REFUSED_RATIO_ANNUITY_APPLICATION', 'PREV_NOT_REFUSED_RATIO_AMT_GOODS_PRICE_CREDIT', 'PREV_REFUSED_RATIO_AMT_GOODS_PRICE_CREDIT', 'PREV_NOT_REFUSED_RATIO_GOODS_ANNUITY', 'PREV_REFUSED_RATIO_GOODS_ANNUITY', 'AMT_INSTALMENT_min', 'AMT_INSTALMENT_max', 'AMT_INSTALMENT_sum', 'AMT_INSTALMENT_mean', 'AMT_PAYMENT_min', 'AMT_PAYMENT_max', 
'AMT_PAYMENT_sum', 'AMT_PAYMENT_mean', 'AMT_INSTALMENT_AMT_PAYMENT_DECAY_DIFF_min', 'AMT_INSTALMENT_AMT_PAYMENT_DECAY_DIFF_max', 'AMT_INSTALMENT_AMT_PAYMENT_DECAY_DIFF_sum', 'AMT_INSTALMENT_AMT_PAYMENT_DECAY_DIFF_mean', 'AMT_PAYMENT_DECAY_min', 'AMT_PAYMENT_DECAY_sum', 'DAYS_ENTRY_PAYMENT_min', 'DAYS_ENTRY_PAYMENT_max', 'DAYS_ENTRY_PAYMENT_sum', 'DAYS_ENTRY_PAYMENT_mean', 'DAYS_ENTRY_PAYMENT_var', 'DAYS_ENTRY_PAYMENT_std', 'DAYS_INSTALMENT_min', 'DAYS_INSTALMENT_max', 'DAYS_INSTALMENT_sum', 'DAYS_INSTALMENT_mean', 'DAYS_INSTALMENT_var', 'DAYS_INSTALMENT_std', 'NUM_INSTALMENT_VERSION_sum', 'NUM_INSTALMENT_VERSION_mean', 'NUM_INSTALMENT_VERSION_var', 'DAYS_INSTALMENT_DAYS_ENTRY_PAYMENT_DIFF_min', 'DAYS_INSTALMENT_DAYS_ENTRY_PAYMENT_DIFF_sum', 'DAYS_INSTALMENT_DAYS_ENTRY_PAYMENT_DIFF_mean', 'DAYS_INSTALMENT_DAYS_ENTRY_PAYMENT_DIFF_var', 'DAYS_INSTALMENT_DAYS_ENTRY_PAYMENT_DIFF_std', 'AMT_INSTALMENT_AMT_PAYMENT_DIFF_max', 'AMT_INSTALMENT_AMT_PAYMENT_DIFF_sum', 'AMT_INSTALMENT_AMT_PAYMENT_DIFF_mean', 'PREV_NON_CASH_TOTAL_DEBT_TO_AMT_CREDIT_min', 'PREV_NON_CASH_TOTAL_DEBT_TO_AMT_CREDIT_max', 'PREV_NON_CASH_TOTAL_DEBT_TO_AMT_CREDIT_sum', 'PREV_NON_CASH_TOTAL_DEBT_TO_AMT_CREDIT_mean', 'PREV_NON_CASH_TOTAL_DEBT_TO_AMT_CREDIT_var', 'PREV_NON_CASH_TOTAL_DEBT_TO_AMT_CREDIT_std', 'PREV_CASH_TOTAL_DEBT_TO_AMT_CREDIT_min', 'PREV_CASH_TOTAL_DEBT_TO_AMT_CREDIT_max', 'PREV_CASH_TOTAL_DEBT_TO_AMT_CREDIT_sum', 'PREV_CASH_TOTAL_DEBT_TO_AMT_CREDIT_mean', 'PREV_CASH_TOTAL_DEBT_TO_AMT_CREDIT_var', 'DAYS_DECISION_min', 'DAYS_DECISION_max', 'DAYS_DECISION_mean', 'DAYS_DECISION_var']
#test_predictors = ['FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_9', 'HOUSETYPE_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_11', 'FLAG_EMP_PHONE', 'FLAG_PHONE', 'REG_REGION_NOT_WORK_REGION', 'LIVE_CITY_NOT_WORK_CITY', 'FLAG_EMAIL', 'FLAG_DOCUMENT_14', 'REG_CITY_NOT_WORK_CITY', 'ELEVATORS_MEDI', 'CHILDREN_RATIO', 'ELEVATORS_MODE', 'FLAG_DOCUMENT_13', 'CNT_CHILDREN', 'NONLIVINGAPARTMENTS_MEDI', 'CNT_FAM_MEMBERS', 'ENTRANCES_MEDI', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'FLAG_OWN_REALTY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'FLOORSMAX_MEDI', 'FONDKAPREMONT_MODE', 'FLOORSMAX_MODE', 'ELEVATORS_AVG', 'FLAG_DOCUMENT_18', 'REGION_RATING_CLIENT', 'ENTRANCES_MODE', 'FLOORSMIN_MODE', 'FLOORSMIN_MEDI', 'CNT_NO_CHILD', 'OBS_30_CNT_SOCIAL_CIRCLE', 'FLAG_DOCUMENT_16', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAPARTMENTS_MODE', 'YEARS_BUILD_MEDI', 'AMT_REQ_CREDIT_BUREAU_DAY', 'YEARS_BUILD_MODE', 'FLAG_OWN_CAR', 'REG_CITY_NOT_LIVE_CITY', 'NAME_HOUSING_TYPE', 'NAME_TYPE_SUITE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'LIVINGAPARTMENTS_MODE', 'FLOORSMIN_AVG', 'ENTRANCES_AVG', 'BUREAU_LOAN_TYPES', 'WALLSMATERIAL_MODE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'FLOORSMAX_AVG', 'NONLIVINGAREA_MEDI', 'APARTMENTS_MODE', 'APARTMENTS_MEDI', 'AMT_REQ_CREDIT_BUREAU_MON', 'LIVINGAPARTMENTS_MEDI', 'NAME_CONTRACT_TYPE', 'NAME_INCOME_TYPE', 'LANDAREA_MEDI', 'FLAG_DOCUMENT_3', 'BASEMENTAREA_MODE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'LIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_MODE', 'WEEKDAY_APPR_PROCESS_START', 'YEARS_BUILD_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'NONLIVINGAREA_AVG', 'LIVINGAREA_MODE', 'LIVINGAREA_MEDI', 'LANDAREA_MODE', 'YEARS_BEGINEXPLUATATION_MEDI', 'BASEMENTAREA_MEDI', 'AMT_REQ_CREDIT_BUREAU_QRT', 'COMMONAREA_MODE', 'COMMONAREA_AVG', 'LIVINGAREA_AVG', 'FLAG_WORK_PHONE', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'COMMONAREA_MEDI', 'PREV_NFLAG_INSURED_SUM', 'YEARS_BEGINEXPLUATATION_MODE', 'APARTMENTS_AVG', 'LANDAREA_AVG', 'TOTALAREA_MODE', 'BASEMENTAREA_AVG', 'NEW_INC_PER_CHLD', 'NAME_FAMILY_STATUS', 'BUREAU_LOAN_COUNT', 
#'REGION_RATING_CLIENT_W_CITY', 'OCCUPATION_TYPE', 'DAYS_DIFF_STD', 'NEW_INCOME_PER_PERS', 'BUREAU_LOAN_PER_TYPE', 'HOUR_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'AMT_INCOME_TOTAL', 'CREDIT_PER_CHILD', 'NEW_CAR_TO_BIRTH_RATIO', 'NAME_EDUCATION_TYPE', 'REGION_POPULATION_RELATIVE', 'CODE_GENDER', 'DAYS_DIFF_MEAN', 'OWN_CAR_AGE', 'PREV_REFUSED_RATIO_ANNUITY_APPLICATION', 'DAYS_DIFF_RATIO', 'CREDIT_PER_PERS', 'NEW_PHONE_TO_BIRTH_RATIO', 'NEW_SCORES_STD', 'CREDIT_PER_NO_CHILD', 'AMT_CREDIT', 'DAYS_LAST_PHONE_CHANGE', 'NEW_CAR_TO_EMPLOY_RATIO', 'NEW_PHONE_TO_BIRTH_RATIO_EMPLOYER', 'PREV_REFUSED_RATIO_CREDIT_APPLICATION', 'NEW_CREDIT_TO_INCOME_RATIO', 'NEW_CREDIT_TO_GOODS_RATIO', 'DAYS_REGISTRATION', 'DAYS_EMPLOYED', 'AMT_GOODS_PRICE', 'DAYS_ID_PUBLISH', 'AMT_ANNUITY', 'NEW_SOURCES_PROD', 'PREV_NOT_REFUSED_RATIO_ANNUITY_APPLICATION', 'INCOME_BY_DAYS_EMP', 'ANNUITY_TO_INCOME', 'PREV_NOT_REFUSED_RATIO_CREDIT_APPLICATION', 'EXT_SOURCE_2', 'EXT_SOURCE_1', 'NEW_EXT_SOURCES_MEAN', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'NEW_CREDIT_TO_ANNUITY_RATIO']
# NOTE(review): gmb0 is fitted on `predictors` in the training cell, while the
# scoring below uses `test_predictors`; the model needs the same feature
# columns at fit and predict time - confirm that the two lists match.
score = np.asarray(gmb0.predict_proba(test_final[test_predictors]))

# predict_proba returns one column per class, ordered like gmb0.classes_; for a
# 0/1 target, column 1 is P(TARGET = 1), i.e. the probability of default.
# Column 0 would be the probability of NOT defaulting and would invert the
# risk score.
test_final['TARGET'] = score[:, 1]

# Submission file: customer id plus risk score.
cust_with_score = test_final[['SK_ID_CURR', 'TARGET']]
cust_with_score.to_csv('submission_post5.csv', index=False)
