## Data Science Nigeria AI Bootcamp Hackathon 2020

## Task: Predict customers who will default on a loan.

## Import Libraries

In [1]:
# Data Processing and Visualization Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Data Modelling Libraries
# Classifiers
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import (cross_val_score, train_test_split, StratifiedKFold)
from sklearn.preprocessing import StandardScaler

import warnings

# Raise pandas' display limits so wide/long frames are not truncated when shown
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 200)

# Ignore DeprecationWarning messages whose originating module matches 'sklearn*'
warnings.filterwarnings(action='ignore', category=DeprecationWarning, module='sklearn*')

## Load Datasets

In [2]:
# Read the training set and the test set (the test file has no default_status column)
churn_train, churn_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Stack both sets into one frame with a fresh 0..n-1 index so that the
# preprocessing steps below are applied to train and test rows alike
churn = pd.concat([churn_train, churn_test], ignore_index=True)

### Quick Data Exploration

In [3]:
# Preview the first rows of the combined (train + test) frame
churn.head()

Unnamed: 0,Applicant_ID,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,form_field11,form_field12,form_field13,form_field14,form_field15,form_field16,form_field17,form_field18,form_field19,form_field20,form_field21,form_field22,form_field23,form_field24,form_field25,form_field26,form_field27,form_field28,form_field29,form_field30,form_field31,form_field32,form_field33,form_field34,form_field35,form_field36,form_field37,form_field38,form_field39,form_field40,form_field41,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
0,Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,11333126.0,4397256.0,2301324.0,7875694.0,96375139,112181765.0,1.0,1.0,1.0,3.0,3.0,134.655,,,134.655,4000.5,17064.0,11376.0,60.0,17064.0,5962.0,,1052.0,8.0,2.0,,12.0,12.0,4.0,0.0,,,0.392854,2.02,0.711632,0.0,0.0,charge,,1.129518,0.044335,no
1,Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,2533168.0,244292.0,3982971.0,2141445.0,28141427,45169902.0,1.0,1.0,1.0,2.0,2.0,109.23,11385.24,14584.29,86.685,4161.0,13413.0,8942.0,304.0,13413.0,5780.0,,2554.0,17.5,0.0,0.0,16.0,24.0,8.0,0.0,,,0.314281,8.08,0.183584,,0.0,charge,349.80573,1.620483,0.322436,no
2,Apcnt_1000008,3276.0,0.53845,3.151,0.0,6.282,,956940.0,,192944.0,1079864.0,,,0.0,16576457,22452691.0,,,,0.0,0.0,,,,,1597.5,4746.0,3164.0,60.0,15696.0,,14508.0,70.0,0.6666,0.0,,12.0,0.0,6.0,0.0,149.6805,,0.162965,18.18,0.791136,0.0,0.0,charge,,1.51337,0.01164,yes
3,Apcnt_1000012,3372.0,0.17005,0.505,0.0,0.0,192166.0,3044703.0,385499.0,3986472.0,3621979.0,,771776.0,1186839.0,48187375,136297820.0,0.0,0.0,0.0,1.0,1.0,120.12,8580.0,0.0,85.8,1701.0,4014.0,2676.0,60.0,11679.0,,,522.0,7.8334,1.0,1.0,4.0,8.0,4.0,1.0,,,0.488884,2.02,0.685168,,0.0,charge,89.9401,0.664452,0.082729,no
4,Apcnt_1000016,3370.0,0.7727,1.101,0.0,0.0,1556.0,214728.0,214728.0,1284089.0,361770.0,396391.0,818456.0,511535.0,28141427,15292757.0,0.0,0.0,0.0,3.0,3.0,104.535,8360.19,10326.45,115.335,1551.0,3285.0,2190.0,60.0,19437.0,12958.0,5202.0,272.0,1.0,0.0,0.0,24.0,2.0,24.0,0.0,150.0135,,0.275,12.12,0.438168,0.0,0.0,charge,97.887502,1.427891,0.04563,no


In [4]:
# Preview the first rows of the test set (note: it has no default_status column)
churn_test.head()

Unnamed: 0,Applicant_ID,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,form_field11,form_field12,form_field13,form_field14,form_field15,form_field16,form_field17,form_field18,form_field19,form_field20,form_field21,form_field22,form_field23,form_field24,form_field25,form_field26,form_field27,form_field28,form_field29,form_field30,form_field31,form_field32,form_field33,form_field34,form_field35,form_field36,form_field37,form_field38,form_field39,form_field40,form_field41,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50
0,Apcnt_1000032,3236.0,0.34875,10.2006,0.0,0.0,418564.0,418564.0,418564.0,540710.0,0.0,1649749.0,5446.0,0.0,48187375,114686147.0,0.0,0.0,0.0,0.0,0.0,,,,,1086.0,6204.0,2676.0,60.0,14691.0,9794.0,,238.0,1.8334,0.0,,22.0,2.0,14.0,0.0,,,0.825,1.01,0.8,,0.0,charge,,0.0,0.011221
1,Apcnt_1000048,3284.0,1.2736,2.9606,9.0198,0.0,0.0,9858816.0,49014.0,1510098.0,18308285.0,622789.0,1025793.0,1396510.0,34694910,87479487.0,1.0,1.0,1.0,2.0,2.0,142.56,,,570.585,3082.5,10221.0,6814.0,60.0,11955.0,3528.0,,1216.0,19.5,3.0,,30.0,18.0,14.0,2.0,,18.8415,0.507694,4.04,0.623248,1.0,0.0,lending,,0.504974,0.043525
2,Apcnt_1000052,,0.27505,0.06,0.0,0.0,,,,,0.0,118256.0,,,21973443,,,,,0.0,0.0,,,,,,,,852.0,4836.0,,,,7.8334,0.0,,2.0,,0.0,,,,,0.0,,,,charge,,0.0,
3,Apcnt_1000076,3232.0,0.28505,2.8032,0.0,0.0,0.0,473802.0,473802.0,1724437.0,493641.0,1391064.0,1176725.0,1220293.0,16576457,,2.0,2.0,2.0,3.0,3.0,109.005,13050.0,,109.005,1414.5,4197.0,2798.0,60.0,14964.0,,,248.0,9.0,1.0,1.0,36.0,0.0,26.0,0.0,,,0.916663,2.02,0.464224,,,charge,90.163742,0.788809,0.104029
4,Apcnt_1000080,3466.0,2.09545,0.8318,2.5182,0.0,19839.0,1150662.0,1150662.0,7860523.0,5752921.0,271133.0,613064.0,4779643.0,48187375,72049802.0,0.0,0.0,0.0,3.0,3.0,139.065,469.29,469.29,15.18,2724.0,9855.0,6570.0,182.0,11679.0,5900.0,,2388.0,16.6666,1.0,0.0,24.0,28.0,20.0,0.0,,,0.234047,23.23,0.726688,0.0,0.0,lending,1303.587148,1.637733,0.163124


In [5]:
# Print (rows, columns) for the train, test and combined frames, in that order
for frame in (churn_train, churn_test, churn):
    print(frame.shape)

(56000, 52)
(24000, 51)
(80000, 52)


## Missing Data

In [6]:
def missing(df):
    """Summarise the missing values of every column in a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns:
        ``Total``   - number of null entries in that column (integer count),
        ``Percent`` - share of rows that are null, in percent, rounded to 2 dp
                      (0.0 for every column when ``df`` has no rows).
    """
    # Compute the null counts once and derive the percentage from them
    null_num = df.isnull().sum()
    n_rows = len(df)
    if n_rows:
        null_percent = (null_num / n_rows * 100).round(2)
    else:
        # No rows: avoid 0/0 -> NaN and report 0 % missing instead
        null_percent = null_num.astype(float)
    # Building the frame column-wise keeps 'Total' as integers; stacking the two
    # series as rows and transposing would upcast the counts to float
    return pd.DataFrame({'Total': null_num, 'Percent': null_percent})

# Display the per-column missing-value summary for the combined (train + test) frame
missing(churn)

Unnamed: 0,Total,Percent
Applicant_ID,0.0,0.0
form_field1,3639.0,4.55
form_field2,5553.0,6.94
form_field3,501.0,0.63
form_field4,501.0,0.63
form_field5,501.0,0.63
form_field6,18964.0,23.7
form_field7,7394.0,9.24
form_field8,18964.0,23.7
form_field9,11408.0,14.26


There are features that have as much as 70% missing values. Ordinarily, I would have dropped columns with more than 70% missing values, but based on an earlier analysis (not included in this notebook, for the sake of brevity), it turns out that for this particular project it is more effective to fill all missing values in the dataset with -999, as our machine learning model yields much more accurate results with this value.

In [7]:
# Keep the test-set applicant IDs aside; they are needed again at the end
# of the task when the predictions are saved.
test_applicantID = churn_test['Applicant_ID']

# Replace every missing value with the sentinel -999, then discard the
# Applicant_ID column from the result
churn_999 = churn.fillna(value=-999).drop(columns=['Applicant_ID'])

In [8]:
# Per-column lookup tables turning the text categories into integer codes
category_codes = {
    'form_field47': {'charge': 0, 'lending': 1},
    'default_status': {'no': 0, 'yes': 1},
}

# Apply the lookups; values not listed (e.g. the -999 fill value) are left as they are
churn_numerical = churn_999.replace(category_codes)

## Data Modeling

In [9]:
# The first len(churn_train) rows of the combined frame are the training rows,
# the remaining rows are the test set (which carries no real label, so the
# default_status column is removed from it)
n_train = len(churn_train)
train = churn_numerical.iloc[:n_train].copy()
test = churn_numerical.iloc[n_train:].drop(columns=['default_status'])

# Split the training rows into the label vector and the feature matrix
Y_train = train['default_status'].astype(int)
X_train = train.drop(columns=['default_status'])

# Scale features such that the mean is 0 and the standard deviation is 1.
# The scaler is fitted on the training features only and then re-used for the test features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(test)

# Stratified 10-fold splitter shared by the cross-validation calls below
KFold = StratifiedKFold(n_splits=10)

In [10]:
# Configure the CatBoost classifier
catboost_params = dict(
    iterations=695,
    learning_rate=0.02,
    loss_function='Logloss',
    random_seed=42,
    eval_metric='AUC',
    verbose=False,
)
ctb = CatBoostClassifier(**catboost_params)

# Train on the full training set
ctb.fit(X_train, Y_train, verbose=False)

# Cross-validated ROC AUC of the model on the training data (one score per fold)
scores = cross_val_score(ctb, X_train, Y_train, scoring='roc_auc', cv=KFold, n_jobs=4)

print(scores)
# Mean ROC AUC across folds, expressed as a percentage
round(scores.mean() * 100, 2)

[0.83128994 0.84724055 0.84103121 0.83454031 0.84385609 0.84172451
 0.81948471 0.84139714 0.85664103 0.84534659]


84.03

In [11]:
# Configure the LightGBM classifier
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'n_estimators': 3935,
    'learning_rate': 0.01,
    'max_bin': 55,
    'num_leaves': 20,
    'eval_metric': 'auc',
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'feature_fraction': 0.23,
    'feature_fraction_seed': 9,
    'bagging_seed': 9,
    'min_sum_hessian_in_leaf': 11,
    'tree_learner': 'data',
}
lgbm = LGBMClassifier(**lgbm_params)

# Train on the full training set
lgbm.fit(X_train, Y_train)

# Cross-validated ROC AUC of the model on the training data (one score per fold)
scores = cross_val_score(lgbm, X_train, Y_train, scoring='roc_auc', cv=KFold, n_jobs=4)

print(scores)
# Mean ROC AUC across folds, expressed as a percentage
round(scores.mean() * 100, 2)

[0.83418769 0.84631902 0.83910916 0.83522891 0.84235504 0.84219479
 0.81974519 0.84194637 0.85516434 0.84739544]


84.04

In [12]:
ensemble_name = 'Ensemble - Catboost and LBGM'

# Soft-voting ensemble of the two boosted models; the LightGBM member
# is given twice the weight of the CatBoost member
base_models = [('CTB', ctb), ('LGBM', lgbm)]
ensemble_clf = VotingClassifier(estimators=base_models, voting='soft', weights=[1, 2])
trained_model_ensemble = ensemble_clf.fit(X_train, Y_train.to_numpy().ravel())

# Predicted probabilities for the test rows; keep column 1 of predict_proba's output
pred_prob_ensemble = ensemble_clf.predict_proba(X_test)[:, 1]

# Cross-validated ROC AUC of the ensemble on the training data (one score per fold)
scores = cross_val_score(ensemble_clf, X_train, Y_train, scoring='roc_auc', cv=KFold, n_jobs=-1)

print(scores)
# Mean ROC AUC across folds, expressed as a percentage
mean_scores = round(scores.mean() * 100, 3)
mean_scores

[0.8344326  0.84775401 0.84071231 0.83604386 0.8438911  0.84324016
 0.82060541 0.84271196 0.8568329  0.84776521]


84.14

### Save Predictions

In [13]:
# Pair every test applicant ID with the ensemble's predicted probability
submission = pd.DataFrame({
    'Applicant_ID': test_applicantID,
    'default_status': pred_prob_ensemble,
})

# Write the predictions to disk without the row index
submission.to_csv('submission.csv', index=False)

In [14]:
# Preview the first rows of the saved submission (applicant ID + predicted probability)
submission.head()

Unnamed: 0,Applicant_ID,default_status
0,Apcnt_1000032,0.293666
1,Apcnt_1000048,0.348152
2,Apcnt_1000052,0.392349
3,Apcnt_1000076,0.75383
4,Apcnt_1000080,0.14403


###### Author: ADERONMU AYOMIDE