In [1]:
#Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import scale
from sklearn.ensemble import ExtraTreesClassifier
import json
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Read the input CSV data set
data_csv = "data/train.csv"
data = pd.read_csv(data_csv)

#Print the data set
data.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default
0,420825,50578,58400,89.55,67,22807,45,1441,01-01-84,Salaried,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,31-07-85,Self employed,...,0,0,1991,0,0,1,1yrs 11mon,1yrs 11mon,0,1
2,417566,53278,61360,89.63,67,22807,45,1497,24-08-85,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,30-12-93,Self employed,...,0,0,31,0,0,0,0yrs 8mon,1yrs 3mon,1,1
4,539055,52378,60300,88.39,67,22807,45,1495,09-12-77,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,1,1


In [3]:
data['Date.of.Birth'] = pd.to_datetime(data['Date.of.Birth'])
data['DisbursalDate'] = pd.to_datetime(data['DisbursalDate'])
data['AgeAtDisbursal'] = (data['DisbursalDate'] - data['Date.of.Birth']).astype('<m8[Y]')

X = pd.get_dummies(data["Employment.Type"], prefix="Employment.Type", drop_first=False)
data = pd.concat([data, X], axis=1)

index = 0
prev = 0
for i in range(10, 901, 20):
    data.loc[(data['PERFORM_CNS.SCORE'] < i) & (data['PERFORM_CNS.SCORE'] >= prev), 'PERFORM_CNS.SCORE'] = index
    index = index + 1
    prev = i

X = pd.get_dummies(data["PERFORM_CNS.SCORE"], prefix="PERFORM_CNS.SCORE", drop_first=False)
data = pd.concat([data, X], axis=1)

redundant_fields = ["supplier_id", "UniqueID", "Employment.Type", "Date.of.Birth", "DisbursalDate", "PERFORM_CNS.SCORE", "PERFORM_CNS.SCORE.DESCRIPTION"]
data = data.drop(redundant_fields, axis=1)

data['AVERAGE.ACCT.AGE'] = data['AVERAGE.ACCT.AGE'].map(lambda x: x.lstrip("").rstrip("mon"))
test1 = pd.DataFrame([line.split("yrs ", 1) for line in data['AVERAGE.ACCT.AGE']], columns=['Year', 'Month']).astype(int)
data['AVERAGE.ACCT.AGE'] = (test1['Year'] + test1['Month']/12)

data['CREDIT.HISTORY.LENGTH'] = data['CREDIT.HISTORY.LENGTH'].map(lambda x: x.lstrip("").rstrip("mon"))
test1 = pd.DataFrame([line.split("yrs ", 1) for line in data['CREDIT.HISTORY.LENGTH']], columns=['Year', 'Month']).astype(int)
data['CREDIT.HISTORY.LENGTH'] = (test1['Year'] + test1['Month']/12)

data.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,manufacturer_id,Current_pincode_ID,State_ID,Employee_code_ID,MobileNo_Avl_Flag,Aadhar_flag,...,PERFORM_CNS.SCORE_36,PERFORM_CNS.SCORE_37,PERFORM_CNS.SCORE_38,PERFORM_CNS.SCORE_39,PERFORM_CNS.SCORE_40,PERFORM_CNS.SCORE_41,PERFORM_CNS.SCORE_42,PERFORM_CNS.SCORE_43,PERFORM_CNS.SCORE_44,PERFORM_CNS.SCORE_890
0,50578,58400,89.55,67,45,1441,6,1998,1,1,...,0,0,0,0,0,0,0,0,0,0
1,47145,65550,73.23,67,45,1502,6,1998,1,1,...,0,0,0,0,0,0,0,0,0,0
2,53278,61360,89.63,67,45,1497,6,1998,1,1,...,0,0,0,0,0,0,0,0,0,0
3,57513,66113,88.48,67,45,1501,6,1998,1,1,...,0,0,0,0,0,0,0,0,0,0
4,52378,60300,88.39,67,45,1495,6,1998,1,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X = data.drop('loan_default', 1)
y = data['loan_default']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

In [11]:
## Import the random forest model.
from sklearn.ensemble import RandomForestClassifier 
## This line instantiates the model. 
rf = RandomForestClassifier() 
## Fit the model on your training data.
rf.fit(X_train, y_train) 
## And score it on your testing data.

y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

#Applying 10-fold cross validation
accuracies = cross_val_score(estimator=rf, X=X_train, y=y_train, cv=10)
print("accuracy (10-fold): ", np.mean(accuracies))



[[52900  2010]
 [14009  1028]]
accuracy (10-fold):  0.7683248983206776


In [7]:
accuracies

array([0.7681044 , 0.76969734, 0.76798186, 0.76992831, 0.76905637,
       0.77003676, 0.77064951, 0.76875   , 0.76654412, 0.77095588])

In [8]:
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

In [9]:
feature_importances

Unnamed: 0,importance
ltv,0.113977
disbursed_amount,0.106778
asset_cost,0.103977
Current_pincode_ID,0.099057
Employee_code_ID,0.091629
AgeAtDisbursal,0.085882
branch_id,0.047439
State_ID,0.028534
CREDIT.HISTORY.LENGTH,0.026676
PRI.CURRENT.BALANCE,0.025198
