In [54]:
import warnings
warnings.filterwarnings('ignore')

# Data processing and manipulation
import numpy as np
import pandas as pd

# Data visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as plb
import seaborn as sns

# Configure visualization
%matplotlib inline
plb.rcParams['figure.figsize'] = 10, 8
mpl.style.use('ggplot')
sns.set_style('white')

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Accuracy Metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
full = pd.concat([train,test], axis = 0)
print(full.describe(include = [int, float, object]))

        ApplicantIncome  CoapplicantIncome  Credit_History Dependents  \
count        981.000000         981.000000      902.000000        956   
unique              NaN                NaN             NaN          4   
top                 NaN                NaN             NaN          0   
freq                NaN                NaN             NaN        545   
mean        5179.795107        1601.916330        0.835920        NaN   
std         5695.104533        2718.772806        0.370553        NaN   
min            0.000000           0.000000        0.000000        NaN   
25%         2875.000000           0.000000        1.000000        NaN   
50%         3800.000000        1110.000000        1.000000        NaN   
75%         5516.000000        2365.000000        1.000000        NaN   
max        81000.000000       41667.000000        1.000000        NaN   

       Education Gender  LoanAmount  Loan_Amount_Term   Loan_ID Loan_Status  \
count        981    957  954.000000        9

In [3]:
full.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Education,Gender,LoanAmount,Loan_Amount_Term,Loan_ID,Loan_Status,Married,Property_Area,Self_Employed
0,5849,0.0,1.0,0,Graduate,Male,,360.0,LP001002,Y,No,Urban,No
1,4583,1508.0,1.0,1,Graduate,Male,128.0,360.0,LP001003,N,Yes,Rural,No
2,3000,0.0,1.0,0,Graduate,Male,66.0,360.0,LP001005,Y,Yes,Urban,Yes
3,2583,2358.0,1.0,0,Not Graduate,Male,120.0,360.0,LP001006,Y,Yes,Urban,No
4,6000,0.0,1.0,0,Graduate,Male,141.0,360.0,LP001008,Y,No,Urban,No


In [4]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB
None


In [5]:
full.columns

Index(['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents',
       'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Loan_ID',
       'Loan_Status', 'Married', 'Property_Area', 'Self_Employed'],
      dtype='object')

##### Imputing the quantitative variables with median values.

In [6]:
full.Loan_Amount_Term.fillna(full.Loan_Amount_Term.median(), inplace = True)
full.LoanAmount.fillna(full.LoanAmount.median(), inplace = True)

##### Imputing the categorical variables with most common response

In [7]:
full.Gender.fillna('Male', inplace = True)
full.Dependents.fillna('0', inplace = True)
full.Credit_History.fillna(1, inplace = True)
full.Self_Employed.fillna('No', inplace = True)
full.Married.fillna('Yes', inplace = True)
train = full.iloc[:614,:]
test = full.iloc[614:, :]logistic

## Feature Engineering

In [8]:
full['TotalIncome'] = full.ApplicantIncome + full.CoapplicantIncome
full['IncomeDebtRatio'] = full.TotalIncome / full.LoanAmount
train = full.iloc[:614,:]
test = full.iloc[614:, :]
full.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Education,Gender,LoanAmount,Loan_Amount_Term,Loan_ID,Loan_Status,Married,Property_Area,Self_Employed,TotalIncome,IncomeDebtRatio
0,5849,0.0,1.0,0,Graduate,Male,126.0,360.0,LP001002,Y,No,Urban,No,5849.0,46.420635
1,4583,1508.0,1.0,1,Graduate,Male,128.0,360.0,LP001003,N,Yes,Rural,No,6091.0,47.585938
2,3000,0.0,1.0,0,Graduate,Male,66.0,360.0,LP001005,Y,Yes,Urban,Yes,3000.0,45.454545
3,2583,2358.0,1.0,0,Not Graduate,Male,120.0,360.0,LP001006,Y,Yes,Urban,No,4941.0,41.175
4,6000,0.0,1.0,0,Graduate,Male,141.0,360.0,LP001008,Y,No,Urban,No,6000.0,42.553191


### Manually encoding the nominal and ordinal variables.

In [103]:
features = ['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents',
       'Education', 'LoanAmount', 'Loan_Amount_Term', 
       'Married', 'Property_Area', 
       'TotalIncome', 'IncomeDebtRatio']
response = ['Loan_Status']
X = pd.get_dummies(train[features])
y = train[response]

In [105]:
def model_classifier(model,X, y):
    #Fitting the model with training data
    model.fit(X, y)
    
    #Calculating the Train Accuracy:
    train_predict = model.predict(X)
    print('Train Accurcay Score : ', accuracy_score(y, train_predict))
    
    #Calculating the cross validation score
    kfold = KFold(n_splits = 4)
    scores = []
    
    # Performing K Fold Cross validation test.
    for train_index, test_index in kfold.split(X):
        model.fit(X.iloc[train_index,:], y.iloc[train_index,:])
        prediction = model.predict(X.iloc[test_index, :])
        scores.append(accuracy_score(y.iloc[test_index, :], prediction))
    
    #Printing the K Fold validation score
    print('\nKFold Score : ', np.mean(scores))
    
    # Since this is a classification problem 
    # Printing the confusion matrix
    conf_mat = confusion_matrix(y.iloc[test_index,:], prediction)
    print('\nConfusion Matrix')
    print(conf_mat)
    print('\nProportioanl confusion_matrix\n')
    print(conf_mat/conf_mat[:].sum())    
    
    # Print the ROC-AUC score
    # roc_auc_score(y.iloc[test_index,:], prediction)
    

### Trying different Classification Models.

#### Random Forest Classifier

In [106]:
randomforest = RandomForestClassifier()
model_classifier(randomforest, X, y)

Train Accurcay Score :  0.990228013029316

KFold Score :  0.767156862745098

Confusion Matrix
[[28 20]
 [14 91]]

Proportioanl confusion_matrix

[[0.18300654 0.13071895]
 [0.09150327 0.59477124]]


#### Logistic Regression

In [67]:
logistic = LogisticRegression()
model_classifier(logistic, X, y)

Train Accurcay Score :  0.8110749185667753

KFold Score :  0.7997623291740938

Confusion Matrix
[[ 25  23]
 [  4 101]]


#### Support Vector Classifier

In [68]:
svc = SVC()
model_classifier(svc, X, y)

Train Accurcay Score :  1.0

KFold Score :  0.69058229352347

Confusion Matrix
[[  1  47]
 [  0 105]]


#### Decision Tree Classifier

In [69]:
decisiontree = DecisionTreeClassifier()
model_classifier(decisiontree, X, y)

Train Accurcay Score :  1.0

KFold Score :  0.685733384262796

Confusion Matrix
[[27 21]
 [26 79]]


#### Naive Bayes

In [70]:
bayes = GaussianNB()
model_classifier(bayes, X, y)

Train Accurcay Score :  0.7947882736156352

KFold Score :  0.7899477973007385

Confusion Matrix
[[27 21]
 [ 8 97]]


#### K Nearest Neighbors

In [71]:
kneighbor = KNeighborsClassifier(n_neighbors= 17)
model_classifier(kneighbor, X, y)

Train Accurcay Score :  0.6954397394136808

KFold Score :  0.6661785926491809

Confusion Matrix
[[  5  43]
 [  5 100]]


#### Gradient Boosting Classifier

In [72]:
gbclassifier = GradientBoostingClassifier()
model_classifier(gbclassifier, X, y)

Train Accurcay Score :  0.8990228013029316

KFold Score :  0.7964837450131568

Confusion Matrix
[[26 22]
 [ 9 96]]


#### Adaptive Boosting Classifier

In [73]:
abclassifier = AdaBoostClassifier()
model_classifier(abclassifier, X, y)

Train Accurcay Score :  0.8648208469055375

KFold Score :  0.7980540701128935

Confusion Matrix
[[23 25]
 [ 9 96]]


#### XGBoost Classifier

In [92]:
xgb = XGBClassifier(objective= 'binary:logistic', eval_metric = 'auc', scale_pos_weight=0.35, early_stopping_rounds=100)
model_classifier(xgb, X, y)

Train Accurcay Score :  0.8794788273615635

KFold Score :  0.7426682794329853

Confusion Matrix
[[31 17]
 [23 82]]
