# Aim
To compare the effectiveness of the classification algorithms KNN, Decision Trees, SVM and Logistic Regression on the same data set.

## About the Data
This dataset is about past loans. The models are built using the details of 346 customers whose loans have already been paid off or have defaulted. It includes the following fields:

| Field          | Description                                                                           |
|----------------|---------------------------------------------------------------------------------------|
| Loan_status    | Whether a loan is paid off or in collection                                           |
| Principal      | Basic principal loan amount at origination                                            |
| Terms          | Origination terms which can be weekly (7 days), biweekly, and monthly payoff schedule |
| Effective_date | When the loan was originated and took effect                                          |
| Due_date       | Since it’s a one-time payoff schedule, each loan has a single due date                |
| Age            | Age of applicant                                                                      |
| Education      | Education of applicant                                                                |
| Gender         | The gender of applicant                                                               |

## Exploring the Data

In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
from sklearn import preprocessing
%matplotlib inline

In [None]:
# Load the training data (past loans) into a DataFrame and preview the first rows.
df = pd.read_csv('loan_train.csv')
df.head()

In [None]:
# (number of rows, number of columns) of the training DataFrame.
df.shape

In [None]:
# Parse both date columns into pandas datetimes so the .dt accessors can be used later.
for date_col in ('due_date', 'effective_date'):
    df[date_col] = pd.to_datetime(df[date_col])
df.head()

In [None]:
# Number of loans in each loan_status class.
df['loan_status'].value_counts()

In [None]:
import seaborn as sns

# Histogram of the loan principal: one panel per gender, coloured by loan status.
# 10 equally spaced edges between the min and max principal give 9 bins.
bins = np.linspace(df['Principal'].min(), df['Principal'].max(), 10)
g = sns.FacetGrid(
    df, col="Gender", hue="loan_status", palette="Set1", col_wrap=2
).map(plt.hist, 'Principal', bins=bins, ec="k")

# Put the legend on the last panel only.
g.axes[-1].legend()
plt.show()

In [None]:
# Histogram of applicant age: one panel per gender, coloured by loan status.
bins = np.linspace(df['age'].min(), df['age'].max(), 10)
g = sns.FacetGrid(
    df, col="Gender", hue="loan_status", palette="Set1", col_wrap=2
).map(plt.hist, 'age', bins=bins, ec="k")

# Put the legend on the last panel only.
g.axes[-1].legend()
plt.show()

In [None]:
# Day of the week on which each loan took effect (pandas numbering: Monday=0 ... Sunday=6).
df['dayofweek'] = df['effective_date'].dt.dayofweek

# Histogram of that day of week: one panel per gender, coloured by loan status.
bins = np.linspace(df['dayofweek'].min(), df['dayofweek'].max(), 10)
g = sns.FacetGrid(
    df, col="Gender", hue="loan_status", palette="Set1", col_wrap=2
).map(plt.hist, 'dayofweek', bins=bins, ec="k")
g.axes[-1].legend()
plt.show()


## Processing the Data

In [None]:
# Binary feature: 1 when the loan took effect on a day with dayofweek > 3
# (Friday through Sunday with Monday=0), otherwise 0.
df['weekend'] = (df['dayofweek'] > 3).astype('int64')
df.head()

In [None]:
# Share of each loan_status within every gender group (proportions sum to 1 per group).
df.groupby(['Gender'])['loan_status'].value_counts(normalize=True)

In [None]:
# Encode gender numerically: male -> 0, female -> 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Gender']: that chained in-place call works on
# an intermediate Series and no longer updates df under pandas Copy-on-Write.
# infer_objects() keeps the column numeric on pandas versions where replace()
# no longer downcasts the object column by itself.
df['Gender'] = (
    df['Gender']
    .replace(to_replace=['male', 'female'], value=[0, 1])
    .infer_objects()
)
df.head()

In [None]:
# Share of each loan_status within every education level.
df.groupby(['education'])['loan_status'].value_counts(normalize=True)

In [None]:
# Preview the raw columns that the feature matrix is derived from.
df[['Principal','terms','age','Gender','education']].head()

In [None]:
# Feature matrix: the numeric columns plus one-hot encoded education levels,
# with the 'Master or Above' indicator column left out.
Feature = pd.concat(
    [
        df[['Principal', 'terms', 'age', 'Gender', 'weekend']],
        pd.get_dummies(df['education']),
    ],
    axis=1,
).drop(columns=['Master or Above'])
Feature.head()


In [None]:
# X is the (still unscaled) feature DataFrame; show its first five rows.
X = Feature
X.head(5)

In [None]:
# Target labels as a NumPy array; show the first five.
y = df['loan_status'].values
y[0:5]

In [None]:
# Standardise every feature column (zero mean, unit variance); X becomes a NumPy array.
X = preprocessing.StandardScaler().fit_transform(X)
X[:5]

In [None]:
# Hold out 20% of the training data (fixed seed) so hyper-parameters such as k can be compared.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)
for label, features, labels in (
    ('Train set:', X_train, y_train),
    ('Test set:', X_test, y_test),
):
    print(label, features.shape, labels.shape)

# Building the Models

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# First KNN model: k = 3 neighbours, fitted on the training split.
k = 3
kNN_model = KNeighborsClassifier(n_neighbors=k)
kNN_model.fit(X_train, y_train)
kNN_model

In [None]:
# Sanity check: predict on the held-out split and look at the first five predictions.
yhat = kNN_model.predict(X_test)
yhat[0:5]

In [None]:
# Sweep k = 1 .. Ks-1 and record the held-out accuracy obtained for each k.
Ks = 15
mean_acc = np.zeros(Ks - 1)
std_acc = np.zeros(Ks - 1)
ConfustionMx = []
for n in range(1, Ks):
    # Fit on the training split and predict the held-out split.
    kNN_model = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    yhat = kNN_model.predict(X_test)

    # Boolean mask of correct predictions; its mean is the accuracy.
    hits = yhat == y_test
    mean_acc[n - 1] = hits.mean()
    # Standard deviation of the mask divided by sqrt(sample count).
    std_acc[n - 1] = hits.std() / np.sqrt(hits.shape[0])
mean_acc

In [None]:
# Building the model again, using k=7.
# NOTE(review): k=7 is presumably the best value from the mean_acc sweep — confirm.
k = 7
# Fit on the full scaled training data (X, y), not only the training split.
kNN_model = KNeighborsClassifier(n_neighbors=k).fit(X,y)
kNN_model

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# First decision tree: entropy criterion, depth limited to 4, fitted on the training split.
DT_model = DecisionTreeClassifier(criterion="entropy", max_depth=4).fit(X_train, y_train)
DT_model

In [None]:
# Try both splitter strategies at depths 3-5 and report the held-out hit count for each.
for split, depth in itertools.product(["best", "random"], range(3, 6)):
    DT_model = DecisionTreeClassifier(criterion='entropy', splitter=split, max_depth=depth)
    DT_model.fit(X_train, y_train)
    yhat = DT_model.predict(X_test)
    n_correct = (y_test == yhat).sum()
    print('For split =', split, 'and depth =', depth, n_correct, 'out of', len(y_test), 'are correct')

In [None]:
# Final decision tree, fitted on the full scaled training data (X, y).
# splitter='random' draws random split thresholds, so the generator is seeded
# (same seed as the train/test split) to make the fitted tree, and therefore the
# reported metrics, reproducible from run to run.
DT_model = DecisionTreeClassifier(
    criterion='entropy', splitter='random', max_depth=4, random_state=4
).fit(X, y)

## SVM

In [None]:
from sklearn import svm

# First support vector classifier with default hyper-parameters, fitted on the training split.
SVM_model = svm.SVC().fit(X_train, y_train)
SVM_model

In [None]:
# Compare a few values of the regularisation parameter C on the held-out split.
for c in [0.01, 0.1, 1]:
    SVM_model = svm.SVC(C=c)
    SVM_model.fit(X_train, y_train)
    yhat = SVM_model.predict(X_test)
    n_correct = (y_test == yhat).sum()
    print('For C =', c, n_correct, 'out of', len(y_test), 'are correct')

In [None]:
# Final SVM with C=1, fitted on the full scaled training data (X, y).
SVM_model = svm.SVC(C=1).fit(X,y)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# First logistic regression model with strong regularisation (C=0.01), fitted on the training split.
LR_model = LogisticRegression(C=0.01)
LR_model.fit(X_train, y_train)
LR_model

In [None]:
# Compare a few values of the inverse regularisation strength C on the held-out split.
for c in [0.01, 0.1, 1]:
    LR_model = LogisticRegression(C=c)
    LR_model.fit(X_train, y_train)
    yhat = LR_model.predict(X_test)
    n_correct = (y_test == yhat).sum()
    print('For C =', c, n_correct, 'out of', len(y_test), 'are correct')

In [None]:
# Final logistic regression with C=1, fitted on the full scaled training data (X, y).
LR_model = LogisticRegression(C=1).fit(X,y)
# Author's note on why C=1 was chosen: C = 0.01 and 0.1 predict only 'Payoff'

# Model Evaluation

In [None]:
# jaccard_similarity_score was removed in scikit-learn 0.23. For single-label
# (binary / multiclass) targets such as loan_status it was equivalent to
# accuracy_score, so fall back to that under the old name: the evaluation cells
# keep working unchanged and report the same quantity as before.
try:
    from sklearn.metrics import jaccard_similarity_score
except ImportError:
    from sklearn.metrics import accuracy_score as jaccard_similarity_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

In [None]:
# Load the separate evaluation data set and preview the first rows.
test_df = pd.read_csv('loan_test.csv')
test_df.head()

In [None]:
## Preprocessing
# Apply the same feature engineering to the test data as was applied to the training data.
test_df['due_date'] = pd.to_datetime(test_df['due_date'])
test_df['effective_date'] = pd.to_datetime(test_df['effective_date'])
test_df['dayofweek'] = test_df['effective_date'].dt.dayofweek
# Weekend flag: dayofweek > 3 (Friday through Sunday with Monday=0).
test_df['weekend'] = test_df['dayofweek'].apply(lambda x: 1 if (x>3)  else 0)
# Assign the encoded column back instead of using a chained inplace replace,
# which does not update test_df under pandas Copy-on-Write.
test_df['Gender'] = (
    test_df['Gender']
    .replace(to_replace=['male', 'female'], value=[0, 1])
    .infer_objects()
)

test_Feature = test_df[['Principal','terms','age','Gender','weekend']]
test_Feature = pd.concat([test_Feature,pd.get_dummies(test_df['education'])], axis=1)
# Align with the training feature columns: same columns in the same order.
# This leaves out 'Master or Above' (dropped from Feature) and zero-fills any
# education level that does not occur in the test data.
test_Feature = test_Feature.reindex(columns=Feature.columns, fill_value=0)

# Standardise with the mean and variance of the TRAINING features, so the test
# data goes through the same transformation the models were fitted on.
# Fitting a new scaler on the test set would shift and scale it differently.
test_X = preprocessing.StandardScaler().fit(Feature).transform(test_Feature)
test_X[0:5]

In [None]:
# True labels of the test set as a NumPy array; show the first five.
test_y = test_df['loan_status'].values
test_y[0:5]

#### KNN

In [None]:
# Score the final KNN model on the test set.
knn_yhat = kNN_model.predict(test_X)
knn_jaccard = jaccard_similarity_score(test_y, knn_yhat)
knn_f1 = f1_score(test_y, knn_yhat, average='weighted')
print("KNN Jaccard index: %.2f" % knn_jaccard)
print("KNN F1-score: %.2f" % knn_f1)

#### Decision Tree

In [None]:
# Score the final decision tree on the test set.
DT_yhat = DT_model.predict(test_X)
dt_jaccard = jaccard_similarity_score(test_y, DT_yhat)
dt_f1 = f1_score(test_y, DT_yhat, average='weighted')
print("DT Jaccard index: %.2f" % dt_jaccard)
print("DT F1-score: %.2f" % dt_f1)

#### SVM

In [None]:
# Score the final SVM on the test set.
SVM_yhat = SVM_model.predict(test_X)
svm_jaccard = jaccard_similarity_score(test_y, SVM_yhat)
svm_f1 = f1_score(test_y, SVM_yhat, average='weighted')
print("SVM Jaccard index: %.2f" % svm_jaccard)
print("SVM F1-score: %.2f" % svm_f1)

#### Logistic Regression

In [None]:
# Score the final logistic regression on the test set. Class probabilities are
# also computed because log loss is evaluated on probabilities, not on labels.
LR_yhat = LR_model.predict(test_X)
LR_yhat_prob = LR_model.predict_proba(test_X)
lr_jaccard = jaccard_similarity_score(test_y, LR_yhat)
lr_f1 = f1_score(test_y, LR_yhat, average='weighted')
lr_logloss = log_loss(test_y, LR_yhat_prob)
print("LR Jaccard index: %.2f" % lr_jaccard)
print("LR F1-score: %.2f" % lr_f1)
print("LR LogLoss: %.2f" % lr_logloss)

## Report

In [None]:
# Collect each model's test-set predictions and tabulate the evaluation metrics.
models = {'KNN': knn_yhat, 'Decision Tree': DT_yhat, 'SVM': SVM_yhat, 'Logistic Regression': LR_yhat}
Jacc = []
F1 = []
Logloss = []
for name, predictions in models.items():
    Jacc.append(jaccard_similarity_score(test_y, predictions))
    F1.append(f1_score(test_y, predictions, average='weighted'))
    # Log loss needs class probabilities, which were only computed for
    # logistic regression; the other models get a missing value.
    if name == 'Logistic Regression':
        Logloss.append(log_loss(test_y, LR_yhat_prob))
    else:
        # np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.
        Logloss.append(np.nan)
report_df = pd.DataFrame(
    {'Jaccard': Jacc, 'F1-score': F1, 'LogLoss': Logloss},
    index=list(models),
)
report_df.index.name = 'Algorithm'
report_df

For the given data set, *KNN* appears to be the best classifier by F1-score (0.70), although Logistic Regression achieves the highest Jaccard index (0.76).

| Algorithm          | Jaccard| F1-score | LogLoss |
|--------------------|--------|----------|---------|
| KNN                | 0.72   | 0.70     | NA      |
| Decision Tree      | 0.68   | 0.65     | NA      |
| SVM                | 0.72   | 0.62     | NA      |
| LogisticRegression | 0.76   | 0.67     | 0.48    |
