# Comparing classification performance

In this notebook, we use the loan example data set and evaluate several classification algorithms.

In [1]:
# general purpose libraries and tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

# models
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import neighbors
# Import from the public package path: the private
# sklearn.neighbors.nearest_centroid module was deprecated in scikit-learn 0.22
# and removed in 0.24, so the old path fails on current installs.
from sklearn.neighbors import NearestCentroid
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# stat libraries
from scipy import stats

# Libraries for the evaluation
from sklearn import model_selection
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

%matplotlib inline

First we load the dataset

In [3]:
# Load the loans CSV (path is relative to the notebook) into a DataFrame.
loans = pd.read_csv(filepath_or_buffer='./data/loans-numerical.csv')

Unnamed: 0,sub_grade_num,short_emp,emp_length_num,dti,payment_inc_ratio,delinq_2yrs,delinq_2yrs_zero,inq_last_6mths,last_delinq_none,last_major_derog_none,...,int_rate,total_rec_int,annual_inc,funded_amnt,funded_amnt_inv,installment,num_term,grade_num,loan_amnt,safe_loans
0,0.4,0,11,27.65,8.1435,0.0,1.0,1.0,1,1,...,10.65,861.07,24000.0,5000,4975,162.87,36,5,5000,1
1,0.8,1,1,1.0,2.3932,0.0,1.0,5.0,1,1,...,15.27,435.17,30000.0,2500,2500,59.83,60,4,2500,-1
2,1.0,0,11,8.72,8.25955,0.0,1.0,2.0,1,1,...,15.96,603.65,12252.0,2400,2400,84.33,36,4,2400,1
3,0.2,0,11,20.0,8.27585,0.0,1.0,1.0,0,1,...,13.49,2209.33,49200.0,10000,10000,339.31,36,4,10000,1
4,0.8,0,4,11.2,5.21533,0.0,1.0,3.0,1,1,...,7.9,631.38,36000.0,5000,5000,156.46,36,6,5000,1


In [None]:
# Name of the label column; every other column is used as a predictor.
target = 'safe_loans'

# Boolean mask over the column index: True for every column except the label.
is_feature = loans.columns != target
features = loans.columns[is_feature]

# Design matrix and label vector shared by the model cells.
x, y = loans[features], loans[target]

First we apply logistic regression without regularization ($\alpha$ is zero). The Scikit-learn class does not let us specify $\alpha$ directly; instead it takes a parameter C=1/$\alpha$. Accordingly, to have effectively no regularization we need to specify a huge value of C. We evaluate the model using stratified 10-fold cross-validation.

In [None]:
# Ten stratified, shuffled folds; the fixed seed makes the split reproducible.
lr_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=52725)

# C is the inverse regularization strength, so a huge C means a negligible penalty.
simple_logistic = linear_model.LogisticRegression(C=10e10, random_state=0)
simple_eval = model_selection.cross_val_score(simple_logistic, x, y, cv=lr_cv)

# Report mean and standard deviation of the per-fold scores.
print("Simple Logistic Regression\t%4.3f\t%4.3f" % (simple_eval.mean(), simple_eval.std()))

In [None]:
# Ten stratified, shuffled folds; the fixed seed makes the split reproducible.
nb_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=52725)

# Gaussian naive Bayes with default settings.
nb = naive_bayes.GaussianNB()
nb_eval = model_selection.cross_val_score(nb, x, y, cv=nb_cv)

# Report mean and standard deviation of the per-fold scores.
print("Naive Bayes\t%4.3f\t%4.3f" % (nb_eval.mean(), nb_eval.std()))

In [None]:
# Ten stratified, shuffled folds; the fixed seed makes the split reproducible.
knn_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=52725)

# 20 nearest neighbours, with closer neighbours weighted more heavily.
knn = neighbors.KNeighborsClassifier(n_neighbors=20, weights='distance')
knn_eval = model_selection.cross_val_score(knn, x, y, cv=knn_cv)

# Report mean and standard deviation of the per-fold scores.
print("k Nearest Neighbor\t%4.3f\t%4.3f" % (knn_eval.mean(), knn_eval.std()))

In [None]:
# Ten stratified, shuffled folds; the fixed seed makes the split reproducible.
rf_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=52725)

# Forest of 40 fully grown trees; random_state pins the bootstrap/feature sampling.
rf = RandomForestClassifier(
    n_estimators=40,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)
rf_eval = model_selection.cross_val_score(rf, x, y, cv=rf_cv)

# Report mean and standard deviation of the per-fold scores.
print("Random Forest\t%4.3f\t%4.3f" % (rf_eval.mean(), rf_eval.std()))

# Comparing Two Models using Unpaired t-test

Although all the models have been evaluated with cross-validation, in general we cannot assume that the folds are the same. For instance, we don't know whether the first element in each evaluation array was measured on the same fold for all the algorithms. Accordingly, we need to apply an unpaired t-test.
Let's compare the performance of Logistic Regression vs Random Forests, and of Logistic Regression vs Naive Bayes.

In [None]:
def PrintSignificance(stat, c):
    """Print whether a test result is significant at confidence level ``c``.

    Parameters
    ----------
    stat : sequence
        Test result whose element at index 1 is the p-value (for example the
        value returned by ``scipy.stats.ttest_ind``).
    c : float
        Confidence level, e.g. 0.95. The result is reported as significant
        when the p-value is strictly below ``1 - c``.
    """
    template = (
        "The difference is statistically significant (cf %3.2f)"
        if stat[1] < (1 - c)
        else "The difference is not statistically significant (cf %3.2f)"
    )
    print(template % c)
        
# Independent-samples t-tests on the per-fold cross-validation scores.
unpaired_lr_rf = stats.ttest_ind(simple_eval, rf_eval)
unpaired_lr_nb = stats.ttest_ind(simple_eval, nb_eval)

# Report each comparison: the p-value first, then the verdict at 95% confidence.
for report_fmt, result in (
    ("Logistic Regression vs Random Forests: p-value = %4.3f", unpaired_lr_rf),
    ("Logistic Regression vs Naive Bayes: p-value = %4.3f", unpaired_lr_nb),
):
    print(report_fmt % result[1])
    PrintSignificance(result, 0.95)
    print("\n")

# Comparing Two Models using Paired t-Test

We compare k-nearest-neighbor and random forests using the same folds.

In [None]:
# Evaluate k-NN and random forest on exactly the same train/test folds so the
# per-fold accuracies form matched pairs, as a paired t-test requires.
tf = model_selection.KFold(n_splits=10)

knn_scores = []
rf_scores = []

for train, test in tf.split(loans):
    # KFold.split yields positional row indices, so select rows with .iloc;
    # .loc would only agree with it when the frame has a default 0..n-1 index.
    l = loans.iloc[train]
    train_x = l[features]
    train_y = l[target]

    l_test = loans.iloc[test]
    test_x = l_test[features]
    test_y = l_test[target]

    # Fresh estimators for every fold so nothing carries over between folds.
    knn = neighbors.KNeighborsClassifier(20, weights='distance')
    knn = knn.fit(train_x, train_y)
    acc_knn = accuracy_score(test_y, knn.predict(test_x))

    rf = RandomForestClassifier(n_estimators=40, max_depth=None, min_samples_split=2, random_state=0)
    rf = rf.fit(train_x, train_y)
    acc_rf = accuracy_score(test_y, rf.predict(test_x))

    # Position i in both lists refers to the same fold.
    knn_scores.append(acc_knn)
    rf_scores.append(acc_rf)

# Mean and standard deviation of the per-fold accuracies.
print("k Nearest Neighbor\t%4.3f\t%4.3f" % (np.average(knn_scores), np.std(knn_scores)))
print("Random Forest     \t%4.3f\t%4.3f" % (np.average(rf_scores), np.std(rf_scores)))

In [None]:
# Paired t-test: the i-th entries of both score lists come from the same fold.
paired_test = stats.ttest_rel(knn_scores, rf_scores)
paired_p_value = paired_test[1]

print("k-nn vs Random Forests: p-value = %4.3f" % paired_p_value)
PrintSignificance(paired_test, 0.95)

In [None]:
# Receiver Operating Characteristic (ROC) Curves

We now plot the ROC curves for Logistic Regression, Naive Bayes, and Random Forest.

In [None]:
# Hold out a third of the data; the fixed seed keeps the split reproducible.
train_x, test_x, train_y, test_y = model_selection.train_test_split(x, y, test_size=0.33, random_state=42)

# Fit each classifier on the training split and report its hold-out accuracy.
# fit() returns the estimator itself, so its result is deliberately not stored:
# binding it to simple_eval / nb_eval / rf_eval would overwrite the
# cross-validation score arrays that the t-test cells read, and re-running
# those cells after this one would then fail.
simple_logistic = linear_model.LogisticRegression(C=10e10, random_state=0)
simple_logistic.fit(train_x, train_y)
lr_pred = simple_logistic.predict(test_x)
lr_acc = accuracy_score(y_true=test_y, y_pred=lr_pred)
print("Logistic Regression\t%4.3f" % lr_acc)

nb = naive_bayes.GaussianNB()
nb.fit(train_x, train_y)
nb_pred = nb.predict(test_x)
nb_acc = accuracy_score(y_true=test_y, y_pred=nb_pred)
print("Naive Bayes        \t%4.3f" % nb_acc)

rf = RandomForestClassifier(n_estimators=40, max_depth=None, min_samples_split=2, random_state=0)
rf.fit(train_x, train_y)
rf_pred = rf.predict(test_x)
rf_acc = accuracy_score(y_true=test_y, y_pred=rf_pred)
print("Random Forest      \t%4.3f" % rf_acc)

# Class-membership probabilities on the hold-out set. Column 1 corresponds to
# classes_[1]; with the -1/1 label encoding shown in the data preview that is
# the positive class 1 (re-check if the label encoding ever changes).
lr_prob = simple_logistic.predict_proba(test_x)
nb_prob = nb.predict_proba(test_x)
rf_prob = rf.predict_proba(test_x)

# ROC curve points (false/true positive rate per threshold) for each model.
fpr, tpr, thresholds = roc_curve(y_true=test_y, y_score=lr_prob[:, 1], pos_label=1)
fpr_nb, tpr_nb, thresholds_nb = roc_curve(y_true=test_y, y_score=nb_prob[:, 1], pos_label=1)
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_true=test_y, y_score=rf_prob[:, 1], pos_label=1)

# Area under each ROC curve, shown in the legend.
lr_roc_auc = roc_auc_score(y_true=test_y, y_score=lr_prob[:, 1])
nb_roc_auc = roc_auc_score(y_true=test_y, y_score=nb_prob[:, 1])
rf_roc_auc = roc_auc_score(y_true=test_y, y_score=rf_prob[:, 1])

# One square figure with the three curves overlaid.
plt.figure(1, figsize=(8, 8));
font = {'family':'sans', 'size':24};
plt.rc('font', **font);
plt.xlabel('FPR');
plt.ylabel('TPR');
plt.plot(fpr, tpr, label='Logistic Regression (%3.2f)' % lr_roc_auc)
plt.plot(fpr_nb, tpr_nb, label='Naive Bayes (%3.2f)' % nb_roc_auc)
plt.plot(fpr_rf, tpr_rf, label='Random Forest (%3.2f)' % rf_roc_auc)
plt.yticks(np.arange(0.0, 1.01, .2))
plt.legend()
plt.show();

So while accuracy is quite different (almost four percentage points), the AUC for Logistic Regression and Naive Bayes is quite similar. This is not surprising, as the two metrics measure different aspects of performance. The ROC curve shows that the two approaches behave similarly when it comes to TPR and FPR. Random forests perform better in both accuracy and AUC.