# Supervised Learning: Linear Classifiers

In [5]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

## Datasets

In [6]:
# bank disease data dataset for classification
df = pd.read_csv('creditcard.csv')
print(df.shape)
df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
# Minor preprocessing
df['Time'] = df['Time'] / 3600 % 24

In [8]:
df['Class'].value_counts(normalize=True)

0    0.998273
1    0.001727
Name: Class, dtype: float64

In [9]:
X_bank= df[df.columns[df.columns!= 'Class']]
y_bank= df['Class']

print(X_bank.shape)
print(y_bank.shape)

(284807, 30)
(284807,)


### Example based on SVC classifier 

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

X_bank_train, X_bank_test, y_bank_train, y_bank_test = train_test_split(X_bank, y_bank, random_state=0)

svm_clf = Pipeline([
        ("scaler", MinMaxScaler()),
        ("linear_svc", SVC()),
    ])

cv_scores = cross_val_score(svm_clf, X_bank_train, y_bank_train)
print('Cross-validation scores (3-fold):', cv_scores)
print('Mean cross-validation score (3-fold): {:.3f}'
     .format(np.mean(cv_scores)))

NameError: name 'cross_val_score' is not defined

### Confusion matrices

In [11]:
from sklearn.metrics import confusion_matrix

clf= svm_clf.fit(X_bank_train, y_bank_train)
y_predicted= clf.predict(X_bank_test)

confusion = confusion_matrix(y_bank_test, y_predicted)

print('Confusion matrix\n', confusion)
# first line True negative; second line true positive
# first column Predicted negative; second column predicted positive

Confusion matrix
 [[71072    10]
 [   49    71]]


### Evaluation metrics for binary classification

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Accuracy = TP + TN / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN)  Also known as sensitivity, or True Positive Rate
# F1 = 2 * Precision * Recall / (Precision + Recall) 
print('Accuracy: {:.2f}'.format(accuracy_score(y_bank_test, y_predicted)))
print('Precision: {:.2f}'.format(precision_score(y_bank_test, y_predicted)))
print('Recall: {:.2f}'.format(recall_score(y_bank_test, y_predicted)))
print('F1: {:.2f}'.format(f1_score(y_bank_test, y_predicted)))

Accuracy: 1.00
Precision: 0.88
Recall: 0.59
F1: 0.71


In [None]:
# Combined report with all above metrics
from sklearn.metrics import classification_report

print(classification_report(y_bank_test, y_predicted, target_names=['not 1', '1']))

              precision    recall  f1-score   support

       not 1       1.00      1.00      1.00     71082
           1       0.88      0.59      0.71       120

   micro avg       1.00      1.00      1.00     71202
   macro avg       0.94      0.80      0.85     71202
weighted avg       1.00      1.00      1.00     71202



### Decision functions

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression().fit(X_bank_train, y_bank_train)
lr_predicted = lr.predict(X_bank_test)
confusion = confusion_matrix(y_bank_test, lr_predicted)
print('Logistic regression classifier (default settings)\n', confusion)

y_scores_lr = lr.fit(X_bank_train, y_bank_train).decision_function(X_bank_test)

# show the decision_function scores for first 20 instances
y_score_list = list(zip(y_bank_test[0:20], y_scores_lr[0:20]))
y_score_list

Logistic regression classifier (default settings)
 [[71071    11]
 [   41    79]]


In [None]:
lr.decision_function

### Precision-recall curves

In [None]:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_bank_test, y_scores_lr)
closest_zero = np.argmin(np.abs(thresholds))
closest_zero_p = precision[closest_zero]
closest_zero_r = recall[closest_zero]

plt.figure()
plt.xlim([0.0, 1.01])
plt.ylim([0.0, 1.01])
plt.plot(precision, recall, label='Precision-Recall Curve')
plt.plot(closest_zero_p, closest_zero_r, 'o', markersize = 12, fillstyle = 'none', c='r', mew=3)
plt.xlabel('Precision', fontsize=16)
plt.ylabel('Recall', fontsize=16)
plt.axes().set_aspect('equal')
plt.show()

### ROC curves, Area-Under-Curve (AUC)

In [None]:
from sklearn.metrics import roc_curve, auc

y_scores_lr = lr.fit(X_bank_train, y_bank_train).decision_function(X_bank_test)
fpr_lr, tpr_lr, _ = roc_curve(y_bank_test, y_scores_lr)
roc_auc_lr = auc(fpr_lr, tpr_lr)

plt.figure()
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr_lr, tpr_lr, lw=3, label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc_lr))
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curve (1-of-10 digits classifier)', fontsize=16)
plt.legend(loc='lower right', fontsize=13)
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
plt.axes().set_aspect('equal')
plt.show()