In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

## Evaluation Metrics and scoring

### Metrics for binary classification

In [None]:
from sklearn.model_selection import train_test_split

# Read the bank-campaign table: the "target" column holds the labels and
# every remaining column is used as a feature.
data = pd.read_csv("data/bank-campaign.csv")
y = data["target"].values
X = data.drop(columns="target").values

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
from sklearn.dummy import DummyClassifier
# Baseline that always predicts the most frequent training label.
dummy_majority = DummyClassifier(strategy='most_frequent')
dummy_majority.fit(X_train, y_train)
pred_most_frequent = dummy_majority.predict(X_test)

# Show which labels the baseline actually emits, then its test-set score.
print("predicted labels: %s" % np.unique(pred_most_frequent))
print("score: %f" % dummy_majority.score(X_test, y_test))

In [None]:
from sklearn.tree import DecisionTreeClassifier
# A deliberately shallow (depth-2) decision tree.
tree = DecisionTreeClassifier(max_depth=2)
tree.fit(X_train, y_train)
pred_tree = tree.predict(X_test)
# Last expression of the cell: the test-set score is shown as the cell output.
tree.score(X_test, y_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Random baseline: predictions are drawn according to the training class
# distribution. The strategy is spelled out because the DummyClassifier default
# changed from "stratified" to "prior" in scikit-learn 0.24; with "prior" this
# model would predict exactly like `dummy_majority` and the comparison below
# would be meaningless. The seed makes the random predictions repeatable.
dummy = DummyClassifier(strategy="stratified", random_state=0).fit(X_train, y_train)
pred_dummy = dummy.predict(X_test)
print("dummy score: %f" % dummy.score(X_test, y_test))

# Regularised logistic regression as the "real" model to compare against.
logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
pred_logreg = logreg.predict(X_test)
print("logreg score: %f" % logreg.score(X_test, y_test))

# Confusion matrices

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the logistic regression predictions on the test set.
confusion = confusion_matrix(y_true=y_test, y_pred=pred_logreg)
print(confusion)

In [None]:
# Print the confusion matrix of every model fitted so far, one after another.
labelled_predictions = [
    ("Most frequent class:", pred_most_frequent),
    ("\nDummy model:", pred_dummy),
    ("\nDecision tree:", pred_tree),
    ("\nLogistic Regression", pred_logreg),
]
for heading, predictions in labelled_predictions:
    print(heading)
    print(confusion_matrix(y_test, predictions))

In [None]:
from sklearn.metrics import f1_score
# F1 score with "yes" as the positive label, reported once per model.
f1_reports = [
    ("f1 score most frequent: %.2f", pred_most_frequent),
    ("f1 score dummy: %.2f", pred_dummy),
    ("f1 score tree: %.2f", pred_tree),
    ("f1 score logreg: %.2f", pred_logreg),
]
for template, predictions in f1_reports:
    print(template % f1_score(y_test, predictions, pos_label="yes"))

In [None]:
from sklearn.metrics import classification_report
# Per-class precision/recall/f1 report for the most-frequent baseline.
report_most_frequent = classification_report(
    y_true=y_test, y_pred=pred_most_frequent, target_names=["no", "yes"])
print(report_most_frequent)

In [None]:
# Per-class precision/recall/f1 report for the decision tree.
report_tree = classification_report(
    y_true=y_test, y_pred=pred_tree, target_names=["no", "yes"])
print(report_tree)

In [None]:
# Per-class precision/recall/f1 report for the logistic regression.
report_logreg = classification_report(
    y_true=y_test, y_pred=pred_logreg, target_names=["no", "yes"])
print(report_logreg)

# Taking uncertainty into account

## Precision-Recall curves and ROC curves

In [None]:
from sklearn.datasets import make_blobs
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_curve


# Synthetic two-blob dataset with many samples so the curve comes out smooth.
X, y = make_blobs(n_samples=8000, centers=2, cluster_std=[7.0, 2],
                  random_state=22, shuffle=False)
# The blobs are not shuffled, so keeping only the first 4500 rows drops most
# of the second blob: this truncation is what makes the dataset imbalanced.
X = X[:4500]
y = y[:4500]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

svc = SVC(gamma=.05)
svc.fit(X_train, y_train)

# Precision/recall for every threshold on the SVC decision function.
precision, recall, thresholds = precision_recall_curve(
    y_test, svc.decision_function(X_test))
# Index of the threshold closest to zero.
close_zero = np.abs(thresholds).argmin()

fig, ax = plt.subplots()
# Mark the operating point at (approximately) threshold zero ...
ax.plot(precision[close_zero], recall[close_zero], 'o', markersize=10,
        label="threshold zero", fillstyle="none", c='k', mew=2)
# ... then draw the full curve (precision on x, recall on y).
ax.plot(precision, recall, label="precision recall curve")
ax.set_xlabel("precision")
ax.set_ylabel("recall")
ax.set_title("precision_recall_curve")
ax.legend(loc="best")

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100, random_state=0, max_features=2).fit(X_train, y_train)

# RandomForestClassifier has predict_proba, but not decision_function, so the
# probability of the second class (column 1) is used as the score.
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(
    y_test, rf.predict_proba(X_test)[:, 1])
# Index of the random-forest threshold closest to 0.5.
close_default_rf = np.abs(thresholds_rf - 0.5).argmin()

fig, ax = plt.subplots()

# SVC curve and its threshold-zero point; `precision`, `recall` and
# `close_zero` come from the precision-recall cell above.
ax.plot(precision, recall, label="svc")
ax.plot(precision[close_zero], recall[close_zero], 'o', markersize=10,
        label="threshold zero svc", fillstyle="none", c='k', mew=2)

# Random-forest curve and its threshold-0.5 point.
ax.plot(precision_rf, recall_rf, label="rf")
ax.plot(precision_rf[close_default_rf], recall_rf[close_default_rf], '^',
        markersize=10, label="threshold 0.5 rf", fillstyle="none", c='k', mew=2)

ax.set_xlabel("precision")
ax.set_ylabel("recall")
ax.legend(loc="best")
ax.set_title("precision_recall_comparison");

In [None]:
# F1 score of each model's predictions on the test set.
for template, model in (("f1_score of random forest: %f", rf),
                        ("f1_score of svc: %f", svc)):
    print(template % f1_score(y_test, model.predict(X_test)))

In [None]:
from sklearn.metrics import average_precision_score
# Average precision condenses each precision-recall curve into one number.
# The forest is scored by its probability for the second class (column 1),
# the SVC by its decision function.
ap_rf = average_precision_score(
    y_true=y_test, y_score=rf.predict_proba(X_test)[:, 1])
ap_svc = average_precision_score(
    y_true=y_test, y_score=svc.decision_function(X_test))

print("average precision of random forest: %f" % ap_rf)
print("average precision of svc: %f" % ap_svc)

# Receiver Operating Characteristics (ROC) and AUC
\begin{equation}
\text{FPR} = \frac{\text{FP}}{\text{FP} + \text{TN}}
\end{equation}

In [None]:
from sklearn.metrics import roc_curve
# ROC curve of the SVC, computed from its decision-function scores.
fpr, tpr, thresholds = roc_curve(
    y_true=y_test, y_score=svc.decision_function(X_test))
# Index of the threshold closest to zero.
close_zero = np.abs(thresholds).argmin()

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="ROC Curve")
ax.set_xlabel("FPR")
ax.set_ylabel("TPR (recall)")
ax.set_title("roc_curve")
# Mark the operating point at (approximately) threshold zero.
ax.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
        label="threshold zero", fillstyle="none", c='k', mew=2)
ax.legend(loc=4)

In [None]:
from sklearn.metrics import roc_curve
# ROC curve of the random forest, scored by its probability for the second
# class (column 1 of predict_proba).
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf.predict_proba(X_test)[:, 1])

plt.figure()
# `fpr`, `tpr` and `close_zero` are the SVC values computed in the ROC cell above.
plt.plot(fpr, tpr, label="ROC Curve SVC")
plt.plot(fpr_rf, tpr_rf, label="ROC Curve RF")

plt.xlabel("FPR")
plt.ylabel("TPR (recall)")
plt.title("roc_curve_comparison");
# SVC operating point at (approximately) threshold zero.
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
         label="threshold zero SVC", fillstyle="none", c='k', mew=2)
# Random-forest operating point at the threshold closest to 0.5.
close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))
# Both coordinates must come from the random-forest arrays: indexing the SVC's
# `tpr` with a random-forest threshold index would put the marker off the RF
# curve (or raise IndexError when the two curves have different lengths).
plt.plot(fpr_rf[close_default_rf], tpr_rf[close_default_rf], '^', markersize=10,
         label="threshold 0.5 RF", fillstyle="none", c='k', mew=2)

plt.legend(loc=4)

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for both models, computed from the same scores
# that were used to draw the curves.
rf_auc = roc_auc_score(y_true=y_test, y_score=rf.predict_proba(X_test)[:, 1])
svc_auc = roc_auc_score(y_true=y_test, y_score=svc.decision_function(X_test))

print("AUC for Random Forest: %f" % rf_auc)
print("AUC for SVC: %f" % svc_auc)

## Using evaluation metrics in model selection

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

# With no `scoring` argument, classification is scored by accuracy.
default_scores = cross_val_score(
    LogisticRegression(), cancer.data, cancer.target)
print("default scoring ", default_scores)

# Passing scoring="accuracy" explicitly doesn't change the results.
explicit_accuracy = cross_val_score(
    LogisticRegression(), cancer.data, cancer.target, scoring="accuracy")
print("explicit accuracy scoring ", explicit_accuracy)

# The same cross-validation, scored by average precision instead.
ap = cross_val_score(
    LogisticRegression(), cancer.data, cancer.target,
    scoring="average_precision")
print("average precision", ap)

In [None]:
from sklearn.model_selection import cross_validate
# cross_validate can evaluate several metrics in one pass; train scores are
# requested as well so they can be compared with the test scores.
scoring_metrics = ["accuracy", "average_precision", "recall_macro"]
res = cross_validate(SVC(), cancer.data, cancer.target, cv=5,
                     scoring=scoring_metrics, return_train_score=True)
# Show the results as a rich table.
display(pd.DataFrame(res))

In [None]:
# List every string accepted by the `scoring=` parameter.
# `sklearn.metrics.scorer` was a private module path that was removed in
# scikit-learn 0.24, so it must not be imported directly. Prefer the public
# `get_scorer_names()` helper and fall back to the `SCORERS` dict exported by
# `sklearn.metrics` on releases that predate the helper.
try:
    from sklearn.metrics import get_scorer_names
except ImportError:
    from sklearn.metrics import SCORERS
    scorer_names = sorted(SCORERS.keys())
else:
    scorer_names = sorted(get_scorer_names())
print(scorer_names)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
# back to the bank campaign
data = pd.read_csv("data/bank-campaign.csv")
X = data.drop(columns="target").values
# Boolean labels: True where the recorded target is "no".
y = data["target"].values == "no"

# Train on 10% of the rows and test on another 10%
# (presumably to keep the grid search quick).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.1, test_size=.1, random_state=0)

param_grid = {'C': [0.0001, 0.01, 0.1, 1, 10]}

# Evaluate three metrics for every C, but select and refit the final model
# by AUC only.
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring=["roc_auc", 'average_precision', 'accuracy'],
    refit='roc_auc',
)
grid.fit(X_train, y_train)

print("\nGrid-Search with AUC")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (AUC):", grid.best_score_)
print("Test set AUC: %.3f" % grid.score(X_test, y_test))

In [None]:
# Tabulate the grid-search results and plot the mean cross-validated score of
# each metric, one line per metric across the parameter candidates.
res = pd.DataFrame(grid.cv_results_)
mean_score_columns = ['mean_test_roc_auc', 'mean_test_accuracy',
                      'mean_test_average_precision']
res[mean_score_columns].plot()

# Exercise
Load the adult dataset from ``data/adult.csv`` (or pick another dataset), and split it into training and test sets.
Apply grid-search to the training set, searching for the best C for Logistic Regression using AUC.
Plot the ROC curve and precision-recall curve of the best model on the test set.