In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# ___Model Evaluation & Selection___
---------------

In [1]:
# Typical workflow diagram of ML models,

# 1) Representation -> extract, identify and select features
# 2) Training phase -> fitting the estimator to the dataset
# 3) Evaluation 
# 4) Feature & model refinement

In [2]:
# Refinement -> provides insights into the trained model's performance characteristics.
# This might allow us to switch to better features/ different kernel functions or other parameter tuning.

# Evaluation methods also help to derive surrogate benefits from trained models.
# e.g. a model trained to provide the most relevant results for a user query can also be used to determine the browsing time, frequency of visits and
# pages requested subsequently from a given host.

In [3]:
# This evaluation measures are critical in selecting the best model among a set of trained models.
# Evaluation metric is variable and depends on the use case of the model.
# model accuracy is the most common metric, however other metrics like user satisfaction (in web searches), amount of revenue (business sites)
# and increased patient survival rates (medical applications) can also be used depending on the domain which the models are used.

In [5]:
# If the ML model we are designing is to predict patients' benign tumors developing into cancerous tumors,
# we need the model to be as rigorous as possible. We need it to classify the tumors with meager chances of becoming cancerous as potentially
# dangerous. Even if this will likely increase the fraction of false positives in the predictions.
# Because, given the domain, we cannot afford to falsely classify a cancerous tumor as healthy tissue.
# So, the risking higher number of false positives becomes acceptable.

## ___Accuracy with Imbalanced Classes___
----------

In [6]:
# Suppose we have two classes in the labels,
# 1) relevant (R)
# 2) irrelevant (I)

# In a test set with 1,000 data points,
# one gets classified as relevant (R)
# remaining 999 get classified as irrelevant (I)

# accuracy = N(correct predictions) / N(total predictions)

In [7]:
# Here the accuracy score alone cannot be a sufficient measure for the classifier's performance.
# When the class frequencies distribution is highly skewed, it is called an imbalanced class situation.

# e.g. credit card transactions -> where most of them are legit, with very few illegal transactions.
# reccomendations on an e-commerce site, where only one/two may be relevant to the user while all the remainning suggestions won't even make sense.

In [8]:
# In addition to binary classes, this siuation migh tpresent itself in multiclass prediction problems as well.

In [9]:
# Let's say that we have a binary classifier that predicts with 99.0 % accuracy
# Let's test a dummy classifier that returns the class with highest frequency for all inputs.

# In a highly imbalanced dataset, like (1 relevant and 999 irrelevant),
# The dummy classifer will also show a 999/1000 accuracy!

In [13]:
# So,

print(f"Accuracy -> Dummy classifier = {999 / 1000 * 100}%")
print(f"Accuracy -> Trained classifier = {99.0}%")

Accuracy -> Dummy classifier = 99.9%
Accuracy -> Trained classifier = 99.0%


In [3]:
# A real word example

digits = load_digits()

In [4]:
# This dataset has images of handwritten digits labelled with 10 classes (0 to 9)
# Let's see the distribution opf classes,

np.unique(digits.target, return_counts = True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([178, 182, 177, 183, 181, 182, 181, 179, 174, 180], dtype=int64))

In [5]:
# Labels seem more/less balancedly distributed.

In [6]:
# let's skew this dataset by convetring all non-zero lables as positive, i.e. 1 (Positive class)
# and keep the 0 labels as the negative class.

skewed_targets = digits.target.copy()
skewed_targets[np.where(skewed_targets != 0)] = 1
skewed_targets

array([0, 1, 1, ..., 1, 1, 1])

In [7]:
np.unique(skewed_targets, return_counts = True)

(array([0, 1]), array([ 178, 1619], dtype=int64))

In [8]:
# Now we have a very skewed target

In [9]:
train_x, test_x, train_y, test_y = train_test_split(digits.data, skewed_targets, train_size = 0.8)

In [10]:
svclassifier = SVC(kernel = "rbf", C = 1).fit(train_x, train_y)

In [11]:
svclassifier.score(train_x, train_y)

1.0

In [12]:
svclassifier.score(test_x, test_y)

1.0

In [13]:
# Now that's a 100% accuracy
# Let's do a control experiment and make a dummy classifier.

In [14]:
dmmyClassifier = DummyClassifier(strategy = "most_frequent").fit(train_x, train_y)

In [15]:
dmmyClassifier.score(train_x, train_y)

0.8970076548364648

In [16]:
dmmyClassifier.score(test_x, test_y)

0.9166666666666666

In [17]:
# Dummy classifiers completely ignore train_x but examine the train_y to determine the frequency of the labels, 
# to figure out the label with the highest frequency.
# Even a classifier that did not truly assimilate the data gives a 87% accuracy score.

In [18]:
# See the homogenous predictions

dmmyClassifier.predict(test_x)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [19]:
# Dummy classifiers provide a NULL accuracy baseline.
# Albeit peing useless for real problems, it provides a way to do sanity checks for comparison.
# There are different strategies available to determine how the dummy classifier is to choose the label.

# 1) most frequent -> returns the label with the highest frequency
# 2) stratified -> random predictions based on the class distribution in the train_y
# 3) uniform -> predictions made randomly and uniformly
# 4) constant -> returns a constant user provided label

## ___What if a Model's Accuracy is Close to the Null Baseline?___
------------------

In [20]:
# This might indicate

# 1) Ineffective, erroneous or missing features
# 2) Poorly chosen kernel or parameters
# 3) Huge disparity in classes (highly imbalanced)

In [21]:
# In general, for datasets with imbalanced classes we should opt for perofrmance metrics other than the accuracy score (R square)
# e.g. AUC score -> Area Under the Curve

## ___Dummy Regressors___
------------

In [22]:
# Dummy regressors are the regression counterparts for dummy classifiers.
# Dummy regressors also have several strategies to determine the output.
# mean -> mean of all values in train_y
# median -> median of all values in train_y
# quantile -> returns a user provided nth quantile of train_y
# constant -> returns a user provided constant.

# Dummy regressors give a null basedline for sanity checks on regression models.

## ___Binary Prediction Outcomes___
--------------

In [23]:
# True positives -> when the real output and the predictions are "true"
# False positives -> when the real output should be "false", but the prediction is "true"

# True negative -> when the real output and the prediction are "false"
# False negative -> when the real output is "true", bu the prediction is "false"

## ___Confusion Matrix___
----------

In [24]:
# A matrix of all combinations of predicted labels and true labels is called a confusion matrix.

| State | Positives | Negatives |
| :---- | :--------- | :-------- |
| True |5          | 356 |
| False| 51 | 38|

### __$N = 450$__

In [25]:
# We can take any prediction of the classifier and categorize it into one of the cells of this matrix.
# Confusion matrices are aslo possible for multiclass classifications, instead of a 2x2 matrix we'd have a kxk matrix where k indicates the 
# number of classes.

In [39]:
target = digits.target.copy()
target[np.where(target != 0)] = 1
train_x, test_x, train_y, test_y = train_test_split(digits.data, target, train_size = 0.8)

In [40]:
dmyClassifier = DummyClassifier(strategy = "most_frequent").fit(train_x, train_y)
preds = dmyClassifier.predict(test_x)

In [41]:
confusion_matrix(y_true = test_y, y_pred = preds)

array([[  0,  35],
       [  0, 325]], dtype=int64)

In [38]:
# Successfull predictions are along the diagonal axis of the matrix, where the true class matches the predicted class.

# Right column is where the classifier predicted the 1 label (majority)
# Left column is where the prediction was label 0 -> none made since it is not the most frequent label.

# So, in the confusion matrix we only have true positives and false positives .
# True positives -> 332
# False positives -> 28

# No negatives since our classifier did not predict any negative (0) labels.

In [42]:
# Let's see are there any changes when we change the strategy,

dmyClassifier = DummyClassifier(strategy = "stratified").fit(train_x, train_y)
preds = dmyClassifier.predict(test_x)
confusion_matrix(y_true = test_y, y_pred = preds)

array([[  5,  30],
       [ 32, 293]], dtype=int64)

In [43]:
# Now the left column is no longer all zeroes, since this classifier did predict some negative (0) labels.

In [45]:
svclassifier = SVC(kernel = "linear", C = 1).fit(train_x, train_y)
preds = svclassifier.predict(test_x)
confusion_matrix(y_true = test_y, y_pred = preds)

array([[ 35,   0],
       [  0, 325]], dtype=int64)

In [46]:
# No incorrect predictions.

In [52]:
logReg = LogisticRegression(max_iter = 10000).fit(train_x, train_y)
preds = logReg.predict(test_x)
confusion_matrix(y_true = test_y, y_pred = preds)

array([[ 35,   0],
       [  0, 325]], dtype=int64)

In [53]:
# Again, no incorrect predictions.

In [66]:
dtClassifier = DecisionTreeClassifier(min_samples_leaf = 10, max_depth = 5).fit(train_x, train_y)
preds = dtClassifier.predict(test_x)
confusion_matrix(y_true = test_y, y_pred = preds)

array([[ 33,   2],
       [  0, 325]], dtype=int64)

In [None]:
# 2 False positives!