# Logistic regression: The Zero detector

We will again solve the MNIST handwritten image recognition problem.

In [None]:
#Common imports
import numpy as np
from pprint import pprint 

#to make this notebook's output stable across runs
np.random.seed(42)

#Sklearn specific imports
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifier, LogisticRegression,LogisticRegressionCV
from sklearn.model_selection import cross_validate, RandomizedSearchCV, cross_val_predict, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, roc_auc_score

#scipy 
from scipy.stats import loguniform

#To plot pretty figures
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Global matplotlib defaults: label sizes for axes and ticks, plus the
# default figure size used when a cell does not pass its own `figsize`.
for group, options in (
    ('axes', {'labelsize': 14}),
    ('xtick', {'labelsize': 12}),
    ('ytick', {'labelsize': 12}),
    ('figure', {'figsize': (8, 6)}),
):
    mpl.rc(group, **options)


In [None]:
# Silence every warning issued through `warnings.warn` (scikit-learn's
# included) by replacing that function with a no-op.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; stand-in for `warnings.warn`."""
    return None


# NOTE: this is process-wide; warnings from every library are dropped too.
warnings.warn = warn

# Handwritten Digit Classification

In [None]:
# Download (or load from the local cache) the OpenML 'mnist_784' dataset,
# version 1, as a (features, labels) pair.
from sklearn.datasets import fetch_openml

X_pd, y_pd = fetch_openml(
    name='mnist_784',
    version=1,
    return_X_y=True,
)

In [None]:
# Convert the fetched features and labels to NumPy arrays via `to_numpy()`.
X, y = X_pd.to_numpy(), y_pd.to_numpy()

## Visualisation

In [None]:
# Show `num_images` consecutive samples in a square grid, each reshaped to
# 28x28 and titled with its label.
num_images = 9  # must be a perfect square
factor = int(np.sqrt(num_images))
idx_offset = 0  # index of the first sample to display
fig, ax = plt.subplots(nrows=factor, ncols=factor, figsize=(8, 6))
# np.ndindex visits (row, col) pairs in row-major order.
for i, j in np.ndindex(factor, factor):
    index = idx_offset + i * factor
    cell = ax[i, j]
    cell.imshow(X[index + j].reshape(28, 28), cmap='gray')
    cell.set_title('Label:{0}'.format(str(y[index + j])))
    cell.set_axis_off()

## Pre-Processing

* Unlike perceptron, where scaling is optional, sigmoid requires scaling between 0 and 1
* Do not apply mean centering as it removes zeros from the data. Zeros should be kept as zeros in the data.
* we are not using pipeline, since there is just the one preprocessing step.

In [None]:
# Min-max scale every feature of X (fit, then transform, on the same array).
# NOTE(review): the scaler is fitted on the full array, i.e. before the
# train/test split further down — confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

## Data Splitting

In [None]:
# Positional split: the first 60000 samples train, the remainder test.
n_train = 60000
x_train, y_train = X[:n_train], y[:n_train]
x_test, y_test = X[n_train:], y[n_train:]

Checking for imbalance

In [None]:
# Histogram of the training labels (cast to int8) to eyeball class balance.
digit_ticks = list(range(10))
plt.figure(figsize=(10, 4))
sns.histplot(data=np.int8(y_train), binwidth=0.45, bins=11)
plt.xticks(ticks=digit_ticks, labels=digit_ticks)
plt.xlabel('Class')
plt.title('Distribution of samples')
plt.show()

## Binary classification: 0-detector

In [None]:
# Binary targets for the 0-detector: 1.0 where the label is '0', 0.0 elsewhere.
y_train_0 = (y_train == '0').astype(np.float64)
y_test_0 = (y_test == '0').astype(np.float64)

# Positions of the digit-0 samples in the test split.
indx_0 = np.where(y_test == '0')

#### Sanity check

In [None]:

# Sanity check: redraw the first samples, this time titled with the binary
# 0-detector target instead of the digit label.
num_images = 9  # must be a perfect square
factor = int(np.sqrt(num_images))
idx_offset = 0  # index of the first sample to display
fig, ax = plt.subplots(nrows=factor, ncols=factor, figsize=(8, 6))
# np.ndindex visits (row, col) pairs in row-major order.
for i, j in np.ndindex(factor, factor):
    index = idx_offset + i * factor
    cell = ax[i, j]
    cell.imshow(X[index + j].reshape(28, 28), cmap='gray')
    cell.set_title('Label:{0}'.format(str(y_train_0[index + j])))
    cell.set_axis_off()

In [None]:
# Cross-check: the original '0' labels and the binary targets equal to 1
# should select the same training indices.
for zero_positions in (np.where(y_train == '0'), np.where(y_train_0 == 1)):
    print(zero_positions)

### Baseline Model


Let's quickly construct a baseline model with the following rule:

1. Count the number of samples per class
2. The model always outputs the class which has highest number of samples.
3. Then calculate the accuracy of the baseline model.

In [None]:
# Count positive (digit 0) and negative (any other digit) training samples.
num_pos = int(np.count_nonzero(y_train_0 == 1))
num_neg = int(np.count_nonzero(y_train_0 == 0))
print(num_pos, num_neg)

In [None]:
# Baseline: always predict the most frequent class of the training targets.
base_clf = DummyClassifier(strategy='most_frequent')
base_clf.fit(x_train, y_train_0)

# The training accuracy is printed twice: rounded, then at full precision.
base_train_acc = base_clf.score(x_train, y_train_0)
base_test_acc = base_clf.score(x_test, y_test_0)
print('Training accuracy:{0:0.2f}'.format(base_train_acc))
print('Testing accuracy:{0:0.2f}'.format(base_test_acc))
print('Score: ', base_train_acc)

### Logistic Regression model with `SGDClassifier`

#### Training without regularisation

In [None]:
# Logistic-regression SGD classifier with a constant learning rate.
# alpha=0 zeroes the penalty strength, so the 'l2' penalty has no effect.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
bin_sgd_clf = SGDClassifier(
    loss='log',
    penalty='l2',
    alpha=0,
    eta0=0.01,
    learning_rate='constant',
    max_iter=1,
    warm_start=True,
    random_state=1729,
)

# max_iter=1 with warm_start=True: every fit() call continues from the
# previous coefficients, so the training loss can be recorded per pass.
iterations = 100
Loss = []
for i in range(iterations):
    y_pred = bin_sgd_clf.fit(x_train, y_train_0).predict_proba(x_train)
    Loss.append(log_loss(y_train_0, y_pred))

In [None]:
# Learning curve: training log-loss recorded after each fit() call.
plt.figure()
loss_axes = plt.gca()
loss_axes.plot(np.arange(iterations), Loss)
loss_axes.grid(True)
loss_axes.set_xlabel('Iteration')
loss_axes.set_ylabel('Loss')
plt.show()

In [None]:
# Accuracy of the unregularised SGD classifier on both splits.
sgd_train_acc = bin_sgd_clf.score(x_train, y_train_0)
sgd_test_acc = bin_sgd_clf.score(x_test, y_test_0)
print('Training accuracy:{0:0.2f}'.format(sgd_train_acc))
print('Testing accuracy:{0:0.2f}'.format(sgd_test_acc))

In [None]:
# Confusion matrix of the unregularised classifier on the training set.
y_hat_train_0 = bin_sgd_clf.predict(x_train)
cm_display = ConfusionMatrixDisplay.from_predictions(
    y_true=y_train_0,
    y_pred=y_hat_train_0,
    values_format='.5g',
)
plt.show()

In [None]:
# Per-class precision / recall / F1 for the training-set predictions.
print(classification_report(y_train_0, y_hat_train_0))

#### Training with cross_validation

In [None]:
# Fresh logistic-regression SGD estimator for cross-validation: up to 100
# epochs per fit and no warm start, so each fold trains from scratch.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
estimator = SGDClassifier(
    loss='log',
    penalty='l2',
    alpha=0,
    eta0=0.01,
    learning_rate='constant',
    max_iter=100,
    warm_start=False,
    random_state=1729,
)

In [None]:
# 5-fold cross-validation reporting precision, recall and F1 on both the
# train and validation folds; the fitted per-fold estimators are kept too.
cv_metrics = ['precision', 'recall', 'f1']
cv_bin_clf = cross_validate(
    estimator,
    x_train,
    y_train_0,
    cv=5,
    scoring=cv_metrics,
    return_train_score=True,
    return_estimator=True,
)

pprint(cv_bin_clf)

In [None]:
# Learned parameters of the unregularised classifier (`bin_sgd_clf`).
weights, bias = bin_sgd_clf.coef_, bin_sgd_clf.intercept_
print('Dimension of weights w: {0}'.format(weights.shape))
print('Bias: {0}'.format(bias))

In [None]:
# Plot the first row of the weight matrix against the feature index.
plt.figure()
# The x-axis length follows the weight matrix instead of a hard-coded 784.
plt.plot(np.arange(weights.shape[1]), weights[0, :])
plt.xlabel('Feature index')
plt.ylabel('Weight value')
# Pad the y-range by 5 on both sides of the extreme weights.
plt.ylim((np.min(weights) - 5, np.max(weights) + 5))
# The grid must be *called*; a bare `plt.grid` only names the function.
plt.grid(True)
plt.show()

* A lot of the weights seem to have zero values. Let's find out how many.

In [None]:
# Number of weights that are exactly zero. `weights.size` counts every entry,
# so the result stays correct even if the matrix has more than one row.
num_zero_w = weights.size - np.count_nonzero(weights)
# The count is an integer, so print it with %d rather than %f.
print('Number of weights with value zero: %d' % num_zero_w)

* Regularisation is therefore not strictly required, since none of the weights blow up. But we will go ahead and apply it for the purpose of demonstration.

#### Training with regularisation

In [None]:
# L2-regularised (alpha=0.001) logistic-regression SGD classifier with a
# constant learning rate.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
bin_clf_sgd_l2 = SGDClassifier(
    loss='log',
    penalty='l2',
    alpha=0.001,
    eta0=0.01,
    learning_rate='constant',
    max_iter=1,
    warm_start=True,
    random_state=1729,
)

# max_iter=1 with warm_start=True: every fit() call continues from the
# previous coefficients, so the training loss can be recorded per pass.
iterations = 100
Loss = []
for i in range(iterations):
    y_pred = bin_clf_sgd_l2.fit(x_train, y_train_0).predict_proba(x_train)
    Loss.append(log_loss(y_train_0, y_pred))

In [None]:
# Learning curve: training log-loss recorded after each fit() call.
plt.figure()
loss_axes = plt.gca()
loss_axes.plot(np.arange(iterations), Loss)
loss_axes.grid(True)
loss_axes.set_xlabel('Iteration')
loss_axes.set_ylabel('Loss')
plt.show()

In [None]:
# Learned parameters of the L2-regularised classifier.
weights, bias = bin_clf_sgd_l2.coef_, bin_clf_sgd_l2.intercept_
print('Dimension of weights w: {0}'.format(weights.shape))
print('Bias: {0}'.format(bias))

In [None]:
# Plot the first row of the weight matrix against the feature index.
plt.figure()
# The x-axis length follows the weight matrix instead of a hard-coded 784.
plt.plot(np.arange(weights.shape[1]), weights[0, :])
plt.xlabel('Feature index')
plt.ylabel('Weight value')
# Pad the y-range by 5 on both sides of the extreme weights.
plt.ylim((np.min(weights) - 5, np.max(weights) + 5))
# The grid must be *called*; a bare `plt.grid` only names the function.
plt.grid(True)
plt.show()

In [None]:
# Number of weights that are exactly zero. `weights.size` counts every entry,
# so the result stays correct even if the matrix has more than one row.
num_zero_w = weights.size - np.count_nonzero(weights)
# The count is an integer, so print it with %d rather than %f.
print('Number of weights with value zero: %d' % num_zero_w)

In [None]:
# Accuracy of the L2-regularised classifier on both splits.
l2_train_acc = bin_clf_sgd_l2.score(x_train, y_train_0)
l2_test_acc = bin_clf_sgd_l2.score(x_test, y_test_0)
print('Training accuracy %.2f' % l2_train_acc)
print('Testing accuracy %.2f' % l2_test_acc)

In [None]:
# Confusion matrix of the L2-regularised classifier on the training set.
y_hat_train_0 = bin_clf_sgd_l2.predict(x_train)
cm_display = ConfusionMatrixDisplay.from_predictions(
    y_true=y_train_0,
    y_pred=y_hat_train_0,
    values_format='.5g',
)
plt.show()

In [None]:
# Per-class precision / recall / F1 for the training-set predictions.
print(classification_report(y_train_0, y_hat_train_0))

Let's display a few images and their prediction

In [None]:
# Predict on the test split and show the first test images in a square
# grid, each titled with its predicted binary label.
y_hat_test_0 = bin_clf_sgd_l2.predict(x_test)
num_images = 9  # must be a perfect square
factor = int(np.sqrt(num_images))
idx_offset = 0  # index of the first sample to display
fig, ax = plt.subplots(nrows=factor, ncols=factor, figsize=(8, 6))
# np.ndindex visits (row, col) pairs in row-major order.
for i, j in np.ndindex(factor, factor):
    index = idx_offset + i * factor
    cell = ax[i, j]
    cell.imshow(x_test[index + j].reshape(28, 28), cmap='gray')
    cell.set_title('Prediction:{0}'.format(str(y_hat_test_0[index + j])))
    cell.set_axis_off()

In [None]:
# Restrict the test split to the true digit-0 samples and show what the
# classifier predicted for the first few of them.
indx_0 = np.where(y_test_0 == 1)
zero_rows = indx_0[0]

zeroImgs, zeroLabls = x_test[zero_rows], y_hat_test_0[zero_rows]

num_images = 9  # must be a perfect square
factor = int(np.sqrt(num_images))
idx_offset = 0  # index of the first sample to display
fig, ax = plt.subplots(nrows=factor, ncols=factor, figsize=(8, 6))
# np.ndindex visits (row, col) pairs in row-major order.
for i, j in np.ndindex(factor, factor):
    index = idx_offset + i * factor
    cell = ax[i, j]
    cell.imshow(zeroImgs[index + j].reshape(28, 28), cmap='gray')
    cell.set_title('Prediction:{0}'.format(str(zeroLabls[index + j])))
    cell.set_axis_off()

#### Hyper parameter tuning

In [None]:
# Distribution the learning rate is sampled from: log-uniform on [1e-2, 1e-1].
lr_grid = loguniform(1e-2, 1e-1)

# Base estimator for the search; `eta0` here is only the starting value that
# the search overrides.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
estimator = SGDClassifier(
    loss='log',
    penalty='l2',
    alpha=0,
    eta0=0.01,
    learning_rate='constant',
    max_iter=1,
    warm_start=True,
    random_state=1729,
)

In [None]:
# Randomised search over the learning rate: 5 sampled `eta0` values, each
# evaluated with 5-fold cross-validation.
# `scoring='f1'` is what makes F1 the ranking metric. With no `scoring`, the
# search ranks candidates by the estimator's default score (accuracy) and a
# string passed as `refit` merely acts as a truthy "do refit" flag.
scores = RandomizedSearchCV(estimator,
                            param_distributions={'eta0': lr_grid},
                            cv=5,
                            n_iter=5,
                            scoring='f1',
                            refit=True)

In [None]:
# Run the randomised search on the binary (0-detector) training data.
scores.fit(x_train, y_train_0)

In [None]:
# Dump the search's `cv_results_` for inspection.
pprint(scores.cv_results_)

In [None]:
# Keep the estimator the search refitted with its best-ranked `eta0`.
best_bin_clf = scores.best_estimator_

In [None]:
# Training-set predictions of the best estimator found by the search.
y_hat_train_best_0 = best_bin_clf.predict(x_train)

In [None]:
# Per-class precision / recall / F1 for the best estimator on the training set.
print(classification_report(y_train_0, y_hat_train_best_0))

In [None]:
# Precision and recall as functions of the decision threshold, computed from
# the best estimator's decision scores on the training set.
y_scores = best_bin_clf.decision_function(x_train)
precisions, recalls, thresholds = precision_recall_curve(y_train_0, y_scores)

plt.figure(figsize=(10, 4))
# The last precision/recall entry is dropped so both curves line up with
# `thresholds`.
for curve, line_style, curve_name in (
    (precisions, "b--", "Precision"),
    (recalls, "g-", "Recall"),
):
    plt.plot(thresholds, curve[:-1], line_style, label=curve_name)
plt.xlabel('Threshold')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Precision (x-axis) against recall (y-axis) over the same thresholds.
plt.figure(figsize=(10, 4))
pr_axes = plt.gca()
pr_axes.plot(precisions[:-1], recalls[:-1], "b--")
pr_axes.set_xlabel('Precision')
pr_axes.set_ylabel('Recall')
pr_axes.grid(True)
plt.show()

In [None]:
# ROC curve of the tuned classifier, computed from its training-set
# decision scores.
fpr, tpr, thresholds = roc_curve(y_train_0, y_scores)
plt.figure(figsize=(10, 4))
# The model is an SGD-trained logistic regression, so label it as such.
plt.plot(fpr, tpr, linewidth=2, label='SGD logistic regression')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--', label='baseEstimator')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Area under the ROC curve, from the same training-set decision scores.
auc = roc_auc_score(y_train_0, y_scores)
print('AUC: %.3f' % auc)

## Same model with `LogisticRegression`

###  Training without regularisation

* Set $C = \infty$

In [None]:
# Scaling + logistic regression with the lbfgs solver. C is the inverse
# regularisation strength, so C = inf leaves the model unregularised.
# `np.inf` is used because the `np.infty` alias no longer exists in NumPy 2.
pipe_logit = make_pipeline(
    MinMaxScaler(),
    LogisticRegression(random_state=1729, solver='lbfgs', C=np.inf),
)

pipe_logit.fit(x_train, y_train_0)

#### Hyper parameter search

In [None]:
from sklearn.pipeline import Pipeline

# Candidate inverse-regularisation strengths. C must be strictly positive,
# so 0 is not a valid candidate: every fit with it fails and wastes a grid slot.
grid_Cs = [1e-4, 1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0]

scaler = MinMaxScaler()
logreg = LogisticRegression(C=1.0, random_state=1729)

# Named steps so the grid can address the classifier's C as "logistic__C".
pipe = Pipeline(steps=[("scaler", scaler),
                       ("logistic", logreg)])

# Grid search over C, ranking candidates by F1.
pipe_logit_cv = GridSearchCV(pipe,
                             param_grid={"logistic__C": grid_Cs},
                             scoring='f1')

pipe_logit_cv.fit(x_train, y_train_0)


In [None]:
# Parameter setting (value of "logistic__C") selected by the grid search.
pipe_logit_cv.best_params_

In [None]:
# Cross-validated F1 score of the selected parameter setting.
pipe_logit_cv.best_score_

With `LogisticRegressionCV`

In [None]:
# Logistic regression with built-in cross-validated selection of C
# (5 folds, scored by F1), placed behind a MinMaxScaler.
estimator = LogisticRegressionCV(random_state=1729, scoring='f1', cv=5)
pipeline_steps = (MinMaxScaler(), estimator)
logit_cv = make_pipeline(*pipeline_steps)
logit_cv.fit(x_train, y_train_0)

###  Performance Evaluation

For 
* Logistic regression without regularisation
* Best logistic regression classifier found through `GridSearchCV`
* Best classifier found through `LogisticRegressionCV`

In [None]:
# Test-set predictions of the three logistic-regression variants, in order:
# unregularised, grid-search best, LogisticRegressionCV.
lr_y_hat_0, lr_gs_y_hat_0, lr_cv_y_hat_0 = (
    model.predict(x_test)
    for model in (pipe_logit, pipe_logit_cv.best_estimator_, logit_cv)
)

In [None]:
# Test-set precision and recall for each of the three models.
precision_lr, recall_lr = (
    precision_score(y_test_0, lr_y_hat_0),
    recall_score(y_test_0, lr_y_hat_0),
)
precision_lr_gs, recall_lr_gs = (
    precision_score(y_test_0, lr_gs_y_hat_0),
    recall_score(y_test_0, lr_gs_y_hat_0),
)
precision_lr_cv, recall_lr_cv = (
    precision_score(y_test_0, lr_cv_y_hat_0),
    recall_score(y_test_0, lr_cv_y_hat_0),
)

In [None]:
# Side-by-side test-set precision and recall of the three models.
print(f"LogReg: Precision = {precision_lr}, recall = {recall_lr}")
print(f"GridSearch: Precision = {precision_lr_gs}, recall = {recall_lr_gs}")
print(f"LogRegCV: Precision = {precision_lr_cv}, recall = {recall_lr_cv}")

## Multiclass Logistic regression with SGD (OneVsAll)

In [None]:
# Logistic-regression SGD estimator for the 10-class problem; alpha=0 zeroes
# the penalty strength.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
estimator = SGDClassifier(
    loss='log',
    penalty='l2',
    alpha=0,
    eta0=0.01,
    learning_rate='constant',
    max_iter=1,
    warm_start=True,
    random_state=1729,
)

# Scaling followed by the classifier.
pipe_sgd_ovr = make_pipeline(MinMaxScaler(), estimator)

In [None]:
# Fit the pipeline 100 times on the full multiclass labels, recording the
# training log-loss after every fit.
iterations = 100
Loss = []
for i in range(iterations):
    y_pred = pipe_sgd_ovr.fit(x_train, y_train).predict_proba(x_train)
    Loss.append(log_loss(y_train, y_pred))

In [None]:
# Learning curve: training log-loss recorded after each fit() call.
plt.figure()
loss_axes = plt.gca()
loss_axes.plot(np.arange(iterations), Loss)
loss_axes.grid(True)
loss_axes.set_xlabel('Iteration')
loss_axes.set_ylabel('Loss')
plt.show()

In [None]:
# Show the pipeline's second step (the SGD classifier).
pipe_sgd_ovr[1]

In [None]:
# Multiclass predictions on the test split.
y_hat = pipe_sgd_ovr.predict(x_test)

In [None]:
# Confusion matrix of the multiclass predictions on the test split.
cm_display = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_hat,
    values_format='.5g',
)
plt.show()

In [None]:
# Per-digit precision / recall / F1 on the test split.
print(classification_report(y_test, y_hat))

## Using Solvers

In [None]:
# Scaling + lbfgs logistic regression on all ten digit labels. C is the
# inverse regularisation strength, so C = inf leaves the model unregularised.
# `np.inf` is used because the `np.infty` alias no longer exists in NumPy 2.
# NOTE(review): despite the `_ovr` suffix, no multi-class option is passed
# here — confirm whether one-vs-rest is actually intended.
pipe_logit_ovr = make_pipeline(
    MinMaxScaler(),
    LogisticRegression(random_state=1729, solver='lbfgs', C=np.inf),
)

pipe_logit_ovr.fit(x_train, y_train)

In [None]:
# Test-set predictions of the solver-based model and their confusion matrix.
y_hat = pipe_logit_ovr.predict(x_test)
cm_display = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_hat,
    values_format='.5g',
)
plt.show()

In [None]:
# Per-digit precision / recall / F1 on the test split.
print(classification_report(y_test, y_hat))