# Ridge Classifier

* Ridge classifier cast the problem as least-square classification and finds the optimal
weight using some matrix decomposition technique such as SVD.
* To train the ridge classifier, the labels should be {+1, -1}.
* The classifier also by default implements L2 regularization. However, we first implement it
without regularization by setting alpha=0

# SKIP

In [1]:
# Imports:

import numpy as np
from pprint import pprint
from tempfile import mkdtemp
from shutil import rmtree

# to make this notebook output stable across the runs:
np.random.seed(42)

# sklearn specific imports
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifier, LogisticRegression
from sklearn.model_selection import cross_validate, RandomizedSearchCV, cross_val_predict
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import log_loss
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score

# scipy
from scipy.stats import loguniform

# to plot the pretty figures
%matplotlib inline
import matplotlib.pyplot as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# global settings
mpl.rc("axes", labelsize = 14)
mpl.rc('xtick', labelsize = 12)
mpl.rc('ytick', labelsize = 12)
mpl.rc('figure', figsize = (8,6))


In [2]:
# Ignore all the warnings (convergence ...) by sklearn

def warn(*args, **kwargs):
  pass

import warnings
warnings.warn = warn

In [3]:
from sklearn.datasets import fetch_openml
X_pd, y_pd = fetch_openml('mnist_784', version = 1, return_X_y = True)

In [4]:
# convert to numpy array

X = X_pd.to_numpy()
y = y_pd.to_numpy()

In [5]:
# Split the dataset into training and testing set.

x_train,x_test,y_train,y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [6]:
# intialize new variable names with all -1

y_train_0 = np.zeros((len(y_train)))
y_test_0 = np.zeros((len(y_test)))

In [7]:
# find indices of digit 0 image
indx_0 = np.where(y_train =='0')

In [8]:
# use those indices to modify y_train_0&y_test_0
y_train_0[indx_0] = 1
indx_0 = np.where(y_test == '0')
y_test_0[indx_0] = 1

* First take a look into the parameters of the class
```
RidgeClassifier(alpha=1.0, *, fit_intercept=True,
normalize='deprecated', copy_X=True, max_iter=None, tol=0.001,
class_weight=None, solver='auto', positive=False,
random_state=None)
```
* Note the parameter "normalize" is depreceated.

In [12]:
estimator = RidgeClassifier(alpha=0)
pipe_ridge = make_pipeline(MinMaxScaler(),estimator)
pipe_ridge.fit(x_train,y_train_0)

In [14]:
# performance

y_hat_test_0 = pipe_ridge.predict(x_test)
print(classification_report(y_test_0,y_hat_test_0))

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      9020
         1.0       0.95      0.88      0.91       980

    accuracy                           0.98     10000
   macro avg       0.97      0.94      0.95     10000
weighted avg       0.98      0.98      0.98     10000



# Cross Validation

In [15]:
cv_bin_ridge_clf = cross_validate(pipe_ridge,
                                  x_train, 
                                  y_train_0, 
                                  cv=5,
                                  scoring=['precision', 'recall', 'f1'],
                                  return_train_score=True,
                                  return_estimator=True)

pprint(cv_bin_ridge_clf)

{'estimator': [Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('ridgeclassifier', RidgeClassifier(alpha=0))]),
               Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('ridgeclassifier', RidgeClassifier(alpha=0))]),
               Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('ridgeclassifier', RidgeClassifier(alpha=0))]),
               Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('ridgeclassifier', RidgeClassifier(alpha=0))]),
               Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('ridgeclassifier', RidgeClassifier(alpha=0))])],
 'fit_time': array([19.37446809, 16.25350451, 22.19326687, 35.01197505, 15.46323895]),
 'score_time': array([0.16215324, 0.14446044, 0.49054337, 0.1531713 , 0.13908577]),
 'test_f1': array([0.91887202, 0.9031402 , 0.91507337, 0.90552585, 0.90917186]),
 'test_precision': array([0.94469224, 0.94800371, 0.96710526, 0.95939566, 0.96229972]),
 'test_re

In [16]:
best_estimator_id = np.argmax(cv_bin_ridge_clf['train_f1']); best_estimator_id

best_estimator = cv_bin_ridge_clf['estimator'][best_estimator_id]

Let's evaluate the performance of the best classifier on the test set:

In [17]:
y_hat_test_0 = best_estimator.predict(x_test)
print(classification_report(y_test_0,y_hat_test_0))

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      9020
         1.0       0.95      0.88      0.92       980

    accuracy                           0.98     10000
   macro avg       0.97      0.94      0.95     10000
weighted avg       0.98      0.98      0.98     10000



# further exploration

In [18]:
# chek the composite file
models = (pipe_sgd, pipe_sgd_l2, pipe_logit, pipe_ridge)
titles = ('sgd', 'regularized sgd', 'logit', 'ridge')
plt.figure(figsize=(4, 4))
plt.subplots(2, 2)
for i in range(0, 4):
  w = models[i][1].coef_
  w_matrix = w.reshape(28, 28)
  w_matrix[w_matrix < 0]=0 #just set the value less than zero to zero
  plt.subplot(2, 2, i+1)
  plt.imshow(w_matrix, cmap='gray')
  plt.title(titles[i])
  plt.axis('off')
  plt.grid(False)
fig.show()

NameError: ignored