In [1]:
# Common imports
import numpy as np
import os
import io
import warnings

#sklearn specific imports
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.metrics import hinge_loss
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, precision_recall_curve
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate, cross_val_predict,GridSearchCV
from pprint import pprint
from sklearn.decomposition import PCA

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns

# Global matplotlib defaults: figure size plus axis-label and tick-label font sizes.
mpl.rcParams.update({
    'figure.figsize': (8, 6),
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Seed NumPy's global random number generator so outputs are stable across runs.
np.random.seed(42)

In [2]:
## Following definition helps us supress some warning messages. (Warning: we are purposefully
## supressing the warnings, not a good idea in general!).

# Ignore all warnings (like convergence..) by sklearn
def warn(*args, **kwargs):
  pass
warnings.warn = warn

In [3]:
# Fetch version 1 of the 'mnist_784' dataset from OpenML as a (data, labels) pair.
# Both come back as pandas objects; they are converted to NumPy arrays in the next cell.
X, y = fetch_openml(name='mnist_784', version=1, return_X_y=True)

  warn(


In [4]:
# Convert the pandas objects returned by fetch_openml into NumPy arrays.
# np.asarray is used rather than the .to_numpy() method because it also accepts an
# ndarray: re-running this cell is then a no-op instead of raising AttributeError.
X = np.asarray(X)
y = np.asarray(y)

In [5]:
# Show the dimensions of the feature matrix and of the label vector, one per line.
print(X.shape, y.shape, sep='\n')

(70000, 784)
(70000,)


In [22]:
# Display the feature row stored at index 2022.
X[2022]

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   

In [21]:
# What is the label of the sample at index 2022?
y[2022]

'2'

In [24]:
# What is the mean value of the sample at index 2022?
np.mean(X[2022])

27.761479591836736

In [47]:
# How many zeros are there in the sample?
# Tally every distinct value in sample 2022; the entry for value 0 answers the question.
unique, counts = np.unique(X[2022], return_counts=True)
{value: count for value, count in zip(unique, counts)}

{0.0: 643,
 1.0: 2,
 2.0: 1,
 3.0: 1,
 6.0: 5,
 10.0: 2,
 11.0: 1,
 14.0: 3,
 18.0: 1,
 20.0: 1,
 27.0: 1,
 29.0: 2,
 31.0: 1,
 36.0: 1,
 38.0: 1,
 43.0: 1,
 48.0: 3,
 53.0: 1,
 57.0: 1,
 60.0: 4,
 64.0: 2,
 65.0: 3,
 71.0: 1,
 72.0: 1,
 73.0: 1,
 78.0: 1,
 79.0: 2,
 81.0: 1,
 84.0: 1,
 87.0: 1,
 89.0: 1,
 99.0: 4,
 100.0: 1,
 108.0: 2,
 109.0: 1,
 119.0: 1,
 131.0: 1,
 133.0: 1,
 137.0: 1,
 138.0: 1,
 141.0: 1,
 149.0: 1,
 151.0: 1,
 153.0: 1,
 155.0: 1,
 169.0: 2,
 170.0: 1,
 175.0: 1,
 176.0: 1,
 185.0: 1,
 188.0: 1,
 198.0: 1,
 201.0: 1,
 205.0: 2,
 206.0: 2,
 207.0: 3,
 209.0: 1,
 218.0: 1,
 223.0: 2,
 224.0: 1,
 225.0: 1,
 227.0: 4,
 229.0: 1,
 230.0: 2,
 237.0: 1,
 239.0: 2,
 240.0: 2,
 242.0: 2,
 245.0: 1,
 246.0: 1,
 249.0: 2,
 251.0: 1,
 253.0: 2,
 254.0: 32}

In [48]:
# Build a smaller dataset from the original one: the first 10000 images form the
# training split and the next 2000 images form the test split.
# The questions in the following cells refer to these splits.
x_train, y_train = X[:10000], y[:10000]
x_test, y_test = X[10000:12000], y[10000:12000]

In [57]:
# How many distinct classes appear in the training labels?
np.unique(y_train).size

10

In [61]:
# Number of training samples whose label is the digit 6.
idx = np.nonzero(y_train == '6')
idx[0].size

1014

In [62]:
# Number of training samples whose label is the digit 9.
idx = np.nonzero(y_train == '9')
idx[0].size

978

In [64]:
# How many distinct classes appear in the test labels?
np.unique(y_test).size

10

In [69]:
# Which class has the largest number of test samples?
# Map every test label to the number of times it occurs.
unique, counts = np.unique(y_test, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{'0': 205,
 '1': 224,
 '2': 185,
 '3': 196,
 '4': 204,
 '5': 185,
 '6': 194,
 '7': 209,
 '8': 183,
 '9': 215}

In [72]:
# Collect all digit-6 (Positive class) and digit-9 (Negative class) images and stack them properly as a single datamatrix.
# By convention, keep all digit-6 images from index 0 to i followed by digit-9 images from index i+1 to n (i denotes the end index of digit-6 images)
# Similarly, collect the respective labels and store it in a variable (Do sanity check).
# Set the label values to 1 for positive classes and -1 for negative classes.

idx6_train = np.where(y_train == '6')
idx9_train = np.where(y_train == '9')
idx6_test = np.where(y_train == '6')
idx9_test = np.where(y_train == '9')

In [74]:
train_idx = np.concatenate((idx6_train, idx9_train), axis = None)
test_idx = np.concatenate((idx6_test, idx9_test), axis = None)

In [86]:
X_train_mod = X[train_idx]
X_test_mod = X[test_idx]

In [88]:
# Sanity check: number of training positions whose label is '6'.
len(idx6_train[0])

1014

In [90]:
# Sanity check: number of training positions whose label is '9'.
len(idx9_train[0])

978

In [92]:
# Label vectors matching the stacked matrices: +1.0 for each digit-6 entry, followed by
# -1.0 for each digit-9 entry. Lengths are taken from the index arrays.
y_train_mod = np.concatenate([
    np.full(len(idx6_train[0]), 1.0),
    np.full(len(idx9_train[0]), -1.0),
])
y_test_mod = np.concatenate([
    np.full(len(idx6_test[0]), 1.0),
    np.full(len(idx9_test[0]), -1.0),
])

In [95]:
from sklearn.utils import shuffle
from scipy.sparse import coo_matrix

# Sparse (COO) copy of the stacked training matrix.
X_sparse = coo_matrix(X_train_mod)

# Shuffle the dense matrix, the sparse matrix and the labels in a single call,
# with a fixed random_state.
X_s, X_sparse_s, y_s = shuffle(
    X_train_mod,
    X_sparse,
    y_train_mod,
    random_state=1729,
)

In [100]:
# What are the first three labels (indices 0, 1 and 2) of the shuffled label vector?
y_s[:3]

array([ 1., -1., -1.])

In [109]:
from sklearn.linear_model import Perceptron
# Perceptron on the shuffled 6-vs-9 data: max_iter=10, eta0=1, no penalty,
# an intercept term, and shuffle=False.
clf = Perceptron(
    penalty=None,
    fit_intercept=True,
    max_iter=10,
    eta0=1,
    shuffle=False,
    warm_start=True,
    random_state=1729,
)
clf.fit(X_s, y_s)

In [110]:
# Inspect entry [0, 69] of the fitted model's coefficient array.
clf.coef_[0,69]

605.0

In [114]:
# Inspect the intercept of the fitted model.
clf.intercept_

array([-6.])

In [118]:
from sklearn.linear_model import Perceptron
# Fit a fresh Perceptron for max_iter = 1, 2, ..., 5 (all other settings unchanged)
# and print the intercept obtained for each value.
for max_epochs in range(1, 6):
    clf = Perceptron(
        penalty=None,
        fit_intercept=True,
        max_iter=max_epochs,
        eta0=1,
        shuffle=False,
        warm_start=True,
        random_state=1729,
    ).fit(X_s, y_s)
    print(clf.intercept_)

[-1.]
[-4.]
[-4.]
[-6.]
[-5.]
