In [1]:
from sklearn.datasets import fetch_openml
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix

import pickle

In [2]:
# method used for saving object as pickle
def save_object_as_pickle(obj, filename):
    with open(filename, 'wb') as file:
        pickle.dump(obj, file, pickle.HIGHEST_PROTOCOL)

In [3]:
mnist = fetch_openml('mnist_784', version=1)

In [4]:
print((np.array(mnist.data.loc[42]).reshape(28, 28) > 0).astype(int))

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 

In [5]:
print(type(mnist))

<class 'sklearn.utils.Bunch'>


In [6]:
X, y = mnist["data"], mnist["target"].astype(np.uint8)

In [7]:
print(X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Columns: 784 entries, pixel1 to pixel784
dtypes: float64(784)
memory usage: 418.7 MB
None


In [8]:
print(y.describe())

count    70000.000000
mean         4.452429
std          2.890195
min          0.000000
25%          2.000000
50%          4.000000
75%          7.000000
max          9.000000
Name: class, dtype: float64


In [9]:
y_sorted = y.sort_values()

In [10]:
print(y_sorted.index)

Int64Index([34999, 56424, 56419, 16705, 56415, 56404, 56397, 56389, 56388,
            56429,
            ...
            13698, 33531, 13695, 13692, 58898, 13687, 42651, 58914, 13678,
            58529],
           dtype='int64', length=70000)


In [11]:
print(y_sorted)

34999    0
56424    0
56419    0
16705    0
56415    0
        ..
13687    9
42651    9
58914    9
13678    9
58529    9
Name: class, Length: 70000, dtype: uint8


In [12]:
X_sorted = X.reindex(y)

In [13]:
print(X_sorted)

       pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  pixel9  \
class                                                                           
5         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
9         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
...       ...     ...     ...     ...     ...     ...     ...     ...     ...   
2         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
6         0.0     0.0     0.

In [14]:
X_train, X_test = X_sorted[:56000], X_sorted[56000:]
y_train, y_test = y_sorted[:56000], y_sorted[56000:]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(56000, 784) (56000,)
(14000, 784) (14000,)


In [15]:
print("y_train:", np.unique(y_train))
print("y_test:", np.unique(y_test))

y_train: [0 1 2 3 4 5 6 7]
y_test: [7 8 9]


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
print("y_train: ", np.unique(y_train))
print("y_test: ", np.unique(y_test))

y_train:  [0 1 2 3 4 5 6 7 8 9]
y_test:  [0 1 2 3 4 5 6 7 8 9]


In [18]:
sgd_clf1 = SGDClassifier()
sgd_clf2 = SGDClassifier()

y_train_0 = (y_train == 0)
y_test_0 = (y_test == 0)

In [19]:
print(np.unique(y_train_0))

[False  True]


In [20]:
sgd_clf1.fit(X_train, y_train_0)

SGDClassifier()

In [21]:
train_acc = sgd_clf1.score(X_train, y_train_0)
test_acc = sgd_clf1.score(X_test, y_test_0)

print("train_acc", train_acc)
print("test_acc", test_acc)
acc_list = list([train_acc, test_acc])
print(acc_list)

# save acc_list as pickle
save_object_as_pickle(acc_list, "sgd_acc.pkl")

train_acc 0.9918392857142857
test_acc 0.9916428571428572
[0.9918392857142857, 0.9916428571428572]


In [22]:
y_pred = sgd_clf1.predict(X_test)

cv_score = cross_val_score(sgd_clf1, X_train, y_train_0, cv=3, scoring="accuracy", n_jobs=-1)
print(cv_score)

# save cv_score as pickle
save_object_as_pickle(cv_score, "sgd_cva.pkl")

[0.98537526 0.98516098 0.98778528]


In [23]:
sgd_clf2.fit(X_train, y_train)
y_pred = sgd_clf2.predict(X_test)

conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# save conf_mat as pickle
save_object_as_pickle(conf_mat, "sgd_cmx.pkl")

[[1350    0    4    3    4    2   10    1    4    2]
 [   0 1480    6   18    0    5    7    5   14    2]
 [  16   18 1127   35   21    4   61   50   56    1]
 [   9    5   41 1281    3   32    7   41   15   18]
 [   1    5    3    3 1212   10   23   82    4   25]
 [  22    4    6   65   22  997   73   24   25    6]
 [   4    3    2    0   10   13 1345    5    3    1]
 [   2    5    6    4    6    3    1 1409    1    9]
 [   9   20    6   83   27  151   31  111  905   47]
 [   6    3    3   14   45   29    0  554    6  748]]


In [24]:
# check if pickles are saved correctly
print("acc_list", pd.read_pickle("sgd_acc.pkl"))
print("cs_score", pd.read_pickle("sgd_cva.pkl"))
print("conf_mat\n", pd.read_pickle("sgd_cmx.pkl"))

acc_list [0.9918392857142857, 0.9916428571428572]
cs_score [0.98537526 0.98516098 0.98778528]
conf_mat
 [[1350    0    4    3    4    2   10    1    4    2]
 [   0 1480    6   18    0    5    7    5   14    2]
 [  16   18 1127   35   21    4   61   50   56    1]
 [   9    5   41 1281    3   32    7   41   15   18]
 [   1    5    3    3 1212   10   23   82    4   25]
 [  22    4    6   65   22  997   73   24   25    6]
 [   4    3    2    0   10   13 1345    5    3    1]
 [   2    5    6    4    6    3    1 1409    1    9]
 [   9   20    6   83   27  151   31  111  905   47]
 [   6    3    3   14   45   29    0  554    6  748]]
