In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

In [2]:
print(type(mnist.data))
print(type(mnist.target))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [3]:
digit = mnist["data"][2].reshape(28,28)
digit = digit > 0
print(digit.astype(int))

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0]
 [0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0]
 [0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0]
 [0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0]
 [0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0]
 [0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0]
 [0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0]
 [0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0]
 [0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0]
 [0 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0]
 [0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 

In [4]:
X, y = mnist["data"], mnist["target"].astype(np.uint8)
print(X.shape, y.shape)

(70000, 784) (70000,)


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=42)


In [6]:
y_train_0 = (y_train == 0)
y_test_0 = (y_test == 0)
print(y_train_0)
print(np.unique(y_train_0))
print(len(y_train_0))

[False False False ... False  True  True]
[False  True]
56000


In [7]:
from sklearn.linear_model import SGDClassifier
import time

start = time.time()
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_0)

print(time.time() - start)

4.423332691192627


In [8]:
print(sgd_clf.predict([mnist["data"][0],
                      mnist["data"][1]]))

[False  True]


In [9]:
y_train_pred = sgd_clf.predict(X_train)
y_test_pred = sgd_clf.predict(X_test)

acc_train = sum(y_train_pred == y_train_0) / len(y_train_0)
acc_test = sum(y_test_pred == y_test_0) / len(y_test_0)

print(acc_train, acc_test)

0.9918571428571429 0.9903571428571428


In [10]:
import pickle

wyniki = [acc_train, acc_test]

with open('sgd_acc.pkl', 'wb') as file:
    pickle.dump(wyniki, file)

In [11]:
from sklearn.model_selection import cross_val_score

score = cross_val_score(sgd_clf, X_train, y_train_0, cv=3, scoring="accuracy", n_jobs=-1)
print(score)

[0.98687523 0.98762522 0.98649952]


In [12]:
with open('sgd_cva.pkl', 'wb') as file:
    pickle.dump(np.array(score), file)

In [13]:
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(X_train, y_train)
print(svm_clf.classes_)

[0 1 2 3 4 5 6 7 8 9]


In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

sgd_m_clf = SGDClassifier(random_state=42, n_jobs=-1, max_iter=2000)
sgd_m_clf.fit(X_train_scaled, y_train)

print(sgd_m_clf.predict([mnist["data"][0],
                         mnist["data"][1]]))

[3 0]


In [15]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

print(cross_val_score(sgd_m_clf, X_train_scaled, y_train, cv=3, scoring="accuracy", n_jobs=-1))

y_train_pred = cross_val_predict(sgd_m_clf, X_train_scaled, y_train, cv=3, n_jobs=-1)

conf_mx = confusion_matrix(y_train, y_train_pred)
print(conf_mx)

[0.91000161 0.90625167 0.90110361]
[[5289    0   15    8    8   38   40    6  155    1]
 [   1 5996   42   15    3   35    8   10  156   11]
 [  26   28 4980   79   65   26   56   43  291   16]
 [  23   18  117 4946    2  187   23   38  294   60]
 [  11   17   41    6 4985   12   43   27  216  171]
 [  37   16   26  147   45 4212   83   21  381   72]
 [  29   17   54    2   41   88 5133    4  112    0]
 [  18   16   64   20   44   10    5 5281  107  225]
 [  20   59   44   80    3  127   28    7 5047   53]
 [  25   17   24   59  128   35    1  149  245 4855]]


In [16]:
with open('sgd_cmx.pkl', 'wb') as file:
    pickle.dump(np.array(conf_mx), file)