<a href="https://colab.research.google.com/github/perceptronnn/shared_notebooks/blob/master/mnist_5nn_dectree_pca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Getting MNIST dataset
Referred: https://pytorch.org/tutorials/beginner/nn_tutorial.html

In [24]:
from pathlib import Path
import requests

In [25]:
DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"

PATH.mkdir(parents=True, exist_ok=True)

In [3]:
URL = "http://deeplearning.net/data/mnist/"
FILENAME = "mnist.pkl.gz"

In [4]:
if not (PATH / FILENAME).exists():
        content = requests.get(URL + FILENAME).content
        (PATH / FILENAME).open("wb").write(content)

In [5]:
import pickle
import gzip

with gzip.open((PATH / FILENAME).as_posix(), "rb") as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding="latin-1")

In [6]:
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

((50000, 784), (50000,), (10000, 784), (10000,))

In [7]:
type(x_train)

numpy.ndarray

In [8]:
y_train[0:5]

array([5, 0, 4, 1, 9])

In [31]:
import numpy as np

def get_accuracy(y_pred, y_actual = y_valid):
  diff_count = np.count_nonzero(y_actual - y_pred)
  return 100 - (diff_count * 100/ len(y_pred))

# 5-Nearest Neighbor


In [11]:
from sklearn import neighbors

In [12]:
%%time

clf_5nn = neighbors.KNeighborsClassifier()
clf_5nn.fit(x_train, y_train)

CPU times: user 25.2 s, sys: 161 ms, total: 25.3 s
Wall time: 25.3 s


In [13]:
%%time

y_pred_5nn = clf_5nn.predict(x_valid)

CPU times: user 14min 25s, sys: 239 ms, total: 14min 25s
Wall time: 14min 26s


In [32]:
accuracy_5nn = get_accuracy(y_pred_5nn)
print(accuracy_5nn)

97.18


# Decision Tree

In [15]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
%%time

clf_dt = DecisionTreeClassifier(random_state = 0)
clf_dt.fit(x_train, y_train)

CPU times: user 23.2 s, sys: 2.99 ms, total: 23.2 s
Wall time: 23.2 s


In [17]:
%%time

y_pred_dt = clf_dt.predict(x_valid)

CPU times: user 58.2 ms, sys: 0 ns, total: 58.2 ms
Wall time: 62.8 ms


In [33]:
accuracy_dt = get_accuracy(y_pred_dt)
print(accuracy_dt)

88.11


# PCA 
Referred https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

In [19]:
from sklearn.decomposition import PCA

In [40]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [41]:
%%time 

scaler.fit(x_train)

CPU times: user 531 ms, sys: 4 ms, total: 535 ms
Wall time: 539 ms


StandardScaler(copy=True, with_mean=True, with_std=True)

In [51]:
%%time 

pca90 = PCA(.90)
pca90.fit(x_train)

pca95 = PCA(.95)
pca95.fit(x_train)

pca99 = PCA(.99)
pca99.fit(x_train)

CPU times: user 30.7 s, sys: 1.64 s, total: 32.3 s
Wall time: 17.4 s


In [53]:
print(len(pca90.explained_variance_ratio_))
print(len(pca95.explained_variance_ratio_))
print(len(pca99.explained_variance_ratio_))

87
154
331


In [54]:
print(pca95.explained_variance_ratio_)

[0.09744383 0.07059848 0.06216133 0.05379485 0.04858056 0.04319614
 0.03277574 0.02886385 0.02768836 0.02365931 0.0209952  0.02020407
 0.01715791 0.01681795 0.01579267 0.01492753 0.01318167 0.01276553
 0.01186514 0.0115102  0.01069107 0.01007499 0.00953629 0.00907861
 0.00882788 0.0083695  0.00817541 0.00784796 0.00742014 0.00690706
 0.00658563 0.00643653 0.00602719 0.00585303 0.00569431 0.00541853
 0.00507309 0.00488394 0.00482054 0.0047335  0.00455972 0.00441411
 0.00416428 0.00396221 0.00385332 0.00374064 0.00361329 0.00350469
 0.0034027  0.00321046 0.00318202 0.00310987 0.00294572 0.00288291
 0.00285743 0.00270759 0.00269223 0.00256373 0.00253896 0.00243535
 0.00240497 0.00237759 0.0022913  0.0022173  0.00212807 0.00206764
 0.00203552 0.00196812 0.00192869 0.00188608 0.00187148 0.0018114
 0.00177256 0.00174636 0.00164976 0.00163645 0.0016148  0.00153505
 0.00147434 0.00143104 0.00141426 0.00140905 0.00139826 0.00135518
 0.00133949 0.00132057 0.00129865 0.00125902 0.00122278 0.00121

In [55]:
print(pca95.explained_variance_ratio_.cumsum())

[0.09744383 0.1680423  0.23020363 0.2839985  0.33257905 0.3757752
 0.40855092 0.43741477 0.46510312 0.48876244 0.50975764 0.5299617
 0.5471196  0.56393754 0.5797302  0.5946577  0.6078394  0.62060493
 0.6324701  0.64398026 0.6546713  0.6647463  0.67428255 0.6833612
 0.69218904 0.70055854 0.708734   0.71658194 0.72400206 0.7309091
 0.73749477 0.7439313  0.74995846 0.7558115  0.76150584 0.7669244
 0.77199745 0.7768814  0.7817019  0.7864354  0.7909951  0.7954092
 0.7995735  0.8035357  0.807389   0.8111297  0.814743   0.8182477
 0.8216504  0.8248609  0.82804286 0.83115274 0.83409846 0.83698136
 0.8398388  0.8425464  0.8452386  0.84780234 0.8503413  0.85277665
 0.85518163 0.8575592  0.8598505  0.8620678  0.8641959  0.8662635
 0.868299   0.87026715 0.87219584 0.8740819  0.8759534  0.87776476
 0.87953734 0.8812837  0.88293344 0.8845699  0.8861847  0.88771975
 0.8891941  0.8906251  0.89203936 0.8934484  0.8948467  0.89620185
 0.89754134 0.89886194 0.9001606  0.90141964 0.9026424  0.90385354
 0.