In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from pathlib import Path
import gzip

In [12]:
data_path = Path("../data/hand-writing/")
train_path = data_path / "train-images-idx3-ubyte.gz"
train_labels_path = data_path / "train-labels-idx1-ubyte.gz"
# test_path = data_path / "t10k-images-idx3-ubyte.gz"
# test_labels_path = data_path / "t10k-labels-idx1-ubyte.gz"

## Modified read_mnist_data (Add PCA parameter)

In [38]:
def read_mnist_data(images_path, labels_path, num_images, shuffle=False, _is=True,image_size=28, pca=False, n_components=None):
    f_images = gzip.open(images_path, 'r')
    # skip 16 first bytes because these are not data, only header infor
    f_images.read(16)
    # general: read num_images data samples if this parameter is set;
    # if not, read all (60000 training or 10000 test)
    real_num = num_images if not shuffle else (60000 if _is else 10000)
    # read all data to buf_images (28x28xreal_num)
    buf_images = f_images.read(image_size * image_size * real_num)
    # images
    images = np.frombuffer(buf_images, dtype=np.uint8).astype(np.float32)
    images = images.reshape(real_num, image_size, image_size,)
    # Read labels
    f_labels = gzip.open(labels_path, 'r')
    f_labels.read(8)
    labels = np.zeros((real_num)).astype(np.int64)

    for i in range(0, real_num):
        buf_labels = f_labels.read(1)
        labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)

    if shuffle is True:
        rand_id = np.random.randint(real_num, size=num_images)
        images = images[rand_id, :]
        labels = labels[rand_id,]

    images = images.reshape(num_images, image_size * image_size)

    if (pca == True):
        pca = PCA(n_components=n_components)
        images = pca.fit_transform(images)

    return images, labels


In [33]:
images0, labels0 = read_mnist_data(train_path, train_labels_path, 5000, shuffle=True)
train_images0, test_images0, train_labels0, test_labels0 = train_test_split(images0, labels0, test_size=0.3, random_state=192)
# test_images0, test_labels0 = read_mnist_data(test_path, test_labels_path, 10000, _is=False, shuffle=True)

print(train_images0.shape)
print(test_labels0.shape)

(3500, 784)
(1500,)


  labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)


## Without PCA

In [34]:
model0 = LogisticRegression(multi_class="multinomial", max_iter=1000, solver="sag")
model0.fit(train_images0, train_labels0)



In [35]:
predict0 = model0.predict(test_images0)
predict0

array([0, 5, 5, ..., 8, 0, 4])

In [36]:
acc0 = accuracy_score(test_labels0, predict0)
cm0 = confusion_matrix(test_labels0, predict0)

print(acc0)
print(cm0)

0.872
[[138   0   1   1   1   4   2   1   1   0]
 [  0 171   1   2   0   2   0   0   2   0]
 [  0   6 119   6   3   3   3   2   3   0]
 [  0   1   7 111   0   9   2   2   5   2]
 [  1   2   2   0 125   1   0   1   2   8]
 [  5   1   0   7   3 102   3   1   5   1]
 [  3   2   1   0   3   2 135   0   0   1]
 [  1   1   5   1   0   0   0 154   5   3]
 [  2   3   5   5   5   9   0   1 111   1]
 [  1   1   1   3   5   1   0   6   0 142]]


## With PCA

In [39]:
pca_images, labels1 = read_mnist_data(train_path, train_labels_path, 5000, shuffle=True, pca=True, n_components=100)
# test_images1, test_labels1 = read_mnist_data(test_path, test_labels_path, 10000, _is=False, shuffle=True, pca=True, n_components

train_images1, test_images1, train_labels1, test_labels1 = train_test_split(pca_images, labels1, test_size=0.3, random_state=192)

print(train_images1.shape)
print(test_labels1.shape)

  labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)


(3500, 100)
(1500,)


In [40]:
model1 = LogisticRegression(multi_class="multinomial", max_iter=1000, solver="sag")
model1.fit(train_images1, train_labels1)



In [41]:
predict1 = model1.predict(test_images1)
predict1

array([7, 8, 5, ..., 6, 0, 7])

In [42]:
acc1 = accuracy_score(test_labels1, predict1)
cm1 = confusion_matrix(test_labels1, predict1)

print(acc1)
print(cm1)

0.8446666666666667
[[141   0   0   1   0   0   3   1   0   0]
 [  0 159   0   0   0   2   0   0   5   0]
 [  4   4 102   6   0   2   8   0   3   0]
 [  0   3   3 125   0   9   4   3   6   1]
 [  0   5   4   1 131   1   3   3   2  13]
 [  3   1   0   6   3  94   5   0   3   4]
 [  1   1   3   4   3   5 141   1   1   1]
 [  1   1   2   0   1   0   1 145   2   7]
 [  7   6   2   6   2   5   3   2  99   4]
 [  3   1   1   6   5   2   0  12   6 130]]


In [45]:
print(f"Accuracy without PCA: {acc0}\nAccuracy with PCA: {acc1}")
print(f"Confusion matrix without PCA:\n {cm0}\nConfusion matrix with PCA:\n {cm1}")

Accuracy without PCA: 0.872
Accuracy with PCA: 0.8446666666666667
Confusion matrix without PCA:
 [[138   0   1   1   1   4   2   1   1   0]
 [  0 171   1   2   0   2   0   0   2   0]
 [  0   6 119   6   3   3   3   2   3   0]
 [  0   1   7 111   0   9   2   2   5   2]
 [  1   2   2   0 125   1   0   1   2   8]
 [  5   1   0   7   3 102   3   1   5   1]
 [  3   2   1   0   3   2 135   0   0   1]
 [  1   1   5   1   0   0   0 154   5   3]
 [  2   3   5   5   5   9   0   1 111   1]
 [  1   1   1   3   5   1   0   6   0 142]]
Confusion matrix with PCA:
 [[141   0   0   1   0   0   3   1   0   0]
 [  0 159   0   0   0   2   0   0   5   0]
 [  4   4 102   6   0   2   8   0   3   0]
 [  0   3   3 125   0   9   4   3   6   1]
 [  0   5   4   1 131   1   3   3   2  13]
 [  3   1   0   6   3  94   5   0   3   4]
 [  1   1   3   4   3   5 141   1   1   1]
 [  1   1   2   0   1   0   1 145   2   7]
 [  7   6   2   6   2   5   3   2  99   4]
 [  3   1   1   6   5   2   0  12   6 130]]
