In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

**Objective**

For this lab, we're going to see if we can apply PCA on the MNIST dataset to reduce the feature space dimensionality, and still get good performance.

---

We begin by loading the MNIST dataset, just as we did in the MNIST lab.

In [None]:
import struct

def load_mnist_images(filename):
    """Read an image file with a 16-byte header into a 3-D uint8 array.

    The header is decoded as four big-endian unsigned 32-bit integers. The
    first is ignored (presumably the IDX magic number); the remaining three
    give the number of images, rows and columns. Every byte after the header
    is treated as pixel data.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    numpy.ndarray
        Array of dtype uint8 with shape (num, rows, cols).

    Raises
    ------
    struct.error
        If fewer than 16 header bytes are available.
    ValueError
        If the payload size does not match num * rows * cols.
    """
    with open(filename, 'rb') as stream:
        header = stream.read(16)
        _unused, count, height, width = struct.unpack(">IIII", header)
        # Everything after the header is raw pixel data, one byte per pixel.
        pixels = np.fromfile(stream, dtype=np.uint8)
        return pixels.reshape(count, height, width)

def load_mnist_labels(filename):
    """Read a label file with an 8-byte header into a 1-D uint8 array.

    The header is decoded as two big-endian unsigned 32-bit integers; neither
    value is used, so the result contains every byte that follows the header.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype uint8, one entry per byte after the header.

    Raises
    ------
    struct.error
        If fewer than 8 header bytes are available.
    """
    with open(filename, 'rb') as stream:
        # Decode the header only to consume it (and to fail on a short file);
        # the decoded values themselves are not needed.
        struct.unpack(">II", stream.read(8))
        labels = np.fromfile(stream, dtype=np.uint8)
    return labels

# Training split: images first, then their labels
train_images, train_labels = (
    load_mnist_images("../data/mnist/train-images.idx3-ubyte"),
    load_mnist_labels("../data/mnist/train-labels.idx1-ubyte"),
)

# Test split, loaded the same way
test_images, test_labels = (
    load_mnist_images("../data/mnist/t10k-images.idx3-ubyte"),
    load_mnist_labels("../data/mnist/t10k-labels.idx1-ubyte"),
)

Let's plot a random image.

In [None]:
# Pick one training image uniformly at random and render it in grayscale.
num_images = train_images.shape[0]
random_index = np.random.randint(num_images)   # draws from [0, num_images)

plt.imshow(train_images[random_index], cmap='gray')
plt.axis('off')
plt.show()

Note that a lot of the pixels are completely black (grayscale value of 0), and don't add any relevant information at all.

*Hypothesis*: PCA should, while identifying the important pieces of information and reducing dimensionality, get rid of precisely the redundant information provided by these pixels.

---

Let's transform our data into dataframes, and scale them (important for PCA)

In [None]:
# Flatten every image into a single row of pixel features. A single reshape
# does this without looping over the images in Python; the previous
# list-of-lists copy of the data was never used and has been dropped.
train_images_flattened_array = train_images.reshape(len(train_images), -1)

X_train = pd.DataFrame(train_images_flattened_array)

X_train = X_train / 255                                                       # divide by the maximum grayscale value

X_train

In [None]:
# Keep the labels as a flat (n_samples,) array. Routing them through a
# DataFrame produced an (n_samples, 1) column vector, which scikit-learn
# classifiers accept only with a DataConversionWarning.
y_train = np.asarray(train_labels).ravel()

y_train

In [None]:
# Flatten every image into a single row of pixel features. A single reshape
# does this without looping over the images in Python; the previous
# list-of-lists copy of the data was never used and has been dropped.
test_images_flattened_array = test_images.reshape(len(test_images), -1)

X_test = pd.DataFrame(test_images_flattened_array)

X_test = X_test / 255                                                         # divide by the maximum grayscale value

X_test

In [None]:
# Keep the labels as a flat (n_samples,) array so they have the same shape as
# the 1-D predictions they are compared against; the DataFrame round trip
# produced an (n_samples, 1) column vector instead.
y_test = np.asarray(test_labels).ravel()

y_test

---

Let's now define our PCA. 

**Important**: To avoid data leakage, we must fit our PCA on the train data only. We then transform both the train and test data with what has been fitted on the train data.

In [None]:
from sklearn.decomposition import PCA

Make a guess as to how many dimensions we should reduce our feature space to.

*Hint*: take a look at the plots above. How many pixels do you think are redundant (useless)?

In [None]:
# Number of principal components to keep; passed to PCA as n_components
# when the PCA is constructed.
number_of_principal_components = 700          # your guess here

In [None]:
# Fit the PCA on the training data only. fit() returns the estimator itself,
# so `pca` and `pca_transformer` refer to the same fitted object.
pca_transformer = PCA(n_components=number_of_principal_components).fit(X_train)
pca = pca_transformer

In [None]:
# Project the train and test features with the PCA fitted on the train data
X_train_pca, X_test_pca = (pca.transform(split) for split in (X_train, X_test))

# Wrap the projected arrays in DataFrames
X_train_reduced, X_test_reduced = map(pd.DataFrame, (X_train_pca, X_test_pca))


In [None]:
# Display the PCA-transformed training features as the cell output.
X_train_reduced

---

Train a classifier (for example, a KNeighborsClassifier), using the newly obtained principal components as features.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score

In [None]:
# Fit a k-nearest-neighbours classifier (default settings) on the reduced
# training features; fit() returns the estimator, which is then displayed.
knn = KNeighborsClassifier().fit(X_train_reduced, y_train)
knn

---

Evaluate the trained model on the test set

In [None]:
# Predict a label for every row of the PCA-reduced test features.
knn_pred = knn.predict(X_test_reduced)

In [None]:
# Confusion matrix of true vs. predicted labels on the test set
predicted_column = knn_pred.reshape(-1, 1)
cm = confusion_matrix(y_true=y_test, y_pred=predicted_column)

# One tick label per digit class, 0 through 9
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=list(range(10)),
)

cm_display.plot()
plt.show()

In [None]:
# Compute both metrics first, then print them.
accuracy = accuracy_score(y_true=y_test, y_pred=knn_pred)
report = classification_report(y_true=y_test, y_pred=knn_pred)

print("Accuracy:", accuracy, end='\n\n')
print("Classification Report:\n", report)

---

## Challenges

**Task 1**

Now you go ahead and try different values for the number of dimensions to reduce your feature space to. 

How low can you reduce the feature space dimensionality, and still get good performance?

Plot accuracy as a function of the number of principal components. Does the result make sense?

In [None]:
# Test accuracy of a default KNN classifier for each PCA dimensionality below.
accuracies = []

number_of_principal_components = [1,2,3,5,10,15,20,30,50,70,100,150,250,400,550,700]

# Scikit-learn classifiers expect 1-D label arrays; ravel() leaves labels
# that are already 1-D unchanged.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

for number in number_of_principal_components:

    # Fit the PCA on the train set only, to avoid leaking test information.
    # random_state only matters when PCA selects a randomized solver; it
    # makes the sweep reproducible in that case.
    pca = PCA(n_components=number, random_state=0)
    pca.fit(X_train)

    # Transform both train and test sets using the PCA fitted on train
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)

    knn = KNeighborsClassifier()
    knn.fit(X_train_pca, y_train_flat)

    # Predict BEFORE scoring: the accuracy must be computed from this
    # iteration's model, not from the predictions of the previous one.
    knn_pred = knn.predict(X_test_pca)
    accuracy = accuracy_score(y_test_flat, knn_pred)

    accuracies.append(accuracy)

In [None]:
# Print the accuracy obtained for each number of principal components
for pc, acc in zip(number_of_principal_components, accuracies):
    print(f"Number of Principal Components: {pc}, Accuracy: {acc}")

# Accuracy as a function of the number of principal components
fig, ax = plt.subplots()
ax.scatter(number_of_principal_components, accuracies)
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Total accuracy')
ax.set_title('Test accuracy vs. number of principal components')
plt.show()   # also stops the last call's Text(...) repr from being displayed