### Import Libraries and Data

In [None]:
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import random
from tqdm.notebook import tqdm

In [2]:
# Load the MNIST MATLAB file; with appendmat=True SciPy appends the ".mat"
# extension to the given name when it is not already present.
mat_contents = scipy.io.loadmat('mnist', appendmat=True)

# Pull the four arrays of interest out of the loaded dictionary in one pass.
train_x, train_y, test_x, test_y = (
    np.array(mat_contents[key])
    for key in ('trainX', 'trainY', 'testX', 'testY')
)

### Helper Functions (normalization, error, confusion matrices, etc.)

In [3]:
def normalize(data):
    """Divide ``data`` by 255.0, mapping the range [0, 255] onto [0, 1].

    Args:
        data: A number or array supporting true division by a float.

    Returns:
        ``data / 255.0``.
    """
    max_intensity = 255.0
    return data / max_intensity

def pinv(data):
    """Return the pseudo-inverse of ``data`` as computed by ``np.linalg.pinv``.

    Args:
        data: Matrix (array-like) to invert.

    Returns:
        The result of ``np.linalg.pinv(data)``.
    """
    pseudo_inverse = np.linalg.pinv(data)
    return pseudo_inverse

def add_bias(data):
    """Append a trailing column of ones to ``data``.

    Args:
        data: Sized array-like; ``len(data)`` determines how many ones are added.

    Returns:
        The array produced by ``np.column_stack`` with the ones as last column.
    """
    n_samples = len(data)
    bias_column = np.ones(n_samples)
    return np.column_stack((data, bias_column))

In [4]:
def plot_confusion_matrix(conf_matrix, classes, title='Confusion Matrix', cmap=plt.cm.Greens):
    """Draw a confusion matrix as an annotated heat map and display it.

    Rows are labelled as true labels and columns as predicted labels.

    Args:
        conf_matrix: 2-D array providing ``.max()``, ``.shape`` and ``[i, j]``
            indexing; every entry is rendered with the ``'d'`` format code.
        classes: Sequence of tick labels, used on both axes.
        title: Title drawn above the plot.
        cmap: Colormap handed to ``plt.imshow``.
    """
    plt.imshow(conf_matrix, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # Same tick positions on both axes; only the x labels are rotated.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')

    # Cells above half of the maximum value get white text, the rest black.
    midpoint = conf_matrix.max() / 2.
    for row in range(conf_matrix.shape[0]):
        for col in range(conf_matrix.shape[1]):
            cell = conf_matrix[row, col]
            text_color = "white" if cell > midpoint else "black"
            # x is the column index and y the row index.
            plt.text(col, row, format(cell, 'd'),
                     ha="center", va="center",
                     color=text_color)

    plt.tight_layout()
    plt.show()

def accuracy_score(y_true, y_pred):
    """Return the percentage (0-100) of predictions equal to the true labels.

    Both inputs are flattened before the comparison, so row vectors
    ``(1, N)``, column vectors ``(N, 1)`` and flat sequences can be mixed.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels with the same number of elements.

    Returns:
        float: ``100 * matches / total``.

    Raises:
        ValueError: If the inputs hold a different number of labels.
        ZeroDivisionError: If ``y_true`` is empty.
    """
    true_labels = np.asarray(y_true).ravel()
    predicted_labels = np.asarray(y_pred).ravel()

    # A silent truncation here would report a misleading score.
    if true_labels.size != predicted_labels.size:
        raise ValueError(
            "y_true and y_pred must contain the same number of labels "
            "({} != {})".format(true_labels.size, predicted_labels.size)
        )

    # int() keeps the arithmetic in plain Python so a built-in float is returned.
    matches = int(np.count_nonzero(true_labels == predicted_labels))
    return 100 * matches / true_labels.size

def calculate_accuracy_per_class(y_true, y_pred):
    """Print the accuracy for each class in ``y_true``, best class first.

    For every distinct label in ``y_true`` the accuracy is the fraction of
    samples with that true label whose prediction equals the label.

    Args:
        y_true: Array-like of ground-truth labels; flattened before use.
        y_pred: Array-like of predicted labels; flattened before use and
            expected to have as many elements as ``y_true``.

    Returns:
        None. One line per class is written to standard output.
    """
    # ravel() rather than squeeze(): squeeze turns a single-sample (1, 1) input
    # into a 0-d array, which cannot be indexed with a boolean mask. Flattening
    # once up front also avoids redoing the work for every class.
    true_labels = np.asarray(y_true).ravel()
    predicted_labels = np.asarray(y_pred).ravel()

    accuracies = []
    for class_label in np.unique(true_labels):
        class_mask = true_labels == class_label
        total_predictions = np.sum(class_mask)

        if total_predictions > 0:
            correct_predictions = np.sum(predicted_labels[class_mask] == class_label)
            accuracies.append((class_label, correct_predictions / total_predictions))
        else:
            # No sample compares equal to this label (e.g. a NaN label).
            accuracies.append((class_label, 0.0))

    # Highest accuracy first; ties keep the order produced by np.unique.
    accuracies.sort(key=lambda entry: entry[1], reverse=True)

    for class_label, accuracy in accuracies:
        print("{}: {:.2%} accurate".format(class_label, accuracy))

### Computing Gradients

A more general form for $f_{\mathbf{w}}(\mathbf{x})$ provided in (2) is

$$ f_{\mathbf{w}}(\mathbf{x}) = \sum_{i = 1}^k \alpha_i \phi (a_i^T\mathbf{x} + b_i) + \beta \qquad \forall \mathbf{x} \in \mathbb{R}^n $$

where $\phi(x): \mathbb{R} \rightarrow \mathbb{R}$ is an arbitrary activation function. The set of parameters $\mathbf{w} \in \mathbb{R}^p$ is defined as

$$\mathbf{w} = \{ \alpha_1, \cdots, \alpha_k, \beta, a_1, \cdots, a_k, b_1, \cdots, b_k\}$$

where $p = (n + 1)k + k + 1$. Lastly, the individual parameters satisfy $\alpha_i \in \mathbb{R}$, $\beta \in \mathbb{R}$, $a_i \in \mathbb{R}^n$, and $b_i \in \mathbb{R}$. In (2), we have $n = k = 3$ and $p = 16$.

Taking the partial derivative of $f_{\mathbf{w}}(\mathbf{x})$ with respect to each parameter gives the gradient $\nabla_{\mathbf{w}}f_{\mathbf{w}}(\mathbf{x})$, defined as

$$ \nabla_{\mathbf{w}}f_{\mathbf{w}}(\mathbf{x}) = 
\begin{bmatrix}
\frac{\partial f_{\mathbf{w}}(\mathbf{x})}{\partial w_1} \\
\frac{\partial f_{\mathbf{w}}(\mathbf{x})}{\partial w_2} \\
\vdots \\
\frac{\partial f_{\mathbf{w}}(\mathbf{x})}{\partial w_p} \\
\end{bmatrix}
$$
where
$$
\frac{\partial f_{\mathbf{w}}(\mathbf{x})}{\partial w_m} =
\begin{cases}
    \phi(a_i^T\mathbf{x} + b_i)            & \text{if } w_m = \alpha_i \text{ for some } i \\
    1                             & \text{if } w_m = \beta \\
    \alpha_i\phi'(a_i^T\mathbf{x} + b_i)x_j & \text{if } w_m = a_{ij} \text{ for some } (i,j) \\
    \alpha_i\phi'(a_i^T\mathbf{x} + b_i)    & \text{if } w_m = b_i \text{ for some } i \\
\end{cases}
$$

Here, $\phi(x): \mathbb{R} \rightarrow \mathbb{R}$ and its derivative $\phi'(x)$ are defined as
$$ \begin{aligned}
    \phi(x) &= \frac{e^x - e^{-x}}{e^x + e^{-x}}                     && = \tanh(x)  \\
   \phi'(x) &= 1 - \Bigl( \frac{e^x - e^{-x}}{e^x + e^{-x}} \Bigr)^2 && = 1 - \tanh^2(x)  \\
   \end{aligned} $$