In [2]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [3]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [4]:
X = torch.rand(1, 28, 28, device=device)

In [5]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512), # Input layer
            nn.ReLU(),
            nn.Linear(512, 512), # Hidden Layer
            nn.ReLU(),
            nn.Linear(512, 10), # Output Layer
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [6]:
model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [7]:
logits = model(X)

In [8]:
pred_probab = nn.Softmax(dim=1)(logits)
y_pred = pred_probab.argmax(1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([8])


In [9]:
input_image = torch.rand(3,28,28)
print(input_image.size())

torch.Size([3, 28, 28])


In [10]:
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(flat_image.size())

torch.Size([3, 784])


In [11]:
layer1 = nn.Linear(in_features=28*28, out_features=20)
hidden1 = layer1(flat_image)
print(hidden1.size())

torch.Size([3, 20])


In [12]:
print(f"Before ReLU: {hidden1}\n\n")
hidden1 = nn.ReLU()(hidden1)
print(f"After ReLU: {hidden1}")

Before ReLU: tensor([[-0.0680, -0.3143, -0.3142, -0.2266,  0.2366,  0.0488, -0.1937, -0.4010,
          0.0848, -0.0863,  0.1842,  0.0836,  0.4052,  0.6863,  0.0489, -0.1262,
          0.2827, -0.2461, -0.8536, -0.3966],
        [-0.2329, -0.4751, -0.2644, -0.2485,  0.0110,  0.2468,  0.2888, -0.4714,
          0.0653, -0.2343,  0.3812,  0.0938,  0.3589,  0.4920,  0.1791, -0.0698,
          0.0288,  0.1600, -0.8226, -0.4423],
        [ 0.0612, -0.2264, -0.1887, -0.0633, -0.1720,  0.0297,  0.0690, -0.0943,
          0.3997, -0.0063,  0.4977,  0.5062,  0.2971,  0.9101,  0.3355,  0.0861,
          0.3104, -0.4624, -0.7838, -0.0402]], grad_fn=<AddmmBackward0>)


After ReLU: tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.2366, 0.0488, 0.0000, 0.0000, 0.0848,
         0.0000, 0.1842, 0.0836, 0.4052, 0.6863, 0.0489, 0.0000, 0.2827, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0110, 0.2468, 0.2888, 0.0000, 0.0653,
         0.0000, 0.3812, 0.0938, 0.3589, 0.4920, 0.17

In [13]:
seq_modules = nn.Sequential(
    flatten,
    layer1,
    nn.ReLU(),
    nn.Linear(20, 10)
)
input_image = torch.rand(3,28,28)
logits = seq_modules(input_image)

In [14]:
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)

In [15]:
pred_probab

tensor([[0.1072, 0.0935, 0.1104, 0.0719, 0.0887, 0.0989, 0.1091, 0.0981, 0.0991,
         0.1231],
        [0.1019, 0.0930, 0.1054, 0.0739, 0.0884, 0.1022, 0.1021, 0.0958, 0.1106,
         0.1268],
        [0.1135, 0.0815, 0.0970, 0.0746, 0.0820, 0.1087, 0.1110, 0.0811, 0.1184,
         0.1321]], grad_fn=<SoftmaxBackward0>)

In [16]:
print(f"Model structure: {model}\n\n")

for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values : tensor([[ 0.0164, -0.0220,  0.0261,  ..., -0.0349,  0.0123, -0.0068],
        [-0.0127,  0.0111,  0.0004,  ..., -0.0162, -0.0037,  0.0121]],
       grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values : tensor([-0.0217,  0.0203], grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values : tensor([[-0.0150,  0.0295,  0.0251,  ...,  0.0365, -0.0089, -0.0015],
        [-0.0184, -0.0133, -0.0341,  ..., -0.0037,  0.0300, -0.0361]],
       grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.bias | 

In [17]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# Get Device for Training
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [18]:
# Download training data from open datasets.
training_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 81058492.22it/s]


Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 15323933.44it/s]


Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 22051115.64it/s]


Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 16522574.82it/s]

Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw






In [19]:
# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)

for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64


In [20]:
import plotly.express as px

images, labels = next(iter(train_dataloader)) # Load all images and their labels

# print a single image in train dataset
px.imshow(images[0].reshape(28,28), color_continuous_scale="gray") 

In [21]:
# print the label for the above image from train dataset
print(labels[0]) # Print 

tensor(5)


In [22]:
# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_model = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_model(x)
        return logits

model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_model): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [42]:
loss_fn = nn.CrossEntropyLoss() # Loss Function
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) #Optimizer
batch_size = 64
epochs=5

size = len(train_dataloader.dataset)
model.train()

for e in range(epochs):
  for batch, (X, y) in enumerate(train_dataloader):
    X, y = X.to(device), y.to(device)

    # Compute prediction error
    pred = model(X)
    loss = loss_fn(pred, y)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if batch % 100 == 0:
      loss, current = loss.item(), (batch + 1) * len(X)
      print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

loss: 0.346276  [   32/60000]
loss: 0.390695  [ 3232/60000]
loss: 0.344261  [ 6432/60000]
loss: 0.257215  [ 9632/60000]
loss: 0.233016  [12832/60000]
loss: 0.435435  [16032/60000]
loss: 0.236945  [19232/60000]
loss: 0.215993  [22432/60000]
loss: 0.224045  [25632/60000]
loss: 0.142885  [28832/60000]
loss: 0.365377  [32032/60000]
loss: 0.252416  [35232/60000]
loss: 0.262634  [38432/60000]
loss: 0.400013  [41632/60000]
loss: 0.371546  [44832/60000]
loss: 0.200286  [48032/60000]
loss: 0.348758  [51232/60000]
loss: 0.204124  [54432/60000]
loss: 0.293520  [57632/60000]
loss: 0.330011  [   32/60000]
loss: 0.384511  [ 3232/60000]
loss: 0.338877  [ 6432/60000]
loss: 0.250199  [ 9632/60000]
loss: 0.224786  [12832/60000]
loss: 0.421661  [16032/60000]
loss: 0.232005  [19232/60000]
loss: 0.208122  [22432/60000]
loss: 0.216344  [25632/60000]
loss: 0.138288  [28832/60000]
loss: 0.361872  [32032/60000]
loss: 0.247137  [35232/60000]
loss: 0.258327  [38432/60000]
loss: 0.394129  [41632/60000]
loss: 0.36

In [24]:
# Load all test images and their labels
test_imgs, test_labels = next(iter(test_dataloader)) 

In [25]:
# print a single image and it's label
img_0 = test_imgs[0].reshape(28,28)
px.imshow(img_0, color_continuous_scale="gray")

In [26]:
# Print actual label
label_0 = test_labels[0]
print(label_0) 

tensor(7)


In [27]:
img_0 = img_0.reshape(1,28,28)

# Print predicted label
pred= model(img_0)
pred_probab = nn.Softmax(dim=1)(pred)
img_0_pred_label = pred_probab.argmax(1)
print(img_0_pred_label)

tensor([7])


In [28]:
# print a single image and it's label
img_63 = test_imgs[63].reshape(28,28)
px.imshow(img_63, color_continuous_scale="gray")

In [29]:
# Print actual label
label_63 = test_labels[63]
print(label_63) 

tensor(3)


In [30]:
img_63 = img_63.reshape(1,28,28)

# Print predicted label
pred= model(img_63)
pred_probab = nn.Softmax(dim=1)(pred)
img_63_pred_label = pred_probab.argmax(1)
print(img_63_pred_label)

tensor([2])


In [31]:
size = len(test_dataloader.dataset)
num_batches = len(test_dataloader)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
  for X, y in test_dataloader:
    X, y = X.to(device), y.to(device)
    pred = model(X)
    test_loss += loss_fn(pred, y).item()
    correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= num_batches
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

Test Error: 
 Accuracy: 72.7%, Avg loss: 1.638563 



In [32]:
torch.save(model.state_dict(), "model.pth")
print("Saved PyTorch Model State to model.pth")

Saved PyTorch Model State to model.pth


In [33]:
# print a single image and it's label
img_1 = test_imgs[1].reshape(28,28)
px.imshow(img_1, color_continuous_scale="gray")

In [34]:
# Print actual label
label_1 = test_labels[1]
print(label_1) 

tensor(2)


In [35]:
img_1 = img_1.reshape(1,28,28)

# Print predicted label
pred= model(img_1)
pred_probab = nn.Softmax(dim=1)(pred)
img_1_pred_label = pred_probab.argmax(1)
print(img_1_pred_label)

tensor([2])


## TASK I


a. Report the results after increasing the batch size to 32 and the number of epochs to 10. In the **Tutorial Completion Document**, record your observations (e.g., a change in accuracy).

In [37]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# Get Device for Training
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

# Read or Load the input data
# Download training data from open datasets.
training_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

# Creating a Model
# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_model = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_model(x)
        return logits

model = NeuralNetwork().to(device)

# Traing the model with traing data
loss_fn = nn.CrossEntropyLoss() # Loss Function
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) #Optimizer

# Updated batch 32 and epoch 10
batch_size = 32
epochs = 10

# Create dataloaders for training and testing data
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

size = len(train_dataloader.dataset)
model.train()

for t in range(epochs):
  for batch, (X, y) in enumerate(train_dataloader):
    X, y = X.to(device), y.to(device)

    # Compute prediction error
    pred = model(X)
    loss = loss_fn(pred, y)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if batch % 100 == 0:
      loss, current = loss.item(), (batch + 1) * len(X)
      print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

# Testing or Evaluating the model with testing data
size = len(test_dataloader.dataset)
num_batches = len(test_dataloader)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
  for X, y in test_dataloader:
    X, y = X.to(device), y.to(device)
    pred = model(X)
    test_loss += loss_fn(pred, y).item()
    correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= num_batches
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

Using cpu device
loss: 2.302252  [   32/60000]
loss: 2.302940  [ 3232/60000]
loss: 2.292367  [ 6432/60000]
loss: 2.292908  [ 9632/60000]
loss: 2.299636  [12832/60000]
loss: 2.285648  [16032/60000]
loss: 2.258352  [19232/60000]
loss: 2.268222  [22432/60000]
loss: 2.264199  [25632/60000]
loss: 2.246337  [28832/60000]
loss: 2.250418  [32032/60000]
loss: 2.261036  [35232/60000]
loss: 2.239156  [38432/60000]
loss: 2.207543  [41632/60000]
loss: 2.253628  [44832/60000]
loss: 2.219323  [48032/60000]
loss: 2.186067  [51232/60000]
loss: 2.211860  [54432/60000]
loss: 2.190864  [57632/60000]
loss: 2.187662  [   32/60000]
loss: 2.198709  [ 3232/60000]
loss: 2.154913  [ 6432/60000]
loss: 2.172540  [ 9632/60000]
loss: 2.215027  [12832/60000]
loss: 2.181857  [16032/60000]
loss: 2.072994  [19232/60000]
loss: 2.101500  [22432/60000]
loss: 2.085559  [25632/60000]
loss: 2.012249  [28832/60000]
loss: 2.063972  [32032/60000]
loss: 2.100035  [35232/60000]
loss: 2.019463  [38432/60000]
loss: 1.956758  [41632/

Task 1.a Findings:

The change in hyperparameters from a batch size of 64 with epoch 5 to a batch size of 32 with epoch 10 led to an improvement in the performance of the model. The original accuracy of 72.7% and average loss of 1.638563 was quite a bit lower than the new accuracy of 89.2% and average loss of 0.38647.

Reducing the batch size to 32 likely allowed for faster convergence with more updates to the weights of the model in each epoch. More epochs allow longer training for complex features that can extrapolate to data that isn't as well defined and has more generic structure (i.e a number 3 that is written such that it looks somewhere in between an 8 and a 3).

b. Construct a new model with an additional hidden layer containing the input and output neurons listed below.

*   input layer = 28*28 - 512
*   First hidden layer = 512 - 256
*   Second hidden layer = 256 - 128
*   Final layer = 128 -10 

Train the new model with the same MNIST data and document any observations (e.g., accuracy change) in the **Tutorial Completion Document**.

In [47]:
# Import the necessary libraries
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import numpy as np

# Model Class
class TutorialModel(nn.Module):
    def __init__(self):
        super(TutorialModel, self).__init__()
        self.flatten = nn.Flatten()
        self.layer1 = nn.Linear(28*28, 512)
        self.layer2 = nn.Linear(512, 256)
        self.layer3 = nn.Linear(256, 128)
        self.layer4 = nn.Linear(128, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.flatten(x)
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        x = self.layer4(x)
        return x

# Get Device for Training
device = (
    "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print(f"Using {device} device")

# Read or Load the input data
# Download training data from open datasets.
training_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Batch and epoch
batch_size = 32
epochs = 10

# Data Loader
train_dataloader = DataLoader(training_data, batch_size=batch_size)

# Initialize model
model = TutorialModel().to(device)

# Loss function + optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Model Train
for epoch in range(epochs):
    for batch, (images, labels) in enumerate(train_dataloader):
        images, labels = images.to(device), labels.to(device)

        # Forward & backward propagation + optimization
        outputs = model(images)
        loss = loss_fn(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

# Model test
test_data = datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)
test_dataloader = DataLoader(test_data, batch_size=batch_size)
model.eval()
confusion_matrix = np.zeros((10,10), dtype=int)
with torch.no_grad():
    for images, labels in test_dataloader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, dim=1)
        for i in range(len(labels)):
            confusion_matrix[labels[i]][predicted[i]] += 1

# Get the number of samples for each class
class_totals = confusion_matrix.sum(axis=1)

# Calculate false positives, false negatives, and accuracy for each class
accuracy = 0
for i in range(10):
    tp = confusion_matrix[i, i]
    fp = confusion_matrix[:, i].sum() - tp
    fn = class_totals[i] - tp
    tn = confusion_matrix.sum() - tp - fp - fn
    class_accuracy = tp / class_totals[i]
    class_false_positives = fp / (fp + tn)
    class_false_negatives = fn / (fn + tp)
    print(f"Class {i}:")
    print(f"\tFalse Positives: {class_false_positives:.2%}")
    print(f"\tFalse Negatives: {class_false_negatives:.2%}")
    print(f"\tAccuracy: {class_accuracy:.2%}")
    accuracy += tp

accuracy /= confusion_matrix.sum()
print(f"\nOverall Accuracy: {accuracy:.2%}")

Using cpu device
Epoch [1/10], Loss: 2.2860
Epoch [2/10], Loss: 2.2663
Epoch [3/10], Loss: 2.2226
Epoch [4/10], Loss: 2.1017
Epoch [5/10], Loss: 1.7474
Epoch [6/10], Loss: 1.1584
Epoch [7/10], Loss: 0.7610
Epoch [8/10], Loss: 0.5609
Epoch [9/10], Loss: 0.4447
Epoch [10/10], Loss: 0.3714
Class 0:
	False Positives: 0.96%
	False Negatives: 3.27%
	Accuracy: 96.73%
Class 1:
	False Positives: 0.77%
	False Negatives: 2.64%
	Accuracy: 97.36%
Class 2:
	False Positives: 0.93%
	False Negatives: 18.51%
	Accuracy: 81.49%
Class 3:
	False Positives: 2.18%
	False Negatives: 16.24%
	Accuracy: 83.76%
Class 4:
	False Positives: 2.06%
	False Negatives: 17.21%
	Accuracy: 82.79%
Class 5:
	False Positives: 1.87%
	False Negatives: 25.90%
	Accuracy: 74.10%
Class 6:
	False Positives: 1.15%
	False Negatives: 8.98%
	Accuracy: 91.02%
Class 7:
	False Positives: 1.04%
	False Negatives: 13.04%
	Accuracy: 86.96%
Class 8:
	False Positives: 2.08%
	False Negatives: 20.74%
	Accuracy: 79.26%
Class 9:
	False Positives: 2.91

Task1.b findings:

I noticed that the accuracy declined from slightly over 89% to just under 86%.A 3% loss in accuracy may not seem substantial but going from 89% to 86% accuracy means 30% more errors are being made in relation to the previous model. There are potential reasons why the accuracy may have declined.

One reason could be that the model is overfitting with the new hidden layers. The model may not extrapolate to generic data well if its too specialized. Thus, it may perform well on the training data but not test data.

Another reason could be the gradient is either too small or too large with the new hidden layers. So, the weights in the model may not be converging like we would want and causing some degree of inaccuracy.

I think there are methods to test the gradient and if the model is overfitting but admittedly I would need time to do this and learn more about it since I don't have much practice with that.

A change I made to the model was to add in a confusion matrix and get the false positive/negative (Type I/Type II errors) for each class to see which numbers the model excelled in predicting and struggled with predicting. The results are printed in the previous cell, but the model did exceptionally well with '0' and '1' with a 96%,97% accuracy respectively. The model lagged behind in '5' and '8' with 74%,79% accuracy respectively.

I think a likely explanation for this is that the structure of 1 and 0 are more simple and 5 and 8 are more complex. So, human handwriting samples may have more variance for 5,8 versus 0,1. 

There are different networks/methodologies that may work better than the one used here. Using a convolutional nueral network(CNN) would be better at processing images like what is included in the dataset since the training is faster. With the current neural network, the data could be transformed by manipulating the data equally for each image (increasing contrast/coloring/mirroring the images) and trained again to reduce overfitting.