In [1]:
pip install keras


Note: you may need to restart the kernel to use updated packages.




Import the Dataset:
In your Python script or Jupyter Notebook, import the Fashion-MNIST dataset from Keras:

In [2]:
from keras.datasets import fashion_mnist
import numpy as np

Load the Dataset:
Use the load_data() function from the fashion_mnist module to load the dataset. This function returns training and testing data along with their corresponding labels:



In [3]:
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

Explore the Dataset: We will explore the dataset to understand its structure and contents. We can check the shape of the training and testing data arrays.

In [4]:
# Print the shape of data
print("Shape of training data (images):", X_train.shape)
print("Shape of testing data (images):", X_test.shape)


Shape of training data (images): (60000, 28, 28)
Shape of testing data (images): (10000, 28, 28)


Preprocessing: We will preprocess the data by normalizing the pixel values to the range [0, 1] and flattening the images to a 1D array.

In [5]:
# Preprocessing
# Normalize the pixel values to the range [0, 1]
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

# Flatten the images to a 1D array
X_train_flat = X_train.reshape((X_train.shape[0], -1))
X_test_flat = X_test.reshape((X_test.shape[0], -1))

# Print the shape of flattened data
print("Shape of flattened training data:", X_train_flat.shape)
print("Shape of flattened testing data:", X_test_flat.shape)
print("Number of features (dimension) of input data:", X_train_flat.shape[1])


Shape of flattened training data: (60000, 784)
Shape of flattened testing data: (10000, 784)
Number of features (dimension) of input data: 784


Question 1 - Download the fashion-MNIST dataset and plot 1 sample image for each class as shown in the grid below. Use from keras.datasets import fashion_mnist for getting the fashion mnist dataset.

Plot one sample image for each class in the dataset

In [6]:
# Define class labels
class_labels = {
    0: "T-shirt/top",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle boot"
}

# Plot one sample image for each class
fig, axes = plt.subplots(2, 5, figsize=(15, 7))
for i, ax in enumerate(axes.flat):
    # Find index of the first occurrence of class i in y_train
    idx = next(idx for idx, label in enumerate(y_train) if label == i)
    ax.imshow(X_train[idx], cmap='gray')
    ax.set_title(class_labels[i])
    ax.axis('off')

plt.tight_layout()
plt.show()

NameError: name 'plt' is not defined

Question 2 (10 Marks)
Implement a feedforward neural network which takes images from the fashion-mnist data as input and outputs a probability distribution over the 10 classes.

Your code should be flexible such that it is easy to change the number of hidden layers and the number of neurons in each hidden layer.

To implement a feedforward neural network for the Fashion-MNIST dataset, we'll create a class that represents the neural network. This class should allow for flexibility in terms of the number of hidden layers and neurons in each hidden layer. We'll use the sigmoid activation function for the hidden layers and the softmax activation function for the output layer to obtain a probability distribution over the 10 classes.

In [None]:
import numpy as np

class FeedforwardNeuralNetwork:
    def __init__(self, input_size, hidden_layer_sizes, output_size):
        self.input_size = input_size
        self.hidden_layer_sizes = hidden_layer_sizes
        self.output_size = output_size
        self.num_layers = len(hidden_layer_sizes) + 1
        self.weights, self.biases = self.initialize_weights_and_biases()

    def initialize_weights_and_biases(self):
        sizes = [self.input_size] + self.hidden_layer_sizes + [self.output_size]
        weights = [np.random.randn(next_size, prev_size) for prev_size, next_size in zip(sizes[:-1], sizes[1:])]
        biases = [np.random.randn(size, 1) for size in sizes[1:]]
        return weights, biases

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=0))  # for numerical stability
        return exp_z / np.sum(exp_z, axis=0)

    def forward(self, X):
        activations = [X]
        for w, b in zip(self.weights, self.biases):
            z = np.dot(w, activations[-1]) + b
            a = self.sigmoid(z) if w is not self.weights[-1] else self.softmax(z)
            activations.append(a)
        return activations

    def predict(self, X):
        return self.forward(X)[-1]

In [None]:
# Example usage
input_size = 784  # Number of input features (size of each image)
hidden_layer_sizes = [64, 32]  # Number of neurons in each hidden layer
output_size = 10  # Number of output classes

# Initialize feedforward neural network
model = FeedforwardNeuralNetwork(input_size, hidden_layer_sizes, output_size)

# Generate random input data (batch size of 1 and 784 features)
input_data = np.random.rand(input_size, 1)

# Forward pass through the network
output_probabilities = model.forward(input_data)

# Display the output probabilities corresponding to each class
class_labels = {
    0: "T-shirt/top",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle boot"
}

for i, prob in enumerate(output_probabilities[-1].flatten()):
    print(f"Probability of class {class_labels[i]}: {prob:.4f}")

In [15]:
import numpy as np

class NeuralNetwork:
    def __init__(self, input_size, hidden_layer_sizes, output_size):
        self.input_size = input_size
        self.hidden_layer_sizes = hidden_layer_sizes
        self.output_size = output_size
        self.num_layers = len(hidden_layer_sizes) + 1
        self.weights, self.biases = self.initialize_weights_and_biases()
        # Variables for optimization algorithms
        self.velocities_w = [np.zeros_like(w) for w in self.weights]
        self.velocities_b = [np.zeros_like(b) for b in self.biases]
        self.momentums_w = [np.zeros_like(w) for w in self.weights]
        self.momentums_b = [np.zeros_like(b) for b in self.biases]
        self.squared_gradients_w = [np.zeros_like(w) for w in self.weights]
        self.squared_gradients_b = [np.zeros_like(b) for b in self.biases]
        self.iterations = 0
        
    def evaluate(self, X_test, y_test):
        predictions = self.predict(X_test)
        accuracy = np.mean(np.argmax(predictions, axis=0) == np.argmax(y_test, axis=0))
        return accuracy
    
    def predict(self, X):
        activations = self.forward(X)
        return activations[-1]
    
    def initialize_weights_and_biases(self):
        sizes = [self.input_size] + self.hidden_layer_sizes + [self.output_size]
        weights = [np.random.randn(next_size, prev_size) for prev_size, next_size in zip(sizes[:-1], sizes[1:])]
        biases = [np.random.randn(size, 1) for size in sizes[1:]]
        return weights, biases
    
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=0))  # for numerical stability
        return exp_z / np.sum(exp_z, axis=0)
    
    def forward(self, X):
        activations = [X]
        for w, b in zip(self.weights, self.biases):
            z = np.dot(w, activations[-1]) + b
            a = self.sigmoid(z) if w is not self.weights[-1] else self.softmax(z)
            activations.append(a)
        return activations
    
    def backward(self, X, y):
        m = X.shape[1]  # Number of samples
        activations = self.forward(X)
        deltas = [None] * self.num_layers
        gradients = [None] * (self.num_layers - 1)
        
        # Compute error for output layer
        deltas[-1] = activations[-1] - y
        
        # Backpropagate the error
        for l in range(self.num_layers - 2, -1, -1):
            gradients[l] = np.dot(deltas[l+1], activations[l].T) / m
            deltas[l] = np.dot(self.weights[l+1].T, deltas[l+1]) * activations[l] * (1 - activations[l])
        
        return gradients
    
    def update_parameters(self, gradients, learning_rate, optimizer):
        self.iterations += 1  # Increment iteration count
        if optimizer == 'sgd':
            self.sgd_update(gradients, learning_rate)
        elif optimizer == 'momentum':
            self.momentum_update(gradients, learning_rate, momentum=0.9)
        elif optimizer == 'nesterov':
            self.nesterov_update(gradients, learning_rate, momentum=0.9)
        elif optimizer == 'rmsprop':
            self.rmsprop_update(gradients, learning_rate, beta=0.9, epsilon=1e-8)
        elif optimizer == 'adam':
            self.adam_update(gradients, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8)
        elif optimizer == 'nadam':
            self.nadam_update(gradients, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8)
        else:
            raise ValueError("Unknown optimizer")
    
    def sgd_update(self, gradients, learning_rate):
        for l in range(self.num_layers - 1):
            self.weights[l] -= learning_rate * gradients[l]
            self.biases[l] -= learning_rate * np.mean(gradients[l], axis=1, keepdims=True)
    
    def momentum_update(self, gradients, learning_rate, momentum):
        for l in range(self.num_layers - 1):
            self.velocities_w[l] = momentum * self.velocities_w[l] - learning_rate * gradients[l]
            self.velocities_b[l] = momentum * self.velocities_b[l] - learning_rate * np.mean(gradients[l], axis=1, keepdims=True)
            self.weights[l] += self.velocities_w[l]
            self.biases[l] += self.velocities_b[l]

    def nesterov_update(self, gradients, learning_rate, momentum):
        for l in range(self.num_layers - 1):
            self.velocities_w[l] = momentum * self.velocities_w[l] - learning_rate * gradients[l]
            self.velocities_b[l] = momentum * self.velocities_b[l] - learning_rate * np.mean(gradients[l], axis=1, keepdims=True)
            self.weights[l] += momentum * self.velocities_w[l] - learning_rate * gradients[l]
            self.biases[l] += momentum * self.velocities_b[l] - learning_rate * np.mean(gradients[l], axis=1, keepdims=True)

    def rmsprop_update(self, gradients, learning_rate, beta, epsilon):
        for l in range(self.num_layers - 1):
            self.squared_gradients_w[l] = beta * self.squared_gradients_w[l] + (1 - beta) * np.square(gradients[l])
            self.squared_gradients_b[l] = beta * self.squared_gradients_b[l] + (1 - beta) * np.square(np.mean(gradients[l], axis=1, keepdims=True))
            self.weights[l] -= learning_rate * gradients[l] / (np.sqrt(self.squared_gradients_w[l]) + epsilon)
            self.biases[l] -= learning_rate * np.mean(gradients[l], axis=1, keepdims=True) / (np.sqrt(self.squared_gradients_b[l]) + epsilon)

    def adam_update(self, gradients, learning_rate, beta1, beta2, epsilon):
        for l in range(self.num_layers - 1):
            self.momentums_w[l] = beta1 * self.momentums_w[l] + (1 - beta1) * gradients[l]
            self.velocities_w[l] = beta2 * self.velocities_w[l] + (1 - beta2) * np.square(gradients[l])
            self.weights[l] -= learning_rate * self.momentums_w[l] / (np.sqrt(self.velocities_w[l]) + epsilon)
            
            self.momentums_b[l] = beta1 * self.momentums_b[l] + (1 - beta1) * np.mean(gradients[l], axis=1, keepdims=True)
            self.velocities_b[l] = beta2 * self.velocities_b[l] + (1 - beta2) * np.square(np.mean(gradients[l], axis=1, keepdims=True))
            self.biases[l] -= learning_rate * self.momentums_b[l] / (np.sqrt(self.velocities_b[l]) + epsilon)

    def nadam_update(self, gradients, learning_rate, beta1, beta2, epsilon):
        for l in range(self.num_layers - 1):
            self.momentums_w[l] = beta1 * self.momentums_w[l] + (1 - beta1) * gradients[l]
            self.velocities_w[l] = beta2 * self.velocities_w[l] + (1 - beta2) * np.square(gradients[l])
            self.momentums_w_corrected = self.momentums_w[l] / (1 - np.power(beta1, self.iterations))
            self.velocities_w_corrected = self.velocities_w[l] / (1 - np.power(beta2, self.iterations))
            self.weights[l] -= learning_rate * (self.momentums_w_corrected / (np.sqrt(self.velocities_w_corrected) + epsilon))

            self.momentums_b[l] = beta1 * self.momentums_b[l] + (1 - beta1) * np.mean(gradients[l], axis=1, keepdims=True)
            self.velocities_b[l] = beta2 * self.velocities_b[l] + (1 - beta2) * np.square(np.mean(gradients[l], axis=1, keepdims=True))
            self.momentums_b_corrected = self.momentums_b[l] / (1 - np.power(beta1, self.iterations))
            self.velocities_b_corrected = self.velocities_b[l] / (1 - np.power(beta2, self.iterations))
            self.biases[l] -= learning_rate * (self.momentums_b_corrected / (np.sqrt(self.velocities_b_corrected) + epsilon))
    
    def train(self, X_train, y_train, epochs, batch_size, learning_rate, optimizer):
        m = X_train.shape[1]  # Number of training samples
        num_batches = m // batch_size
        for epoch in range(epochs):
            for i in range(num_batches):
                start = i * batch_size
                end = start + batch_size
                X_batch = X_train[:, start:end]
                y_batch = y_train[:, start:end]
                
                gradients = self.backward(X_batch, y_batch)
                self.update_parameters(gradients, learning_rate, optimizer)



In [16]:
# Example usage
input_size = 784  # Number of input features (size of each image)
hidden_layer_sizes = [64, 32]  # Number of neurons in each hidden layer
output_size = 10  # Number of output classes

# Flatten the input data
X_test_flat = X_test.reshape((X_test.shape[0], -1))

# Make predictions on flattened data
predictions = model.predict(X_test_flat)


# Initialize feedforward neural network
model = NeuralNetwork(input_size, hidden_layer_sizes, output_size)

# Train the network
model.train(X_train, y_train, epochs=10, batch_size=32, learning_rate=0.01, optimizer='sgd')


ValueError: shapes (64,784) and (10000,784) not aligned: 784 (dim 1) != 10000 (dim 0)

In [14]:
# Evaluate the network on the test data
accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

# Make predictions on new data
new_data_predictions = model.predict(new_data)
print("Predictions on new data:", new_data_predictions)


ValueError: shapes (64,784) and (10000,28,28) not aligned: 784 (dim 1) != 28 (dim 1)