In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Seed for NumPy's global random generator. Seeding before the shuffle makes
# the train/test split (and any later np.random draws in this kernel, such as
# the weight initialisation) identical on every Restart & Run All.
SEED = 42
# Number of shuffled rows held out as the test split.
TEST_SIZE = 1000
# Divisor applied to every feature value; presumably the largest raw pixel
# intensity, so features end up in [0, 1] -- confirm against the CSV.
PIXEL_MAX = 255.0

np.random.seed(SEED)

dataset = pd.read_csv("./mnist_train.csv")
data_array = np.array(dataset)
# Shuffles the rows in place so the held-out rows are a random sample.
np.random.shuffle(data_array)

# Transpose each split so that a sample is a column: row 0 then holds the
# labels and the remaining rows hold the features.
test_set = data_array[:TEST_SIZE].T
train_set = data_array[TEST_SIZE:].T

test_labels = test_set[0]
test_features = test_set[1:] / PIXEL_MAX
train_labels = train_set[0]
train_features = train_set[1:] / PIXEL_MAX


In [3]:
def initialize_parameters(input_size=784, hidden_size=10, output_size=10):
    """Create the starting weights and biases for the two-layer network.

    Weights are drawn from a standard normal distribution and scaled by 0.01;
    biases start at zero. The first weight matrix is drawn before the second,
    so results are reproducible under a fixed ``np.random`` seed.

    Args:
        input_size: Number of input features per sample (default 784).
        hidden_size: Number of units in the hidden layer (default 10).
        output_size: Number of output units (default 10).

    Returns:
        Tuple ``(weight1, bias1, weight2, bias2)`` with shapes
        ``(hidden_size, input_size)``, ``(hidden_size, 1)``,
        ``(output_size, hidden_size)`` and ``(output_size, 1)``.
    """
    weight1 = np.random.randn(hidden_size, input_size) * 0.01
    bias1 = np.zeros((hidden_size, 1))
    weight2 = np.random.randn(output_size, hidden_size) * 0.01
    bias2 = np.zeros((output_size, 1))
    return weight1, bias1, weight2, bias2


In [4]:
def encode_labels(labels, num_classes=10):
    """One-hot encode integer class labels, one column per sample.

    The number of rows is fixed by ``num_classes`` rather than inferred from
    the labels that happen to be present, so the result always lines up with
    a ``num_classes``-row network output even when a batch is missing some
    classes.

    Args:
        labels: Array-like of class indices in ``[0, num_classes)``. Any
            shape is accepted; it is flattened. Values are cast to ``int``.
        num_classes: Total number of classes (default 10).

    Returns:
        Float array of shape ``(num_classes, n_samples)`` where entry
        ``[c, j]`` is 1.0 if sample ``j`` has label ``c`` and 0.0 otherwise.

    Raises:
        ValueError: If any label falls outside ``[0, num_classes)``.
    """
    label_ids = np.asarray(labels).reshape(-1).astype(int)
    if label_ids.size and (label_ids.min() < 0 or label_ids.max() >= num_classes):
        raise ValueError(
            f"labels must lie in [0, {num_classes}); "
            f"got range [{label_ids.min()}, {label_ids.max()}]"
        )
    encoded = np.zeros((num_classes, label_ids.size))
    # Row index = class id, column index = sample position.
    encoded[label_ids, np.arange(label_ids.size)] = 1.0
    return encoded

In [5]:
def relu_activation(Z):
    """Apply the rectified linear unit element-wise.

    Every entry of ``Z`` below zero is replaced by zero; all other entries
    pass through unchanged.
    """
    floor = 0
    rectified = np.maximum(floor, Z)
    return rectified

def softmax_activation(Z):
    """Compute a numerically stable softmax over axis 0 (one column per sample).

    The per-column maximum is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``np.exp`` from overflowing
    to ``inf`` (and the ratio from becoming NaN) when the inputs are large.

    Args:
        Z: Array of scores; the softmax is taken along axis 0.

    Returns:
        Array of the same shape as ``Z`` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=0, keepdims=True)

def relu_derivative(Z):
    """Return a mask that is True wherever ``Z`` is strictly positive.

    Multiplying a gradient by this mask keeps it where the ReLU input was
    positive and zeroes it elsewhere (including at exactly zero).
    """
    is_positive = Z > 0
    return is_positive


In [6]:
def forward_propagation(Weight1, bias1, Weight2, bias2, features):
    """Run one forward pass through the two-layer network.

    The hidden layer applies ReLU to ``Weight1 . features + bias1``; the
    output layer applies softmax to ``Weight2 . hidden + bias2``.

    Args:
        Weight1, bias1: Hidden-layer weights and bias.
        Weight2, bias2: Output-layer weights and bias.
        features: Input matrix; presumably one sample per column -- confirm
            against the caller.

    Returns:
        Tuple ``(linear1, activation1, linear2, activation2)``: the
        pre-activation and post-activation values of each layer, in order.
    """
    # Hidden layer: affine transform followed by ReLU.
    hidden_pre = np.dot(Weight1, features) + bias1
    hidden_out = relu_activation(hidden_pre)

    # Output layer: affine transform followed by softmax.
    output_pre = np.dot(Weight2, hidden_out) + bias2
    output_probs = softmax_activation(output_pre)

    return hidden_pre, hidden_out, output_pre, output_probs


In [7]:
def backward_propagation(linear1, activation1, activation2, weight2, features, labels):
    """Compute gradients of the weights and biases for one batch.

    The output-layer error is the network output minus the one-hot encoded
    labels (presumably the softmax + cross-entropy gradient -- the loss is
    not defined in this notebook). Every gradient is averaged over
    ``features.shape[1]``, the number of columns in ``features``.

    Args:
        linear1: Hidden-layer pre-activation values from the forward pass.
        activation1: Hidden-layer activations from the forward pass.
        activation2: Network output from the forward pass.
        weight2: Output-layer weight matrix used in the forward pass.
        features: Input matrix fed to the forward pass.
        labels: Class labels, passed to ``encode_labels``.

    Returns:
        Tuple ``(d_weight1, d_bias1, d_weight2, d_bias2)``.
    """
    targets = encode_labels(labels)
    inverse_count = 1 / features.shape[1]

    # Output layer.
    grad_linear2 = activation2 - targets
    grad_weight2 = inverse_count * np.dot(grad_linear2, activation1.T)
    grad_bias2 = inverse_count * np.sum(grad_linear2, axis=1, keepdims=True)

    # Hidden layer: push the error back through weight2, then gate it by
    # the ReLU mask so only units with positive input receive gradient.
    grad_linear1 = np.dot(weight2.T, grad_linear2) * relu_derivative(linear1)
    grad_weight1 = inverse_count * np.dot(grad_linear1, features.T)
    grad_bias1 = inverse_count * np.sum(grad_linear1, axis=1, keepdims=True)

    return grad_weight1, grad_bias1, grad_weight2, grad_bias2

In [8]:
def update_parameters(weight1, bias1, weight2, bias2, derivative_Weight1, derivative_bias1, derivative_Weight2, derivative_bias2, learning_rate):
    """Take one gradient-descent step on every parameter.

    Each parameter is moved against its gradient by ``learning_rate`` times
    that gradient. The inputs are not modified; new arrays are returned.

    Returns:
        Tuple ``(weight1, bias1, weight2, bias2)`` after the step.
    """
    parameters = (weight1, bias1, weight2, bias2)
    gradients = (derivative_Weight1, derivative_bias1, derivative_Weight2, derivative_bias2)
    return tuple(
        parameter - learning_rate * gradient
        for parameter, gradient in zip(parameters, gradients)
    )

def get_predictions(activation2):
    """Pick, for every column, the row index holding the largest value.

    With one sample per column, the result is the predicted class index of
    each sample.
    """
    predicted_classes = np.argmax(activation2, axis=0)
    return predicted_classes

In [9]:
def calculate_accuracy(predictions, labels):
    """Return the fraction of positions where ``predictions`` equals ``labels``."""
    matches = predictions == labels
    return np.mean(matches)

In [10]:
def gradient_descent(features, labels, iterations, learning_rate, log_every=10):
    """Train the two-layer network with full-batch gradient descent.

    Every iteration runs a forward pass on all of ``features``, computes the
    gradients, and updates the parameters once.

    Args:
        features: Input matrix passed to ``forward_propagation``.
        labels: Class labels matching the columns of ``features``.
        iterations: Number of update steps to perform.
        learning_rate: Step size used by ``update_parameters``.
        log_every: Print the training accuracy every this many iterations
            (default 10). Pass ``0`` or ``None`` to disable logging.

    Returns:
        Tuple ``(weight1, bias1, weight2, bias2)`` of trained parameters.
    """
    weight1, bias1, weight2, bias2 = initialize_parameters()
    for i in range(iterations):
        linear1, activation1, _, activation2 = forward_propagation(
            weight1, bias1, weight2, bias2, features)
        derivative_weight1, derivative_bias1, derivative_weight2, derivative_bias2 = backward_propagation(
            linear1, activation1, activation2, weight2, features, labels)
        weight1, bias1, weight2, bias2 = update_parameters(
            weight1, bias1, weight2, bias2,
            derivative_weight1, derivative_bias1, derivative_weight2, derivative_bias2,
            learning_rate)
        # The `log_every and ...` guard avoids a modulo-by-zero when logging
        # is disabled. The reported accuracy comes from this iteration's
        # forward pass, i.e. from the parameters *before* the update above.
        if log_every and i % log_every == 0:
            accuracy = calculate_accuracy(get_predictions(activation2), labels)
            print(f"Iteration: {i}, Accuracy: {accuracy:.4f}")
    return weight1, bias1, weight2, bias2

In [11]:
# Train on the training split: 500 full-batch iterations with step size 0.1.
weight1, bias1, weight2, bias2 = gradient_descent(
    train_features,
    train_labels,
    iterations=500,
    learning_rate=0.1,
)

Iteration: 0, Accuracy: 0.1178
Iteration: 10, Accuracy: 0.2110
Iteration: 20, Accuracy: 0.2775
Iteration: 30, Accuracy: 0.2764
Iteration: 40, Accuracy: 0.2110
Iteration: 50, Accuracy: 0.1943
Iteration: 60, Accuracy: 0.2263
Iteration: 70, Accuracy: 0.2937
Iteration: 80, Accuracy: 0.3629
Iteration: 90, Accuracy: 0.4288
Iteration: 100, Accuracy: 0.5014
Iteration: 110, Accuracy: 0.5605
Iteration: 120, Accuracy: 0.6105
Iteration: 130, Accuracy: 0.6578
Iteration: 140, Accuracy: 0.6934
Iteration: 150, Accuracy: 0.7170
Iteration: 160, Accuracy: 0.7326
Iteration: 170, Accuracy: 0.7451
Iteration: 180, Accuracy: 0.7562
Iteration: 190, Accuracy: 0.7666
Iteration: 200, Accuracy: 0.7771
Iteration: 210, Accuracy: 0.7871
Iteration: 220, Accuracy: 0.7971
Iteration: 230, Accuracy: 0.8061
Iteration: 240, Accuracy: 0.8140
Iteration: 250, Accuracy: 0.8222
Iteration: 260, Accuracy: 0.8292
Iteration: 270, Accuracy: 0.8358
Iteration: 280, Accuracy: 0.8421
Iteration: 290, Accuracy: 0.8479
Iteration: 300, Accur