In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv


In [2]:
train_df = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
train_df = train_df.sample(frac=1) #shuffle rows of dataframe
print(train_df.shape)
train_data = np.array(train_df) #convert to NumPy array

(42000, 785)


In [3]:
## 42,000 samples split into 80% for train, 20% for val. 785 columns, 784 features (28*28), 1 target label
train_X = train_data[:33600, 1:].T
train_y = train_data[:33600, 0].T
train_X = train_X / 255.
val_X = train_data[33600:, 1:].T
val_y = train_data[33600:, 0].T
val_X = val_X / 255.

In [4]:
def initialise_net(layer_sizes):
    #takes list of number of nodes in each 4 layers (layers 0 and 3 (input+output) are bounded)
    layer1_weights = np.random.rand(layer_sizes[1], layer_sizes[0]) - 0.5
    layer2_weights = np.random.rand(layer_sizes[2], layer_sizes[1]) - 0.5
    layer3_weights = np.random.rand(layer_sizes[3], layer_sizes[2]) - 0.5
    layer1_bias = np.random.rand(layer_sizes[1], 1) - 0.5
    layer2_bias = np.random.rand(layer_sizes[2], 1) - 0.5
    layer3_bias = np.random.rand(layer_sizes[3], 1) - 0.5
    return layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias

def relu(z):
    return np.maximum(z, 0)

def antirelu(z):
    return z > 0

def mish(z):
    return z*(np.tanh(np.log(1+np.exp(z))))

def sech(z):
    return 1/np.cosh(z)

def antimish(z):
    return np.tanh(np.log(1+np.exp(z)))+((z*np.exp(z)*(sech(1+np.exp(z))**2))/(1+np.exp(z)))

def softmax(z):
    """Compute softmax values for each sets of scores in x."""
    exp = np.exp(z - np.max(z)) 
    return exp / exp.sum(axis=0)    

def forward(l1_weights, l2_weights, l3_weights, l1_bias, l2_bias, l3_bias, X, activation = "ReLU", output = "prob"):
    z1 = l1_weights.dot(X) + l1_bias
    if (activation == "ReLU"):
        act1 = relu(z1)
    else:
        act1 = mish(z1)
    z2 = l2_weights.dot(act1) + l2_bias
    if (activation == "ReLU"):
        act2 = relu(z2)
    else:
        act2 = mish(z2)
    z3 = l3_weights.dot(act2) + l3_bias
    if (output == "prob"):
        out = softmax(z3)
    else:
        out = z3
    return z1, act1, z2, act2, z3, out

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def backward(z1, act1, z2, act2, z3, out, l1_weights, l2_weights, l3_weights, X, Y, activation = "ReLU"):
    m = Y.size
    Y = one_hot(Y)
    d_z3 = 2*(out - Y)
    d_l3_weights = 1/m * (d_z3.dot(act2.T))
    d_l3_bias = 1/m * np.sum(d_z3,1)
    if (activation == "ReLU"):
        d_z2 = l3_weights.T.dot(d_z3) * antirelu(z2)
    else:
        d_z2 = l3_weights.T.dot(d_z3) * antimish(z2)
    d_l2_weights = 1/m * d_z2.dot(act1.T)
    d_l2_bias = 1/m * np.sum(d_z2,1)
    if (activation == "ReLU"):
        d_z1 = l2_weights.T.dot(d_z2) * antirelu(z1)
    else:
        d_z1 = l2_weights.T.dot(d_z2) * antimish(z1)
    d_l1_weights = 1/m * d_z1.dot(X.T)
    d_l1_bias = 1/m * np.sum(d_z1,1)
    return d_l1_weights, d_l2_weights, d_l3_weights, d_l1_bias, d_l2_bias, d_l3_bias

def update_net(layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias,
                  d_l1_weights, d_l2_weights, d_l3_weights, d_l1_bias, d_l2_bias, d_l3_bias, lr):
    layer1_weights = layer1_weights - lr*d_l1_weights
    layer1_bias = layer1_bias - lr*np.reshape(d_l1_bias, (layer1_bias.shape))
    layer2_weights = layer2_weights - lr*d_l2_weights
    layer2_bias = layer2_bias - lr*np.reshape(d_l2_bias, (layer2_bias.shape))
    layer3_weights = layer3_weights - lr*d_l3_weights
    layer3_bias = layer3_bias - lr*np.reshape(d_l3_bias, (layer3_bias.shape))
    return layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias

def predict(out):
    return np.argmax(out, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, lr, n_epochs, layer_sizes, activation):
    layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias = initialise_net(layer_sizes)
    for i in range(n_epochs):
        z1, act1, z2, act2, z3, out = forward(layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias, X, activation = activation, output = "prob")
        d_l1_weights, d_l2_weights, d_l3_weights, d_l1_bias, d_l2_bias, d_l3_bias = backward(z1, act1, z2, act2, z3, out, layer1_weights, layer2_weights, layer3_weights, X, Y, activation = activation)
        layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias = update_net(layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias,
                  d_l1_weights, d_l2_weights, d_l3_weights, d_l1_bias, d_l2_bias, d_l3_bias, lr)
        if i % 25 == 0:
            print("Iteration: ", i)
            predictions = predict(out)
            print(get_accuracy(predictions, Y))
    return layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias

In [5]:
layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias = gradient_descent(
                                                                                            train_X, 
                                                                                            train_y, 
                                                                                            n_epochs=1001, 
                                                                                            lr=0.075,
                                                                                            layer_sizes=[784,16,16,10],
                                                                                            activation="Mish")



Iteration:  0
[1 4 1 ... 9 4 2] [5 7 5 ... 9 3 9]
0.10407738095238095
Iteration:  25
[5 0 8 ... 7 3 2] [5 7 5 ... 9 3 9]
0.39922619047619046
Iteration:  50
[5 6 8 ... 7 3 2] [5 7 5 ... 9 3 9]
0.5433333333333333
Iteration:  75
[8 4 8 ... 7 3 6] [5 7 5 ... 9 3 9]
0.6330059523809524
Iteration:  100
[8 9 8 ... 7 3 6] [5 7 5 ... 9 3 9]
0.7023511904761904
Iteration:  125
[8 9 8 ... 7 3 4] [5 7 5 ... 9 3 9]
0.7517261904761905
Iteration:  150
[8 9 8 ... 9 3 4] [5 7 5 ... 9 3 9]
0.784345238095238
Iteration:  175
[8 9 8 ... 9 3 4] [5 7 5 ... 9 3 9]
0.8096130952380952
Iteration:  200
[8 9 5 ... 9 3 4] [5 7 5 ... 9 3 9]
0.8260119047619048
Iteration:  225
[8 9 5 ... 9 3 4] [5 7 5 ... 9 3 9]
0.8369642857142857
Iteration:  250
[8 7 5 ... 9 3 4] [5 7 5 ... 9 3 9]
0.8469642857142857
Iteration:  275
[8 7 5 ... 9 3 4] [5 7 5 ... 9 3 9]
0.8538392857142857
Iteration:  300
[8 7 5 ... 9 3 4] [5 7 5 ... 9 3 9]
0.8601785714285715
Iteration:  325
[8 7 5 ... 9 3 4] [5 7 5 ... 9 3 9]
0.8652678571428571
Iteration:

In [6]:
def make_predictions(layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias, X, activation = "ReLU", output = "prob"):
    #_, _, _, A2 = forward_prop(W1, b1, W2, b2, X)
    z1, act1, z2, act2, z3, out = forward(layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias, X, activation = activation, output = "prob")
    predictions = predict(out)
    return predictions

val_predictions = make_predictions(layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias, val_X, activation = "mish", output = "prob")
get_accuracy(val_predictions, val_y)

[6 0 1 ... 6 9 8] [6 0 1 ... 6 9 8]


0.906547619047619

In [7]:
sample_submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
test_df = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

In [8]:
sample_submission

Unnamed: 0,ImageId,Label
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
27995,27996,0
27996,27997,0
27997,27998,0
27998,27999,0


In [9]:
test_data = np.array(test_df)
test_X = test_data.T
test_X = test_X / 255.

In [10]:
test_predictions = make_predictions(layer1_weights, layer2_weights, layer3_weights, layer1_bias, layer2_bias, layer3_bias, test_X, activation = "mish", output = "prob")

In [11]:
test_submission = pd.DataFrame(test_predictions)
test_submission.columns = ['Label']
test_submission.insert(0, 'ImageId', range(1, 1 + len(test_submission)))

In [12]:
test_submission.to_csv("submission.csv", index=False, header=True)