In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load MNIST through Keras: image arrays (x_*) and label arrays (y_*) for the train and test splits.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [None]:
# Flatten every image into one column so the data is laid out as (features, samples).
# The ndim guard makes the cell idempotent: without it, re-running the cell on the
# already-flattened 2-D arrays would transpose them back to (samples, features).
if x_train.ndim == 3:
    x_train = x_train.reshape(x_train.shape[0], -1).T
if x_test.ndim == 3:
    x_test = x_test.reshape(x_test.shape[0], -1).T

In [None]:
# Confirm the flattened layout: rows are pixel features, columns are samples.
x_train.shape

(784, 60000)

In [None]:
# Initialize parameters(W, b)

In [None]:
def initialize_params(current_layer, previous_layer):
  """Create the weight matrix and bias vector for a single layer.

  Args:
    current_layer: number of units in this layer (rows of W and of b).
    previous_layer: number of units feeding this layer (columns of W).

  Returns:
    (W, b): W has shape (current_layer, previous_layer) and holds standard-normal
    draws scaled by 0.01; b is a (current_layer, 1) column of zeros.
  """
  weight_shape = (current_layer, previous_layer)
  bias_shape = (current_layer, 1)

  # Small random weights; biases start at zero.
  W = 0.01 * np.random.randn(*weight_shape)
  b = np.zeros(bias_shape)
  return W, b

In [None]:
def initialize_lparams(layers_dim):
  """Build the parameter dictionary for every layer of the network.

  Args:
    layers_dim: sequence of layer sizes, input layer first.

  Returns:
    dict with keys 'W1', 'b1', 'W2', 'b2', ... - one (W, b) pair for each
    consecutive pair of sizes in layers_dim, numbered from 1.
  """
  params = {}
  # Walk consecutive (previous size, current size) pairs; layer numbering starts at 1.
  size_pairs = zip(layers_dim[:-1], layers_dim[1:])
  for idx, (prev_size, curr_size) in enumerate(size_pairs, start=1):
    W, b = initialize_params(curr_size, prev_size)
    params['W' + str(idx)] = W
    params['b' + str(idx)] = b

  return params

In [None]:
# Forward propagation

In [None]:
def linear_forward(W, A, b):
    """Compute the linear part of a layer, Z = W.A + b.

    Args:
        W: weight matrix of the layer.
        A: activations feeding the layer.
        b: bias, added to the matrix product by broadcasting.

    Returns:
        (Z, cache): the pre-activation values and the tuple (W, A, b), kept so the
        backward pass can reuse the inputs.
    """
    pre_activation = np.dot(W, A) + b
    return pre_activation, (W, A, b)

In [None]:
# Activation functions

def sigmoid(Z):
  """Element-wise logistic sigmoid.

  Returns:
    (A, cache): A = 1 / (1 + exp(-Z)); cache is the input Z itself.
  """
  denominator = 1 + np.exp(-Z)
  return 1 / denominator, Z

def softmax(Z):
    """Softmax taken down each column (axis 0) of Z.

    The column maximum is subtracted before exponentiating so large entries do
    not overflow; this does not change the resulting ratios.

    Returns:
        (A, cache): A holds the normalized exponentials, each column summing to 1;
        cache is the input Z itself.
    """
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exps = np.exp(shifted)
    column_totals = np.sum(exps, axis=0, keepdims=True)
    return exps / column_totals, Z

def relu(Z):
  """Element-wise rectified linear unit: negative entries become 0.

  Returns:
    (A, cache): A = max(0, Z); cache is the input Z itself.
  """
  rectified = np.maximum(0, Z)
  return rectified, Z

In [None]:
# Function combining linear and activation functions
def linear_activation(A_prev, W, b, activ_func):
  """Run one full layer: the linear step followed by the chosen activation.

  Args:
    A_prev: activations from the previous layer (or the input data).
    W: weight matrix of this layer.
    b: bias vector of this layer.
    activ_func: name of the activation, either 'relu' or 'softmax'.

  Returns:
    (A, cache): the activated output and the pair (linear_cache, Z_cache), where
    linear_cache is (W, A_prev, b) and Z_cache is the pre-activation Z.

  Raises:
    ValueError: if activ_func is not one of the supported names.
  """
  # Reject unknown names up front so a typo fails with a clear error instead of an
  # UnboundLocalError on linear_cache further down.
  if activ_func not in ('softmax', 'relu'):
    raise ValueError(f"Unsupported activation {activ_func!r}; expected 'relu' or 'softmax'")

  # The linear step is the same for every activation, so compute it once.
  Z, linear_cache = linear_forward(W, A_prev, b)

  if activ_func == 'softmax':
    A, Z_cache = softmax(Z)
  else:
    A, Z_cache = relu(Z)

  cache = (linear_cache, Z_cache) # to use later in corresponding back-prop layer
  return A, cache

In [None]:
# Performing forward propagation on all layers

def forward_prop(X, params, layers_dim):
  """Forward pass through the whole network.

  Hidden layers use ReLU; the final layer uses softmax.

  Args:
    X: input data, fed to layer 1.
    params: dict holding 'W<l>' and 'b<l>' for every layer l = 1 .. len(layers_dim) - 1.
    layers_dim: sequence of layer sizes, input layer first; only its length is used here.

  Returns:
    (AL, caches): the softmax output of the last layer and a dict mapping
    'l<l>' to that layer's (linear_cache, Z_cache) for use in back-propagation.

  Raises:
    ValueError: if layers_dim has fewer than two entries (no layer to compute).
  """
  n = len(layers_dim)
  if n < 2:
    raise ValueError("layers_dim must contain at least an input and an output layer")

  caches = {}
  A = X

  # Hidden layers 1 .. n-2 use ReLU. The output layer is deliberately excluded so it
  # is computed only once (with softmax) rather than being run through ReLU first.
  for l in range(1, n - 1):
      A, cache = linear_activation(A, params['W' + str(l)], params['b' + str(l)], 'relu')
      caches['l' + str(l)] = cache

  # Output layer n-1: softmax over the last hidden activations.
  last = n - 1
  AL, cache = linear_activation(A, params['W' + str(last)], params['b' + str(last)], 'softmax')
  caches['l' + str(last)] = cache

  return AL, caches

In [None]:
def compute_cost(AL, Y, eps=1e-12):
  """Mean element-wise cross-entropy between predictions AL and targets Y.

  Args:
    AL: predicted probabilities, same shape as Y.
    Y: target values (one-hot in this notebook), same shape as AL.
    eps: probabilities are clipped to [eps, 1 - eps] before taking logs so that
      saturated outputs (exactly 0 or 1) give a finite cost instead of NaN/inf.

  Returns:
    The cost as a scalar: the mean over every element of
    -(Y * log(AL) + (1 - Y) * log(1 - AL)).
  """
  # log(0) is -inf and 0 * -inf is NaN, so keep AL strictly inside (0, 1).
  AL_safe = np.clip(AL, eps, 1 - eps)

  cost = -np.mean(Y * np.log(AL_safe) + (1 - Y) * np.log(1 - AL_safe)) #cross-entropy loss function
  return cost

In [None]:
# Calculating gradients of W, b and A
def linear_backward(dZ, cache):
  """Back-propagate through the linear step Z = W.A_prev + b.

  Args:
    dZ: gradient of the cost with respect to Z.
    cache: the (W, A_prev, b) tuple stored by the forward pass.

  Returns:
    (dW, db, dA): dW = dZ.A_prev^T divided by the number of columns of A_prev,
    db = the row-wise mean of dZ (kept as a column), and dA = W^T.dZ, the gradient
    handed to the previous layer.
  """
  W, A_prev, _ = cache
  sample_count = A_prev.shape[1]
  scale = 1/sample_count

  dA = np.dot(W.T, dZ)
  db = np.mean(dZ, axis=1, keepdims=True)
  dW = scale * np.dot(dZ, A_prev.T)

  return dW, db, dA

In [None]:
# Derivatives of activation functions

def softmax_der(dA, cache):
  """Back-propagate dA through the softmax using its element-wise derivative.

  The softmax s of the cached pre-activation is recomputed (column-wise, with the
  column maximum subtracted first), then dZ = dA * s * (1 - s). Only this
  per-element term is applied; cross-class terms of the softmax Jacobian are not
  included.

  Args:
    dA: gradient of the cost with respect to the softmax output.
    cache: the pre-activation Z stored by the forward pass.

  Returns:
    dZ, the gradient with respect to Z.
  """
  shifted = cache - np.max(cache, axis=0, keepdims=True)
  exps = np.exp(shifted)
  probs = exps / np.sum(exps, axis=0, keepdims=True)

  local_slope = probs * (1 - probs)
  return dA * local_slope

def relu_der(dA, cache):
  """Back-propagate dA through ReLU.

  The gradient passes unchanged where the cached pre-activation is strictly
  positive and is zeroed everywhere else.
  """
  active = cache > 0
  return dA * active

In [None]:
# Combining the above functions to calculate dW, db, dA using the previous dA
def linear_activation_backward(dA, cache, activation):
    """Back-propagate through one layer: activation derivative, then linear step.

    Args:
        dA: gradient of the cost with respect to this layer's output.
        cache: the (linear_cache, activation_cache) pair stored by the forward pass.
        activation: name of the layer's activation, either 'relu' or 'softmax'.

    Returns:
        (dW, db, dA): gradients for this layer's weights and bias, and the gradient
        with respect to the previous layer's output.

    Raises:
        ValueError: if activation is not one of the supported names.
    """
    # Reject unknown names up front so a typo fails with a clear error instead of an
    # UnboundLocalError on dW at the return statement.
    if activation not in ('softmax', 'relu'):
        raise ValueError(f"Unsupported activation {activation!r}; expected 'relu' or 'softmax'")

    linear_cache, activation_cache = cache

    if activation == 'softmax':
        dZ = softmax_der(dA, activation_cache)
    else:
        dZ = relu_der(dA, activation_cache)

    # The linear step is shared by both activations.
    dW, db, dA = linear_backward(dZ, linear_cache)

    return dW, db, dA

In [None]:
#Backward-propagation

def back_prop(AL, Y, caches):
  """Back-propagate through the whole network and collect every gradient.

  Args:
    AL: softmax output of the final layer from the forward pass.
    Y: targets, same shape as AL.
    caches: dict mapping 'l<l>' to (linear_cache, Z_cache) for each layer l,
      as produced by forward_prop.

  Returns:
    dict with 'dW<l>' and 'db<l>' for every layer l, plus 'dA<l-1>', the gradient
    handed to the layer below.
  """
  grads = {}# Stores all gradients for updating later

  l = len(caches)

  # Output layer. The chain dAL * s * (1 - s), with
  # dAL = -(Y / AL - (1 - Y) / (1 - AL)) and s = AL, simplifies algebraically to
  # AL - Y. Using the simplified form avoids dividing by AL and 1 - AL, which
  # yields inf/NaN (and then NaN parameters) once any softmax output saturates
  # to exactly 0 or 1.
  dZL = AL - Y
  linear_cache, _ = caches['l'+str(l)]
  dW, db, dA_prev = linear_backward(dZL, linear_cache)
  grads['dW' + str(l)] = dW
  grads['db' + str(l)] = db
  grads['dA' + str(l-1)] = dA_prev

  # Hidden layers, last to first, all with ReLU.
  for i in reversed(range(1, l)):

    current_cache = caches['l'+str(i)]
    dW, db, dA_prev = linear_activation_backward(dA_prev, current_cache, 'relu')
    grads['dW' + str(i)] = dW
    grads['db' + str(i)] = db
    grads['dA' + str(i-1)] = dA_prev

  return grads

In [None]:
# Updating the parameters
def update_params(params, grads, learning_rate):
  """Apply one gradient-descent step to every layer's parameters.

  Args:
    params: dict of 'W<l>' / 'b<l>' entries; its entries are replaced in place.
    grads: dict providing the matching 'dW<l>' / 'db<l>' gradients.
    learning_rate: step size multiplying each gradient.

  Returns:
    The same params dict, after the update.
  """
  layer_count = len(params)//2
  for idx in range(1, layer_count + 1):
    # Weights first, then bias; the gradient key is the parameter key prefixed with 'd'.
    for prefix in ('W', 'b'):
      key = prefix + str(idx)
      params[key] = params[key] - learning_rate*grads['d' + key]

  return params

In [None]:
def nn_model(X, Y, layers_dim,n_iters, learning_rate):
  """Train the network with gradient descent on the full data set.

  Args:
    X: input data passed to forward_prop.
    Y: targets with the same shape as the network output.
    layers_dim: sequence of layer sizes, input layer first.
    n_iters: number of gradient-descent iterations.
    learning_rate: step size used for every parameter update.

  Returns:
    (preds, params): the network output on X after training and the trained
    parameter dict.
  """
  params = initialize_lparams(layers_dim)

  for i in range(n_iters):
    # Forward pass and cost with the current parameters.
    AL, caches = forward_prop(X, params, layers_dim)
    cost = compute_cost(AL, Y)

    # Report progress on every 10th iteration, starting with iteration 0.
    if not i % 10:
        print(f"Cost after {i} iterations: {cost}")
        print(f"Accuraxy after {i} iterations: {get_accuracy(np.argmax(AL,0), np.argmax(Y, axis=0))}")

    # Backward pass, then one gradient-descent step.
    params = update_params(params, back_prop(AL, Y, caches), learning_rate)

  # Final forward pass with the trained parameters.
  final_output, _ = forward_prop(X, params, layers_dim)
  return final_output, params

In [None]:
# Transform the array of m integer labels into a (10, m) one-hot matrix: each column has a 1 in the row of its class (probability 1 of belonging to that class) and 0 elsewhere.
def one_hot_encode(labels, num_classes=None):
    """One-hot encode integer class labels, one column per sample.

    Args:
        labels: array of non-negative integer class indices; it is flattened first.
        num_classes: number of rows in the result. Defaults to labels.max() + 1.

    Returns:
        Array of shape (num_classes, labels.size) with a 1 at (label, sample index)
        and 0 elsewhere.
    """
    labels = np.asarray(labels).ravel()
    if num_classes is None:
        num_classes = labels.max() + 1

    y = np.zeros((labels.size, num_classes))
    # Index rows by the size of `labels` itself, not the global y_train, so the
    # function works for any label array.
    y[np.arange(labels.size), labels] = 1
    return y.T

# One-hot targets for the training labels: classes along rows, samples along columns.
Y =one_hot_encode(y_train)

In [None]:
def get_accuracy(pred, real):
    """Return the fraction of matches: entries where pred equals real, divided by real.size."""
    match_count = np.sum(pred == real)
    return match_count/real.size

In [None]:
# Fix the NumPy seed so the random weight initialization is repeatable.
np.random.seed(41)
# Layer sizes, input first: 784 inputs, hidden layers of 16 and 12 units, 10 outputs.
layer_dims = [784, 16, 12, 10]
# Train for 3000 iterations with a learning rate of 0.01 (positional: n_iters, learning_rate).
preds, params = nn_model(x_train, Y, layer_dims, 3000, 0.01)

Cost after 0 iterations: 0.32503830379045345
Accuraxy after 0 iterations: 0.11501666666666667
Cost after 10 iterations: 0.32413039120462717
Accuraxy after 10 iterations: 0.18613333333333335
Cost after 20 iterations: 0.3202616047435505
Accuraxy after 20 iterations: 0.22216666666666668
Cost after 30 iterations: 0.2937691901148102
Accuraxy after 30 iterations: 0.2549166666666667
Cost after 40 iterations: 0.25692872666539635
Accuraxy after 40 iterations: 0.3385666666666667
Cost after 50 iterations: 0.23083328622063637
Accuraxy after 50 iterations: 0.52855
Cost after 60 iterations: 0.31384003025346685
Accuraxy after 60 iterations: 0.1952
Cost after 70 iterations: 0.1601195599765336
Accuraxy after 70 iterations: 0.6992166666666667
Cost after 80 iterations: 0.21988727838814745
Accuraxy after 80 iterations: 0.55025
Cost after 90 iterations: 0.1749342094020252
Accuraxy after 90 iterations: 0.6173333333333333
Cost after 100 iterations: 0.12809192420395352
Accuraxy after 100 iterations: 0.7481833

In [None]:
def predict(X, params, layer_dims=layer_dims):
    """Return the predicted class index for every column of X.

    Runs a forward pass with the given parameters and takes the arg-max over
    axis 0 of the network output. layer_dims defaults to the module-level
    layer_dims as it was when this function was defined.
    """
    probabilities, _ = forward_prop(X, params, layer_dims)
    return np.argmax(probabilities, 0)

In [None]:
# Predicted class labels for the test images, using the trained parameters.
y_pred = predict(x_test, params)

In [None]:
# Fraction of test predictions that match the true labels.
get_accuracy(y_pred, y_test)

0.9331