In [3]:
import numpy as np
from keras.datasets import fashion_mnist 
import matplotlib.pyplot as plt

**Data Pre-Processing**

In [4]:
# Load Fashion-MNIST, then flatten each image so every sample becomes one row
# of features (the trailing dimensions are collapsed by the -1).
(X, y), (X_test, y_test) = fashion_mnist.load_data()
X = np.reshape(X, (len(X), -1))
X_test = np.reshape(X_test, (len(X_test), -1))
print(X.shape, y.shape, sep="\n")

#One Hot Encoding for y
def one_hot_encode(y, num_classes=None):
  """Convert integer class labels into a one-hot matrix.

  Args:
    y: Array-like of non-negative integer labels; it is flattened to 1-D.
    num_classes: Number of columns in the result. When omitted it is
      inferred as ``max(y) + 1``. Pass it explicitly to guarantee that
      different splits (e.g. train and test) get the same width even if
      one of them does not contain the highest label.

  Returns:
    Integer array of shape ``(len(y), num_classes)`` with a single 1 per row.

  Raises:
    ValueError: If ``y`` is empty and ``num_classes`` is not given, or if a
      label falls outside ``[0, num_classes)``.
  """
  labels = np.asarray(y).ravel()
  if num_classes is None:
    if labels.size == 0:
      raise ValueError("cannot infer num_classes from an empty label array")
    num_classes = int(labels.max()) + 1
  # Negative labels would silently wrap around under fancy indexing, so
  # reject anything outside the valid range up front.
  if labels.size and (labels.min() < 0 or labels.max() >= num_classes):
    raise ValueError("labels must lie in the range [0, num_classes)")
  encoded_array = np.zeros((labels.size, num_classes), dtype=int)
  encoded_array[np.arange(labels.size), labels] = 1
  return encoded_array

# Divide the pixel values by 255.
X, X_test = X / 255, X_test / 255

# Replace the integer labels with their one-hot encodings.
y, y_test = one_hot_encode(y), one_hot_encode(y_test)

# Transpose so that each column is one sample. Note that y_test is NOT
# transposed here; it keeps one row per sample.
X, X_test, y = X.T, X_test.T, y.T

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
(60000, 784)
(60000,)


**Activation Functions**

In [5]:
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    The naive formula overflows in ``exp(-z)`` for large negative ``z``.
    Here the exponential is only ever taken of a non-positive number, so it
    stays within ``(0, 1]``.

    Args:
        z: Scalar or array-like of pre-activations.

    Returns:
        ndarray with the same shape as ``z`` and values in ``[0, 1]``.
    """
    z = np.asarray(z, dtype=float)
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same value.
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

def tanh(z):
    """Return the element-wise hyperbolic tangent of ``z`` (delegates to ``np.tanh``)."""
    return np.tanh(z)

def relu(z):
    """Leaky ReLU: ``z`` for positive inputs, ``0.01 * z`` otherwise.

    Despite the name, negative inputs are scaled by 0.01 rather than zeroed.
    ``np.where`` is used instead of multiplying by boolean masks because
    ``0 * inf`` is NaN, which made the mask form return NaN for infinite
    inputs.

    Args:
        z: Scalar or array-like of pre-activations.

    Returns:
        ndarray with the same shape as ``z``.
    """
    z = np.asarray(z)
    return np.where(z > 0, z, 0.01 * z)

def softmax(x):
    """Column-wise softmax (normalises along axis 0).

    The per-column maximum is subtracted before exponentiating. This leaves
    the result mathematically unchanged but prevents ``exp`` from
    overflowing to ``inf`` (and the ratio from becoming NaN) for large
    inputs.

    Args:
        x: Array-like of logits; classes are along axis 0.

    Returns:
        ndarray of the same shape as ``x`` whose entries along axis 0 sum to 1.
    """
    x = np.asarray(x, dtype=float)
    shifted = x - np.max(x, axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0, keepdims=True)

**Weight Initialisation**

In [6]:
def initialize_parameters(input_size, n, output_size):
    """Create randomly initialised weights and biases for a dense network.

    Args:
        input_size: Number of input features.
        n: Sequence of hidden-layer widths; may be empty, in which case the
            network is a single input -> output layer.
        output_size: Number of output units.

    Returns:
        dict with keys ``'W1', 'b1', ..., 'WL', 'bL'`` where ``L = len(n) + 1``.
        ``Wk`` has shape ``(units_k, units_{k-1})`` and ``bk`` has shape
        ``(units_k, 1)``.
    """
    layer_sizes = [input_size, *n, output_size]
    parameters = {}
    # Draw W then b for each layer in order, so results for a given NumPy
    # seed are the same as with the previous hand-unrolled version.
    for layer, (fan_in, fan_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:]), start=1):
        parameters['W' + str(layer)] = np.random.randn(fan_out, fan_in) * 0.01
        # NOTE(review): biases use unscaled randn (std 1) while weights are
        # scaled by 0.01; kept as-is to preserve training behaviour.
        parameters['b' + str(layer)] = np.random.randn(fan_out, 1)
    return parameters

**Forward Propagation**

In [7]:
def linear(W, X, b, activation_func):
    """Compute one dense layer: pre-activation ``W @ X + b`` and its activation.

    Args:
        W: Weight matrix.
        X: Layer input; must be compatible with ``np.matmul(W, X)``.
        b: Bias, added to the product by broadcasting.
        activation_func: One of ``'sigmoid'``, ``'relu'``, ``'tanh'``,
            ``'softmax'``.

    Returns:
        Tuple ``(h, a)`` of the pre-activation and the activated output.

    Raises:
        ValueError: If ``activation_func`` is not a supported name
            (previously this surfaced as an ``UnboundLocalError``).
    """
    h = np.matmul(W, X) + b
    if activation_func == 'sigmoid':
        a = sigmoid(h)
    elif activation_func == 'relu':
        a = relu(h)
    elif activation_func == 'tanh':
        a = tanh(h)
    elif activation_func == 'softmax':
        a = softmax(h)
    else:
        raise ValueError("Unknown activation function: " + repr(activation_func))
    return h, a

def ForwardPropagation(X, parameters, activation_func):
    """Run the input through every layer and record each layer's results.

    Args:
        X: Input to the first layer.
        parameters: dict holding ``'Wk'`` / ``'bk'`` for every layer ``k``;
            the number of layers is taken as ``len(parameters) // 2``.
        activation_func: Sequence of activation names; entry ``k - 1`` is
            used for layer ``k``.

    Returns:
        dict with ``'hk'`` (pre-activation) and ``'ak'`` (activation) for
        every layer ``k``.
    """
    num_layers = len(parameters) // 2
    layer_wise_outputs = {}
    current = X
    # Layer 1 is always evaluated, hence the lower bound of 2 on the stop value.
    for layer in range(1, max(2, num_layers + 1)):
        idx = str(layer)
        pre_activation, current = linear(
            parameters['W' + idx], current, parameters['b' + idx], activation_func[layer - 1]
        )
        layer_wise_outputs['h' + idx] = pre_activation
        layer_wise_outputs['a' + idx] = current
    return layer_wise_outputs

**Loss Functions**

In [8]:
def MSELoss(Y, Y_pred):
    """Mean of the squared element-wise differences between ``Y`` and ``Y_pred``."""
    return np.mean(np.square(Y - Y_pred))

def CrossEntropyLoss(Y, Y_pred, eps=1e-12):
    """Mean element-wise cross-entropy ``-Y * log(Y_pred)``.

    The mean is taken over every element (classes and samples alike), the
    same normalisation as before. Predictions are clamped to at least
    ``eps`` before the logarithm; without that, an exact 0 in ``Y_pred``
    gives ``log(0) = -inf`` and ``0 * -inf = NaN``, poisoning the loss.

    Args:
        Y: One-hot (or probability) targets.
        Y_pred: Predicted probabilities, same shape as ``Y``.
        eps: Lower clamp applied to ``Y_pred``.

    Returns:
        Scalar loss.
    """
    Y = np.asarray(Y, dtype=float)
    safe_pred = np.maximum(np.asarray(Y_pred, dtype=float), eps)
    return np.mean(-Y * np.log(safe_pred))

def cost(Y, Y_pred, loss_func):
    """Evaluate the loss selected by name.

    Args:
        Y: Targets.
        Y_pred: Predictions.
        loss_func: ``'MSE'`` or ``'CE'``.

    Returns:
        The value returned by ``MSELoss`` or ``CrossEntropyLoss``.

    Raises:
        ValueError: If ``loss_func`` is not a supported name. Previously the
            function returned ``None`` in that case, which only failed later
            when the result was used in arithmetic.
    """
    if loss_func == 'MSE':
        return MSELoss(Y, Y_pred)
    if loss_func == 'CE':
        return CrossEntropyLoss(Y, Y_pred)
    raise ValueError("Unknown loss function: " + repr(loss_func))


**Back Propagation**

In [9]:
def ActivationBackward(dA, Z, activation_func):
    """Back-propagate through an element-wise activation.

    Args:
        dA: Gradient of the loss with respect to the activation output.
        Z: Pre-activation values the activation was applied to.
        activation_func: ``'sigmoid'``, ``'relu'``, ``'tanh'`` or ``'softmax'``.

    Returns:
        ``dA * f'(Z)``, the gradient with respect to ``Z``.

    Raises:
        ValueError: If ``activation_func`` is not a supported name
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if activation_func == 'sigmoid':
        s = sigmoid(Z)
        grad = s * (1 - s)
    elif activation_func == 'relu':
        # The forward relu() in this file is leaky: it scales non-positive
        # inputs by 0.01. The derivative must use the same slope rather
        # than 0, otherwise the gradient does not match the forward pass.
        grad = np.where(Z > 0, 1.0, 0.01)
    elif activation_func == 'tanh':
        grad = 1 - tanh(Z) ** 2
    elif activation_func == 'softmax':
        # Element-wise s * (1 - s) only: this is the diagonal of the softmax
        # Jacobian, the cross-class terms are not included.
        s = softmax(Z)
        grad = s * (1 - s)
    else:
        raise ValueError("Unknown activation function: " + repr(activation_func))
    dZ = dA * grad
    return dZ

def softmax_derivative(x):
    """Element-wise ``s * (1 - s)`` where ``s = softmax(x)``."""
    s = softmax(x)
    return s * (1 - s)
    
def LayerBackward(dZl, Wl, bl, A_prev):
    """Gradients of one dense layer given the gradient at its pre-activation.

    Args:
        dZl: Gradient with respect to this layer's pre-activation.
        Wl: This layer's weight matrix.
        bl: This layer's bias (used only for the shape check).
        A_prev: Input that was fed to this layer; its second dimension is
            used as the batch size for averaging.

    Returns:
        Tuple ``(dWl, dbl, dA_prev)``; ``dWl`` and ``dbl`` are averaged over
        the batch, ``dA_prev`` is not.
    """
    batch = A_prev.shape[1]
    scale = 1 / batch

    dWl = scale * (dZl @ A_prev.T)
    dbl = scale * dZl.sum(axis=1, keepdims=True)
    dA_prev = Wl.T @ dZl

    # Every gradient must have the shape of the quantity it belongs to.
    for grad, reference in ((dA_prev, A_prev), (dWl, Wl), (dbl, bl)):
        assert grad.shape == reference.shape
    return dWl, dbl, dA_prev
   
def BackPropagate(parameters, layer_wise_outputs, X, Y, activation_func, loss):
    """Compute gradients for every layer by back-propagation.

    Args:
        parameters: dict of ``'Wk'`` / ``'bk'`` per layer.
        layer_wise_outputs: dict of ``'hk'`` / ``'ak'`` per layer as produced
            by the forward pass; the layer count is ``len(...) // 2``.
        X: Network input for this batch.
        Y: Targets for this batch, same shape as the final activation.
        activation_func: Sequence of activation names; entry ``k - 1``
            belongs to layer ``k``.
        loss: ``'CE'`` or ``'MSE'``.

    Returns:
        dict with ``'dWk'``, ``'dbk'``, ``'dhk'`` for every layer ``k`` and
        ``'dak'`` for ``k = 0 .. L-1``.

    Raises:
        ValueError: If ``loss`` is not a supported name (previously this
            surfaced later as a ``KeyError`` on the missing output gradient).
    """
    gradients = {}
    l = len(layer_wise_outputs) // 2
    AL = layer_wise_outputs['a' + str(l)]
    HL = layer_wise_outputs['h' + str(l)]

    # Gradient at the output layer's pre-activation.
    if loss == 'CE':
        # AL - Y is the combined softmax + cross-entropy gradient, so this
        # branch assumes the final activation is softmax.
        gradients['dh' + str(l)] = AL - Y
    elif loss == 'MSE':
        gradients['dh' + str(l)] = (AL - Y) * softmax_derivative(HL)
    else:
        raise ValueError("Unknown loss function: " + repr(loss))

    # Walk backwards from the last layer to layer 2; each step yields the
    # gradients of layer i+1 and the pre-activation gradient of layer i.
    for i in range(l - 1, 0, -1):
        gradients['dW' + str(i + 1)], gradients['db' + str(i + 1)], gradients['da' + str(i)] = LayerBackward(
            gradients['dh' + str(i + 1)],
            parameters['W' + str(i + 1)],
            parameters['b' + str(i + 1)],
            layer_wise_outputs['a' + str(i)],
        )
        gradients['dh' + str(i)] = ActivationBackward(
            gradients['da' + str(i)], layer_wise_outputs['h' + str(i)], activation_func[i - 1]
        )

    # The first layer's input is X itself rather than a stored activation.
    gradients['dW' + str(1)], gradients['db' + str(1)], gradients['da' + str(0)] = LayerBackward(
        gradients['dh' + str(1)], parameters['W' + str(1)], parameters['b' + str(1)], X
    )

    return gradients


# parameters = initialize_parameters(2, [1,2,3], 2)
# activation_func = ['sigmoid','sigmoid','sigmoid','softmax']
# X = np.array([1,2]).reshape(2,1)  
# Y = np.array([1,2]).reshape(2,1)
# loss = 'CE'

# layer_wise_outputs = ForwardPropagation(X, parameters, activation_func)
# print(layer_wise_outputs)
# print(BackPropagate(parameters, layer_wise_outputs, X, Y, activation_func, loss))
# print(parameters['W3'].shape)
# print(BackPropagate(parameters, layer_wise_outputs, X, Y, activation_func, loss)['dW3'].shape)

**Optimisers**

In [10]:
def sgd(parameters, gradients, learning_rate):
    """Apply one plain gradient-descent step to every weight and bias.

    The entries of ``parameters`` are replaced (the dict is updated in
    place) and the same dict is returned.
    """
    num_layers = len(parameters) // 2
    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            parameters[key] = parameters[key] - learning_rate * gradients["d" + key]
    return parameters

def mgd(parameters, gradients, learning_rate, beta, previous_updates):
    """Gradient descent with momentum (exponential moving average of gradients).

    For every parameter ``p`` with gradient ``g``::

        v = beta * v + (1 - beta) * g
        p = p - learning_rate * v

    Args:
        parameters: dict of ``'Wk'`` / ``'bk'``; updated in place.
        gradients: dict of ``'dWk'`` / ``'dbk'``.
        learning_rate: Step size.
        beta: Momentum coefficient.
        previous_updates: dict holding the velocity ``v`` per parameter key.
            It is updated in place and must be the SAME dict on every call
            for momentum to accumulate; missing keys start at 0.

    Returns:
        The updated ``parameters`` dict.
    """
    L = len(parameters) // 2
    for l in range(1, L + 1):
        for prefix in ("W", "b"):
            key = prefix + str(l)
            # Only default to 0 when no velocity exists yet. Resetting it on
            # every call (as before) discarded the history and reduced this
            # to plain SGD with a step of learning_rate * (1 - beta).
            velocity = beta * previous_updates.get(key, 0) + (1 - beta) * gradients["d" + key]
            previous_updates[key] = velocity
            parameters[key] = parameters[key] - learning_rate * velocity
    return parameters

def nesterov(parameters, gradients, learning_rate, beta, previous_updates=None):
    """Nesterov-accelerated momentum step.

    The original line was an unfinished ``def`` (no colon, no body), which is
    a SyntaxError and prevented the whole cell from running. This
    implementation uses the same moving-average convention as ``mgd`` and the
    common reformulation of Nesterov momentum that only needs the gradient at
    the current parameters::

        v = beta * v + (1 - beta) * g
        p = p - learning_rate * (beta * v + (1 - beta) * g)

    Args:
        parameters: dict of ``'Wk'`` / ``'bk'``; updated in place.
        gradients: dict of ``'dWk'`` / ``'dbk'``.
        learning_rate: Step size.
        beta: Momentum coefficient.
        previous_updates: Optional dict holding the velocity per parameter
            key; updated in place. Pass the same dict on every call to carry
            momentum between steps. When omitted, velocities start at 0 and
            are not retained.

    Returns:
        The updated ``parameters`` dict.
    """
    if previous_updates is None:
        previous_updates = {}
    L = len(parameters) // 2
    for l in range(1, L + 1):
        for prefix in ("W", "b"):
            key = prefix + str(l)
            grad = gradients["d" + key]
            velocity = beta * previous_updates.get(key, 0) + (1 - beta) * grad
            previous_updates[key] = velocity
            # Look-ahead step: apply the freshly updated velocity once more
            # together with the current gradient.
            parameters[key] = parameters[key] - learning_rate * (beta * velocity + (1 - beta) * grad)
    return parameters




**Training Our Model on Fashion-MNIST**


In [34]:
# Model architecture: [input size, list of hidden-layer sizes, output size]
n = [784,[64,32],10]
activation_func = ['sigmoid','sigmoid','softmax']
loss = 'CE'
batch_size = 200
learning_rate = 0.25
epochs = 30
seed = 42

# Seed NumPy so the random initialisation (and therefore the run) is repeatable.
np.random.seed(seed)

m = X.shape[1]  # number of training samples (columns of X)
parameters = initialize_parameters(n[0],n[1],n[2])

# Derive the output layer's key from the parameters themselves. len(n) is 3
# for any number of hidden layers, so it only matched the layer count by
# coincidence for two hidden layers.
num_layers = len(parameters) // 2
output_key = 'a' + str(num_layers)

# Per-batch losses. This must be initialised here; the loop previously
# appended to a name that was never defined in the notebook.
losses = []
# Momentum state; only needed if mgd is swapped in below. It lives outside
# the epoch loop so the velocity would persist across epochs.
previous_updates = {}

for count in range(1, epochs + 1):
  training_loss = 0
  for i in range(0, m, batch_size):
    # Slicing past the end simply yields a shorter final batch, so no
    # special-casing (and no modification of batch_size) is required.
    X_batch = X[:, i:i+batch_size]
    y_batch = y[:, i:i+batch_size]
    layer_wise_outputs = ForwardPropagation(X_batch, parameters, activation_func)
    gradients = BackPropagate(parameters, layer_wise_outputs, X_batch, y_batch, activation_func, loss)
    # To train with momentum instead, replace the sgd call with:
    # parameters = mgd(parameters,gradients,learning_rate,0.2,previous_updates)
    parameters = sgd(parameters,gradients,learning_rate)
    batch_loss = cost(y_batch, layer_wise_outputs[output_key], loss)
    losses.append(batch_loss)
    # Weight by the actual batch width so a short final batch is averaged correctly.
    training_loss = training_loss + batch_loss * X_batch.shape[1]
  print("Loss after "+ str(count) +"th epoch =" +str(training_loss/m))

losses = np.array(losses)



Loss after 1th epoch =0.23085865607662218
Loss after 2th epoch =0.22995540218201152
Loss after 3th epoch =0.1952936231283036
Loss after 4th epoch =0.15742045627571466
Loss after 5th epoch =0.13673232254235865
Loss after 6th epoch =0.10294145145691803
Loss after 7th epoch =0.08296023919947375
Loss after 8th epoch =0.07104423060796418
Loss after 9th epoch =0.06268464180247475
Loss after 10th epoch =0.056950837986175375
Loss after 11th epoch =0.0528853940208869
Loss after 12th epoch =0.04999485644650049
Loss after 13th epoch =0.04780004552293244
Loss after 14th epoch =0.046059707395922515
Loss after 15th epoch =0.044613069856188556
Loss after 16th epoch =0.043358746874043314
Loss after 17th epoch =0.04224194305819924
Loss after 18th epoch =0.04123319586554396
Loss after 19th epoch =0.040316774595407764
Loss after 20th epoch =0.03948356009736844
Loss after 21th epoch =0.03872726228681425
Loss after 22th epoch =0.0380416318681412
Loss after 23th epoch =0.03741696394361086
Loss after 24th ep

In [35]:
# Forward pass over the test set with the trained parameters. X_test was
# transposed in the pre-processing cell, so each column is one sample.
test_outputs = ForwardPropagation(X_test, parameters, activation_func)

def softmax_to_label(softmax_output):
    """Return, for each column, the row index holding the largest value."""
    return np.asarray(softmax_output).argmax(axis=0)

# Overwrite the output layer's activations with the predicted class index per
# sample. The layer index comes from the parameters (two entries, W and b, per
# layer) rather than len(n), which is 3 regardless of how many hidden layers
# are configured.
test_outputs['a'+str(len(parameters)//2)] = softmax_to_label(test_outputs['a'+str(len(parameters)//2)])


def accuracy_score(y_true, y_pred):
    """Fraction of positions where ``y_true`` and ``y_pred`` agree.

    Both inputs are converted to arrays first, so plain lists are compared
    element-wise (a bare ``list == list`` yields a single bool). Shapes must
    match exactly, because broadcasting e.g. ``(N,)`` against ``(N, 1)``
    would silently produce a meaningless score.

    Args:
        y_true: Array-like of reference labels.
        y_pred: Array-like of predicted labels, same shape as ``y_true``.

    Returns:
        Accuracy as a float in ``[0, 1]``.

    Raises:
        ValueError: If the shapes differ or the inputs are empty.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            + str(y_true.shape) + " and " + str(y_pred.shape)
        )
    if y_true.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")
    correct = np.count_nonzero(y_true == y_pred)
    return correct / y_true.size

# y_test is one-hot with one row per sample; transposing puts classes on axis 0
# so softmax_to_label recovers the integer class per sample. Computed once and
# reused for both the shape print and the accuracy.
y_test_fin = softmax_to_label(y_test.T)
# Look up the output layer by the real layer count instead of len(n).
y_test_pred = test_outputs['a'+str(len(parameters)//2)]
print(y_test_fin.shape)
print(y_test_pred.shape)
print(f"Test Accuracy = {100*accuracy_score( y_test_fin, y_test_pred)} %")
    

(10000,)
(10000,)
Test Accuracy = 86.27 %
