In [15]:
import numpy as np

class nn:

  def __init__ (self, layerDim, activations, regulization="", lambd=0, keep_prob=1, minibatch=0, descent="normal", beta=0.9, beta1=0.9, beta2=0.999, epsilon=1e-8):
    self.layerDim = layerDim
    self.activations = activations
    self.layer = len(layerDim)

    self.regulization = regulization
    self.lambd = lambd

    self.warr = []
    self.barr = []
    self.cachesZ = []
    self.cacheA = []

    self.cacheD = []
    self.keep_prob = keep_prob

    self.minibatch = minibatch

    self.descent = descent
    self.vCache = []
    self.sCache = []
    self.beta = beta
    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon
    self.t = 1
    
    self.initParams()


  # def __str__ (self):
  #   return str(self.layerDim) + " " + str(self.activations)

  def initParams (self):
    for i in range(1, self.layer):
        if self.activations[i-1] == 'relu':
            self.warr.append(np.random.randn(self.layerDim[i], self.layerDim[i-1]) * np.sqrt(2. / self.layerDim[i-1]))
        else:  # for sigmoid or tanh
            self.warr.append(np.random.randn(self.layerDim[i], self.layerDim[i-1]) * np.sqrt(1. / self.layerDim[i-1]))
        self.barr.append(np.zeros((self.layerDim[i], 1)))
  
  def sigmoid (self, Z):
    Z = np.clip(Z, -500, 500) 
    return 1/(1+np.exp(-Z))
  
  def relu (self, Z):
    return np.maximum(0, Z)
  
  def tanh (self, Z):
    return np.tanh(Z)
  
  def activationsfunc (self, Z, activation):
    if activation == 'sigmoid':
      return self.sigmoid(Z)
    elif activation == 'relu':
      return self.relu(Z)
    elif activation == 'tanh':
      return self.tanh(Z)
    elif activation == 'softmax':
      return self.softmax(Z)
    else:
      return Z
  
  def activationsDerivative(self, Z, activation):
    if activation == 'sigmoid':
      sig = self.sigmoid(Z)
      return sig * (1 - sig)
    elif activation == 'relu':
      return (Z > 0).astype(Z.dtype)
    elif activation == 'tanh':
      return 1 - np.power(self.tanh(Z), 2)
    else:
      return 1
  
  def singleForward (self, A_prev, W, b, activation, layer=0):
    Z = W @ A_prev + b
    # A = Z
    A = self.activationsfunc(Z, activation)
    if self.regulization == "dropout" and layer != self.layer-2:
      D = np.random.rand(A.shape[0], A.shape[1])
      D = (D < self.keep_prob).astype(int)
      A = A * D
      A = A / self.keep_prob
      self.cacheD.append(D)

    self.cachesZ.append(Z)
    self.cacheA.append(A)
    
    if self.descent == "adam":
      self.vCache.append(np.zeros(W.shape))
      self.sCache.append(np.zeros(W.shape))
    elif self.descent == "momentum":
      self.vCache.append(np.zeros(W.shape))
    elif self.descent == "rmsprop":
      self.vCache.append(np.zeros(W.shape))

    return A

  def forwardProp (self, X):
    A = X
    self.cachesZ = []
    self.cacheA = []
    self.vCache = []
    self.sCache = []
    self.cacheD = [X]
    self.cacheA.append(A)

    for i in range(self.layer-1):
      A = self.singleForward(A, self.warr[i], self.barr[i], self.activations[i], i)
    
    return A
  
  def loss (self, Y, A):
    r = 0
    if self.regulization == 'l2':
      for i in range(self.layer-1):
        r += np.sum(np.square(self.warr[i]))
      r *= self.lambd / (2 * Y.shape[1])

    
    m = Y.shape[1]
    A = np.clip(A, 1e-10, 1 - 1e-10) 

    if self.activations[-1] == 'softmax':
      return - np.sum(Y * np.log(A)) / m + r
    else:
      return - np.sum(Y * np.log(A) + (1 - Y) * np.log(1 - A)) / m + r
  
  def lossDerivative (self, Y, A):
    m = Y.shape[1]
    A = np.clip(A, 1e-10, 1 - 1e-10) 
    
    if self.activations[-1] == 'softmax':
      return A - Y
    else:
      return - (np.divide(Y, A) - np.divide(1 - Y, 1 - A))
  
  def singleBackward (self, dA, W, b, Z, A_prev, activation):
    m = A_prev.shape[1]

    dZ = dA * self.activationsDerivative(Z, activation)
    dW = 1/m * (dZ @ A_prev.T)
    db = 1/m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = W.T @ dZ

    if self.regulization == 'l2':
      dW += self.lambd / m * W
    elif self.regulization == 'dropout':
      dA_prev = dA_prev * self.cacheD.pop()
      dA_prev = dA_prev / self.keep_prob

    return dA_prev, dW, db
  
  def backwardProp (self, Y, A, learning_rate):
    m = Y.shape[1]
    dA = self.lossDerivative(Y, A)

    for i in range(self.layer-1, 0, -1):
      dA, dW, db = self.singleBackward(dA, self.warr[i-1], self.barr[i-1], self.cachesZ[i-1], self.cacheA[i-1], self.activations[i-1])

      if self.descent == "normal":
        self.warr[i-1] -= learning_rate * dW
      elif self.descent == "momentum":
        self.vCache[i-1] = self.beta * self.vCache[i-1] + (1 - self.beta) * dW
        self.warr[i-1] -= learning_rate * self.vCache[i-1]
      elif self.descent == "rmsprop":
        self.sCache[i-1] = self.beta * self.sCache[i-1] + (1 - self.beta) * np.square(dW)
        self.warr[i-1] -= learning_rate * dW / np.sqrt(self.sCache[i-1] + self.epsilon)
      elif self.descent == "adam":
        self.vCache[i-1] = self.beta1 * self.vCache[i-1] + (1 - self.beta1) * dW
        self.sCache[i-1] = self.beta2 * self.sCache[i-1] + (1 - self.beta2) * np.square(dW)
        vCorrected = self.vCache[i-1] / (1 - np.power(self.beta1, self.t))
        sCorrected = self.sCache[i-1] / (1 - np.power(self.beta2, self.t))
        self.t += 1
        self.warr[i-1] -= learning_rate * vCorrected / np.sqrt(sCorrected + self.epsilon)

      self.barr[i-1] -= learning_rate * db

  def oneMinibatchTrain (self, X, Y, learning_rate):
    A = self.predict(X)
    self.backwardProp(Y, A, learning_rate)

  def minibatchTrain (self, X, Y, learning_rate):
    m = X.shape[1]
    noBatch = m // self.minibatch
    for i in range(noBatch):
      X_batch = X[:, i*self.minibatch:(i+1)*self.minibatch]
      Y_batch = Y[:, i*self.minibatch:(i+1)*self.minibatch]
      self.oneMinibatchTrain(X_batch, Y_batch, learning_rate)
    
    if m % self.minibatch != 0:
      X_batch = X[:, noBatch*self.minibatch:]
      Y_batch = Y[:, noBatch*self.minibatch:]
      self.oneMinibatchTrain(X_batch, Y_batch, learning_rate)

  def train (self, X, Y, learning_rate, iterations, print_loss=False):
    if self.minibatch == 0:
      self.minibatch = X.shape[1]
      
    for i in range(iterations):
      self.minibatchTrain(X, Y, learning_rate)
      if i % 100 == 0 and print_loss:
        print(f'Loss after {i} iterations: {self.loss(Y, self.predict(X))}')

  def softmax (self, Z):
    expZ = np.exp(Z - np.max(Z))
    return expZ / expZ.sum(axis=0, keepdims=True)
  
  def predict (self, X):
    return self.forwardProp(X)
  
  def accuracy (self, X, Y):
      A = self.predict(X)
      return np.mean(np.argmax(Y, axis=0) == np.argmax(A, axis=0))

  def precision (self, X, Y):
      A = self.predict(X)
      A = (A == A.max(axis=0, keepdims=1)).astype(int)
      true_positive = np.sum((Y == 1) & (A == 1))
      predicted_positive = np.sum(A == 1)
      return true_positive / predicted_positive if predicted_positive > 0 else 0
  
      

In [16]:
import cv2
import tensorflow as tf
import matplotlib.pyplot as plt

%matplotlib inline

In [17]:
mnist = tf.keras.datasets.mnist
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

Y_train_mod = np.zeros((10, Y_train.shape[0]))

for i in range(Y_train.shape[0]):
  Y_train_mod[Y_train[i], i] = 1

Y_test_mod = np.zeros((10, Y_test.shape[0]))

for i in range(Y_test.shape[0]):
  Y_test_mod[Y_test[i], i] = 1

X_train_mod = X_train.reshape(X_train.shape[0], -1).T / 255
X_test_mod = X_test.reshape(X_test.shape[0], -1).T / 255

print("X shape:", X_train.shape)

X shape: (60000, 28, 28)


In [18]:
model = nn([28* 28, 128, 64, 10], ['relu', 'relu', 'softmax'], minibatch=128, regulization="l2", lambd=0.3, descent="momentum", beta=0.9)


model.train(X_train_mod, Y_train_mod, 0.1, 1000, True)

print(model.predict(X_train_mod[:, 0:1]))
print(Y_train_mod[:, 0:1])



Loss after 0 iterations: 0.6052829466673281
Loss after 100 iterations: 0.08779253177471888
Loss after 200 iterations: 0.07130264782932624
Loss after 300 iterations: 0.0664504397251217
Loss after 400 iterations: 0.06442297284211652
Loss after 500 iterations: 0.06328584392106205
Loss after 600 iterations: 0.06255091512168405
Loss after 700 iterations: 0.06200194839479548
Loss after 800 iterations: 0.061613641596161124
Loss after 900 iterations: 0.06133292306306901
[[7.30821917e-06]
 [4.46864153e-06]
 [1.74598111e-05]
 [7.86026431e-02]
 [2.04468830e-09]
 [9.21255229e-01]
 [6.55566051e-07]
 [6.14481043e-05]
 [2.40335006e-05]
 [2.67520728e-05]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [19]:
print("Train accuracy:", model.accuracy(X_train_mod, Y_train_mod))
print("Test accuracy:", model.accuracy(X_test_mod, Y_test_mod))

# print("Train precision:", model.precision(X_train_mod, Y_train_mod))
# print("Test precision:", model.precision(X_test_mod, Y_test_mod))

Train accuracy: 0.9875
Test accuracy: 0.9782


In [20]:
print(model.predict(X_train_mod[:, 0:1]))
print(Y_train_mod[:, 0:1])

[[7.30821917e-06]
 [4.46864153e-06]
 [1.74598111e-05]
 [7.86026431e-02]
 [2.04468830e-09]
 [9.21255229e-01]
 [6.55566051e-07]
 [6.14481043e-05]
 [2.40335006e-05]
 [2.67520728e-05]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [21]:
p = model.predict(X_test_mod[:, 0:1])
i, j = np.unravel_index(p.argmax(), p.shape)
print(i)
print(Y_test[0])

7
7
