In [19]:
import numpy as np
from typing import Tuple, Optional, Union, List
from enum import Enum
from abc import ABC, abstractmethod
from mlxtend.data import loadlocal_mnist
from sklearn.model_selection import train_test_split
import os

In [20]:
def sigmoid(Z):
    return 1/(1+np.exp(-Z))

def relu(Z):
    return np.maximum(0,Z)

def softmax(Z):
    e_Z = np.exp(Z - np.max(Z, axis=1, keepdims=True))
    return e_Z / e_Z.sum(axis = 1, keepdims=True)

def sigmoid_backward(dA, Z):
    sig = sigmoid(Z)
    return dA * sig * (1 - sig)

def relu_backward(dA, Z):
    dZ = np.array(dA, copy = True)
    dZ[Z <= 0] = 0
    return dZ

In [21]:
class Activation(Enum):
    RELU = ("relu", relu, relu_backward)
    SIGMOID = ("sigmoid", sigmoid, sigmoid_backward)
    SOFTMAX = ("softmax", softmax, lambda dA_curr, Z_curr: dA_curr)

In [22]:
class Layer(ABC):
    
    @abstractmethod
    def build(self, input_shape: tuple) -> tuple:
        pass
    
    @abstractmethod
    def forward_pass(self, input: np.array) -> np.array:
        pass
        
    @abstractmethod
    def backward_pass(self, input: np.array) -> np.array:
        pass
    
    def _init_weights(self, shape: tuple) -> np.array:
        return np.random.normal(0.0, 0.1, shape)

In [23]:
class Convolution(Layer):
    def __init__(self, 
                 filters: int, 
                 kernel_size: Tuple[int, int, int], 
                 padding: int, 
                 stride: int, 
                 activation: Activation):
        pass
    
    def build(self, input_shape: tuple) -> tuple:
        pass
    
    def forward_pass(self, input: np.array) -> np.array:
        pass
    
    def backward_pass(self, input: np.array) -> np.array:
        pass

In [24]:
class MaxPool(Layer):
    def __init__(self):
        pass
    
    def build(self, input_shape: tuple) -> tuple:
        pass
    
    def forward_pass(self, input: np.array) -> np.array:
        pass
    
    def backward_pass(self, input: np.array) -> np.array:
        pass

In [25]:
class GlobalAveragePool(Layer):
    def __init__(self):
        pass
    
    def build(self, input_shape: tuple) -> tuple:
        pass
    
    def forward_pass(self, input: np.array) -> np.array:
        pass
    
    def backward_pass(self, input: np.array) -> np.array:
        pass

In [26]:
def adam(w, dw, config=None):
    """
    Uses the Adam update rule, which incorporates moving averages of both the
    gradient and its square and a bias correction term.
    config format:
    - learning_rate: Scalar learning rate.
    - beta1: Decay rate for moving average of first moment of gradient.
    - beta2: Decay rate for moving average of second moment of gradient.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - m: Moving average of gradient.
    - v: Moving average of squared gradient.
    - t: Iteration number.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', np.zeros_like(w))
    config.setdefault('v', np.zeros_like(w))
    config.setdefault('t', 0)

    next_w = None
    ###########################################################################
    # TODO: Implement the Adam update formula, storing the next value of w in #
    # the next_w variable. Don't forget to update the m, v, and t variables   #
    # stored in config.                                                       #
    #                                                                         #
    # NOTE: In order to match the reference output, please modify t _before_  #
    # using it in any calculations.                                           #
    ###########################################################################
    eps, learning_rate = config['epsilon'], config['learning_rate']
    beta1, beta2 = config['beta1'], config['beta2']
    m, v, t = config['m'], config['v'], config['t']
    # Adam
    t = t + 1
    m = beta1 * m + (1 - beta1) * dw          # momentum
    mt = m / (1 - beta1**t)                   # bias correction
    v = beta2 * v + (1 - beta2) * (dw * dw)   # RMSprop
    vt = v / (1 - beta2**t)                   # bias correction
    next_w = w - learning_rate * mt / (np.sqrt(vt) + eps)
    # update values
    config['m'], config['v'], config['t'] = m, v, t
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################

    return next_w, config

In [50]:
class Dense(Layer):
    def __init__(self, 
                 units: int, 
                 activation: Activation):
        
        self._units = units
        self._activation = activation
        
        self._A = None
        self._W = None
        self._b = None
        self._Z = None
        
        self._dA = None
        self._dW = None
        self._db = None
        self._dZ = None
        
        self._W_config = None
        self._b_config = None
        
    def __str__(self):
        return "Dense: {: <25} {: <25} {: <25}".format(
            "W.shape={}".format(self._W.shape), 
            "b.shape={}".format(self._b.shape), 
            "activation={}".format(self._activation.value[0])
        )
    
    def build(self, input_shape: tuple) -> tuple:
        self._W = self._init_weights((input_shape[1], self._units))
        self._b = self._init_weights((self._units,))
        return (None, self._units)
    
    def forward_pass(self, input: np.array) -> np.array:
        self._A = input
        
        self._Z = input.dot(self._W) + self._b
        return self._activation.value[1](self._Z)
    
    def backward_pass(self, input: np.array) -> np.array:
        N = input.shape[0]
        self._dZ = self._activation.value[2](input, self._Z)
        
        self._dW = self._A.T.dot(self._dZ)
        self._db = self._dZ.sum(axis=0)
        self._dA = self._dZ.dot(self._W.T) 
        return self._dA
    
    def update(self, lr: float) -> None:
#         self._W -= lr * self._dW
#         self._b -= lr * self._db
        
        self._W, self._W_config = adam(self._W, self._dW, self._W_config)
        self._b, self._b_config = adam(self._b, self._db, self._b_config)

In [51]:
class Model:
    def __init__(self, layers: Layer):
        
        self._layers = layers
        self._compiled = False
    
    def build(self, input_shape: tuple) -> None:
        shape = input_shape
        for layer in self._layers:
            shape = layer.build(shape)
        self._compiled = True
            
    def summary(self):
        if not self._compiled:
            return
        
        for layer in self._layers:
            print(layer)
            print('-' * 80)
            
    def fit(self, X: np.array, y: np.array, epochs: int, lr: float, batch_size: int) -> None:
        
        for i in range(epochs):
            Y_hat = self._forward_pass(X=X)
            loss, grads = self.softmax_loss(Y_hat, y)
            print("LOSS:", loss)
            print("ACC:", self._multi_class_accuracy(
                Y_hat, y))

            self._backward_pass(grads=grads)
            self._update(lr=lr)
            
    def _forward_pass(self, X: np.array) -> np.array:
        activations = X
        for layer in self._layers: 
            activations = layer.forward_pass(input=activations)
        return activations
    
    def _backward_pass(self, grads: np.array) -> None:
        activations = grads
        for layer in reversed(self._layers): 
            activations = layer.backward_pass(input=activations)
            
    def _update(self, lr: float) -> None:
        for layer in self._layers: 
            layer.update(lr=lr)
    
    def _multi_class_cross_entropy_loss(self, Y_hat, Y):
        N = Y_hat.shape[0]
        Y = one_hot_encoding(Y)
        loss = - np.sum(np.log(Y_hat) * Y) / N
        return loss
    
    def _multi_class_accuracy(self, Y_hat, Y):
        n_values = Y_hat.shape[1]
        values = Y_hat.argmax(axis=1)
        Y_hat_one_hot = np.eye(n_values)[values]
        return (Y_hat_one_hot == one_hot_encoding(Y)).all(axis=1).mean()
    
    def svm_loss(self, x, y):
        """
        Computes the loss and gradient using for multiclass SVM classification.

        Inputs:
        - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
          class for the ith input.
        - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
          0 <= y[i] < C

        Returns a tuple of:
        - loss: Scalar giving the loss
        - dx: Gradient of the loss with respect to x
        """
        N = x.shape[0]
        correct_class_scores = x[np.arange(N), y]
        margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0)
        margins[np.arange(N), y] = 0
        loss = np.sum(margins) / N
        num_pos = np.sum(margins > 0, axis=1)
        dx = np.zeros_like(x)
        dx[margins > 0] = 1
        dx[np.arange(N), y] -= num_pos
        dx /= N
        return loss, dx

    
    def softmax_loss(self, x, y):
        """
        Computes the loss and gradient for softmax classification.

        Inputs:
        - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
          class for the ith input.
        - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
          0 <= y[i] < C

        Returns a tuple of:
        - loss: Scalar giving the loss
        - dx: Gradient of the loss with respect to x
        """
        probs = np.exp(x - np.max(x, axis=1, keepdims=True))
        probs /= np.sum(probs, axis=1, keepdims=True)
        N = x.shape[0]
        loss = -np.sum(np.log(probs[np.arange(N), y])) / N
        dx = probs.copy()
        dx[np.arange(N), y] -= 1
        dx /= N
        return loss, dx


## Settings

In [52]:
# number of samples in the data set
N_SAMPLES = 1000
# ratio between training and test sets
TEST_SIZE = 0.1
# size of the photo
PHOTO_SIZE = 28
# number of pixels in the photo
PIXEL_NUMBER = PHOTO_SIZE * PHOTO_SIZE
# number of train epochs
EPOCHS = 1000
# learning rate value
LR = 0.1

## Data

In [53]:
def download_mnist_dataset():
    # The MNIST data set is available at http://yann.lecun.com, let's use curl to download it
    if not os.path.exists("train-images-idx3-ubyte"):
        !curl -O http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
        !curl -O http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
        !curl -O http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
        !curl -O http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
        !gunzip t*-ubyte.gz
        
    # Let's use loadlocal_mnist available in mlxtend.data to get data in numpy array form.
    X1, y1 = loadlocal_mnist(
        images_path="train-images-idx3-ubyte", 
        labels_path="train-labels-idx1-ubyte")

    X2, y2 = loadlocal_mnist(
        images_path="t10k-images-idx3-ubyte", 
        labels_path="t10k-labels-idx1-ubyte")
    
    # We normalize the brightness values for pixels
    X1 = X1.reshape(X1.shape[0], -1) / 255
    X2 = X2.reshape(X2.shape[0], -1) /255

    # Combine downloaded data bundles
    X = np.concatenate([X1, X2])
    y = np.concatenate([y1, y2])
    
    return X, y

In [72]:
def one_hot_encoding(y):
    n_values = np.max(y) + 1
    return np.eye(n_values)[y]

In [85]:
def prepare_data(split_percentage):
    # Download data
    X, y = download_mnist_dataset()
    # One hot encode labels
#     y = one_hot_encoding(y)
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_percentage, random_state=42)
    return X_train, X_test, y_train, y_test

In [86]:
X_train, X_test, y_train, y_test = prepare_data(TEST_SIZE)

In [87]:
X_train = X_train[:2000]
y_train = y_train[:2000]

In [88]:
X_train.shape

(2000, 784)

In [89]:
y_train.shape

(2000,)

## Model

In [90]:
layers = [
    Dense(units=1000, activation=Activation.RELU),
    Dense(units=1000, activation=Activation.RELU),
    Dense(units=500, activation=Activation.RELU),
    Dense(units=500, activation=Activation.RELU),
    Dense(units=10, activation=Activation.RELU)
]

model = Model(layers=layers)
model.build(input_shape=(None, PIXEL_NUMBER))
model.summary()

Dense: W.shape=(784, 1000)       b.shape=(1000,)           activation=relu          
--------------------------------------------------------------------------------
Dense: W.shape=(1000, 1000)      b.shape=(1000,)           activation=relu          
--------------------------------------------------------------------------------
Dense: W.shape=(1000, 500)       b.shape=(500,)            activation=relu          
--------------------------------------------------------------------------------
Dense: W.shape=(500, 500)        b.shape=(500,)            activation=relu          
--------------------------------------------------------------------------------
Dense: W.shape=(500, 10)         b.shape=(10,)             activation=relu          
--------------------------------------------------------------------------------


In [91]:
model.fit(X=X_train, y=y_train, epochs=EPOCHS, lr=LR, batch_size=128)

LOSS: 8.474195808943529
ACC: 0.11
LOSS: 7.006311204467037
ACC: 0.1695
LOSS: 5.853604175800638
ACC: 0.1675
LOSS: 2.1801095140868263
ACC: 0.19
LOSS: 2.125027924504818
ACC: 0.1865
LOSS: 2.137466892511135
ACC: 0.184
LOSS: 2.1169416190799946
ACC: 0.1875
LOSS: 2.093895004847934
ACC: 0.1975
LOSS: 2.0791669939985225
ACC: 0.2035
LOSS: 2.0837937535793825
ACC: 0.208
LOSS: 2.0874939754544486
ACC: 0.2085
LOSS: 2.073972188238211
ACC: 0.2085
LOSS: 2.065692435748252
ACC: 0.207
LOSS: 2.0659438655877596
ACC: 0.205
LOSS: 2.069286290287001
ACC: 0.2045
LOSS: 2.067308884765191
ACC: 0.2045
LOSS: 2.061453864756405
ACC: 0.2055
LOSS: 2.0588426846840955
ACC: 0.207
LOSS: 2.05783896907339
ACC: 0.208
LOSS: 2.0572343471839343
ACC: 0.21
LOSS: 2.057010397971159
ACC: 0.211
LOSS: 2.055258094846513
ACC: 0.211
LOSS: 2.051653668377312
ACC: 0.211
LOSS: 2.050163015334989
ACC: 0.2105
LOSS: 2.050206136045861
ACC: 0.2105
LOSS: 2.0491492708668404
ACC: 0.211
LOSS: 2.0469387795304423
ACC: 0.2115
LOSS: 2.045813896634883
ACC: 0.2115

LOSS: 2.037789249998859
ACC: 0.2145
LOSS: 2.0377892211188144
ACC: 0.2145
LOSS: 2.0377891926816796
ACC: 0.2145
LOSS: 2.037789164757382
ACC: 0.2145
LOSS: 2.0377891374030788
ACC: 0.2145
LOSS: 2.0377891106607655
ACC: 0.2145
LOSS: 2.037789084553975
ACC: 0.2145
LOSS: 2.0377890591131957
ACC: 0.2145
LOSS: 2.037789034369066
ACC: 0.2145
LOSS: 2.0377890103260405
ACC: 0.2145
LOSS: 2.0377889869742103
ACC: 0.2145
LOSS: 2.0377889643107823
ACC: 0.2145
LOSS: 2.037788942329325
ACC: 0.2145
LOSS: 2.037790351722791
ACC: 0.2145
LOSS: 2.0377891085416677
ACC: 0.2145
LOSS: 2.037789306514102
ACC: 0.2145
LOSS: 2.0377895115097893
ACC: 0.2145
LOSS: 2.0377897197259793
ACC: 0.2145
LOSS: 2.037789926750527
ACC: 0.2145
LOSS: 2.037790127950158
ACC: 0.2145
LOSS: 2.0377903186740243
ACC: 0.2145
LOSS: 2.037790494482899
ACC: 0.2145
LOSS: 2.0377906514361874
ACC: 0.2145
LOSS: 2.0377907861937747
ACC: 0.2145
LOSS: 2.0377908963216638
ACC: 0.2145
LOSS: 2.0377909803523924
ACC: 0.2145
LOSS: 2.0377910378005057
ACC: 0.2145
LOSS: 2.037

LOSS: 2.037788531624502
ACC: 0.2145
LOSS: 2.0377885169919585
ACC: 0.2145
LOSS: 2.037788503118758
ACC: 0.2145
LOSS: 2.0377884898218475
ACC: 0.2145
LOSS: 2.0377884770683314
ACC: 0.2145
LOSS: 2.037788464828457
ACC: 0.2145
LOSS: 2.037788453074161
ACC: 0.2145
LOSS: 2.0377884417789995
ACC: 0.2145
LOSS: 2.037789018555037
ACC: 0.2145
LOSS: 2.037788488047785
ACC: 0.2145
LOSS: 2.0377885437381265
ACC: 0.2145
LOSS: 2.0377885976089347
ACC: 0.2145
LOSS: 2.0377886485271643
ACC: 0.2145
LOSS: 2.037788695891717
ACC: 0.2145
LOSS: 2.037788739224293
ACC: 0.2145
LOSS: 2.037788778164177
ACC: 0.2145
LOSS: 2.0377888124607058
ACC: 0.2145
LOSS: 2.037788841967596
ACC: 0.2145
LOSS: 2.0377888666692408
ACC: 0.2145
LOSS: 2.0377888866006297
ACC: 0.2145
LOSS: 2.0377889018877413
ACC: 0.2145
LOSS: 2.0377889127302997
ACC: 0.2145
LOSS: 2.037788919390789
ACC: 0.2145
LOSS: 2.0377889221731174
ACC: 0.2145
LOSS: 2.0377889214094327
ACC: 0.2145
LOSS: 2.037788917448063
ACC: 0.2145
LOSS: 2.037788910643018
ACC: 0.2145
LOSS: 2.037788

LOSS: 2.037788284614629
ACC: 0.2145
LOSS: 2.0377883359808613
ACC: 0.2145
LOSS: 2.03778838719585
ACC: 0.2145
LOSS: 2.037788437463249
ACC: 0.2145
LOSS: 2.037788485623632
ACC: 0.2145
LOSS: 2.037788530953307
ACC: 0.2145
LOSS: 2.037788572847657
ACC: 0.2145
LOSS: 2.037788610806837
ACC: 0.2145
LOSS: 2.0377886444610325
ACC: 0.2145
LOSS: 2.037788673544738
ACC: 0.2145
LOSS: 2.0377886979229243
ACC: 0.2145


KeyboardInterrupt: 