In [1]:
import numpy as np
from torchvision.datasets import MNIST

In [2]:
train = MNIST(root=".", train=True, download=True)
train_x = train.data.numpy().astype(np.float32) / 255
train_x = train_x.reshape(len(train_x), -1, 1)
train_y = train.targets.numpy() # .astype(np.float32)


test = MNIST(root=".", train=False, download=True)
test_x = test.data.numpy().astype(np.float32) / 255
test_x = test_x.reshape(len(test_x), -1, 1)
test_y = test.targets.numpy() # .astype(np.float32)

In [3]:
class Module:
    
    def __init__(self):
        self.activation = None
        self.input = []
        self.gradient = dict()
        
    def forward(self, x):
        raise NotImplementedError
        
    def backward(self): 
        raise NotImplementedError

In [4]:
class Linear(Module):
    
    def __init__(self, in_dim, out_dim):
        super(Linear, self).__init__()
        
        self.weights = np.random.randn(out_dim, in_dim) * np.sqrt(2 / in_dim)
        self.bias = np.random.rand(out_dim, 1) * np.sqrt(2 / in_dim)
        
    def forward(self, x):
        self.input.append(x)
        self.activation = self.weights @ x + self.bias
        return self.activation
    
    def backward(self, gradient):
        self.gradient["weights"] = np.array(self.input[0])
        self.gradient["bias"] = np.ones_like(gradient) # is this shape ok?
        
        gradient_x = self.weights
        
        self.gradient["weights"] = gradient @ self.gradient["weights"].T
        self.gradient["bias"] = self.gradient["bias"] * gradient
        gradient_x = gradient_x.T @ gradient
        return gradient_x

In [5]:
class ReLU(Module):
    
    def __init__(self):
        super(ReLU, self).__init__()
        
    def forward(self, x):
        self.input.append(x)
        self.activation = np.where(x < 0, 0, x)
        return self.activation
    
    def backward(self, gradient):
        gradient_x = np.where(self.input[0] > 0, 1.0, 0.0)
        gradient_x = gradient_x * gradient
        return gradient_x

In [6]:
class Softmax(Module):
    
    def __init__(self):
        super(Softmax, self).__init__()
        
    def forward(self, x):
        self.input.append(x)
        self.activation = np.exp(x) / np.sum(np.exp(x))
        return self.activation
    
    def backward(self, gradient):
        # compute gradient with respect to self.activation
        # we don't use self.input here, because of how Softmax derivative works.
        gradient_x = -self.activation @ self.activation.T
        gradient_x[range(gradient_x.shape[0]), range(gradient_x.shape[1])] += self.activation.reshape(-1)
        
        # apply chain rule; multiply with with gradient from the next layer
        gradient_x = gradient_x @ gradient
        
        return gradient_x
        

In [7]:
class CrossEntropyLoss(Module):
    
    def __init__(self):
        super(CrossEntropyLoss, self).__init__()
    
    def forward(self, x, target):
        self.input.append(x)
        self.input.append(target)
        self.activation = -np.log(x.take(target, axis=0))
        return self.activation
    
    def backward(self):
        onehot_target = np.zeros_like(self.input[0])
        onehot_target[self.input[1]] = 1
        gradient = -onehot_target * 1 / self.input[0]
        return gradient

In [8]:
class Model():
    
    def __init__(self, in_dim, out_dim):
        
        self.linear0 = Linear(in_dim, 128)
        self.relu0 = ReLU()
        self.linear1 = Linear(128, 256)
        self.relu1 = ReLU()
        self.linear2 = Linear(256, 128)
        self.relu2 = ReLU()
        self.out = Linear(128, out_dim)
        self.softmax = Softmax()
        self.loss = CrossEntropyLoss()
        
        self.modules = [self.linear0, self.relu0, self.linear1, self.relu1, self.linear2, self.relu2, self.out, self.softmax]
    
    def forward(self, x, target=None):
        for module in self.modules:
            x = module.forward(x)
            if target is None:
                module.gradients = dict()
                module.input = []
                module.activations = None
            
        y = x
        if target is not None:
            l = self.loss.forward(y, target)
            return y, l
        else:
            return y
            
    
    def backward(self):
        g = self.loss.backward()
        for module in self.modules[::-1]:
            g = module.backward(g)
            
    def step(self, lr=0.001):
        for module in self.modules:
            for param_name, gradient_values in module.gradient.items():
                setattr(module, param_name, getattr(module, param_name) - lr * gradient_values)
                
            module.gradients = dict()
            module.input = []
            module.activations = None
        

In [20]:
BATCH_SIZE = 1
N_EPOCHS = 100
LR = 0.001

print(train_x.shape)

(60000, 784, 1)


In [23]:
model = Model(train_x.shape[1], len(np.unique(train_y)))

sample_index = np.random.randint(len(train_x))
sample_x = train_x[sample_index]
sample_y = train_y[sample_index]
print(sample_y)

for i in range(N_EPOCHS):
    print("EPOCH:", i)
    y, l = model.forward(sample_x, sample_y)
    print("[", *["%.3f " % p for p in y.reshape(-1)], "]", sep="")
    
    model.backward()
    model.step(lr=LR)
    print("Loss:", l[0])
    print("-----------------------------------------------------------")

4
EPOCH: 0
[0.093 0.069 0.118 0.080 0.067 0.074 0.256 0.062 0.139 0.041 ]
Loss: 2.703386862476409
-----------------------------------------------------------
EPOCH: 1
[0.092 0.070 0.111 0.078 0.089 0.067 0.250 0.061 0.143 0.040 ]
Loss: 2.4216832176735994
-----------------------------------------------------------
EPOCH: 2
[0.088 0.069 0.099 0.073 0.125 0.059 0.246 0.061 0.145 0.036 ]
Loss: 2.082739251514887
-----------------------------------------------------------
EPOCH: 3
[0.081 0.066 0.085 0.067 0.177 0.051 0.239 0.059 0.142 0.033 ]
Loss: 1.73274312284929
-----------------------------------------------------------
EPOCH: 4
[0.071 0.065 0.070 0.057 0.266 0.041 0.207 0.058 0.138 0.028 ]
Loss: 1.3238017142738778
-----------------------------------------------------------
EPOCH: 5
[0.057 0.058 0.052 0.043 0.389 0.033 0.171 0.054 0.120 0.023 ]
Loss: 0.9440253052041981
-----------------------------------------------------------
EPOCH: 6
[0.043 0.049 0.037 0.031 0.539 0.022 0.120 0.043 0.

[0.000 0.001 0.000 0.000 0.996 0.000 0.001 0.000 0.001 0.000 ]
Loss: 0.003786430998724024
-----------------------------------------------------------
EPOCH: 52
[0.000 0.001 0.000 0.000 0.996 0.000 0.001 0.000 0.001 0.000 ]
Loss: 0.003688513453771797
-----------------------------------------------------------
EPOCH: 53
[0.000 0.001 0.000 0.000 0.996 0.000 0.001 0.000 0.001 0.000 ]
Loss: 0.0035945599348278664
-----------------------------------------------------------
EPOCH: 54
[0.000 0.000 0.000 0.000 0.996 0.000 0.001 0.000 0.001 0.000 ]
Loss: 0.003506663473758732
-----------------------------------------------------------
EPOCH: 55
[0.000 0.000 0.000 0.000 0.997 0.000 0.001 0.000 0.001 0.000 ]
Loss: 0.0034226937477174354
-----------------------------------------------------------
EPOCH: 56
[0.000 0.000 0.000 0.000 0.997 0.000 0.001 0.000 0.001 0.000 ]
Loss: 0.003340585273289531
-----------------------------------------------------------
EPOCH: 57
[0.000 0.000 0.000 0.000 0.997 0.000 0

In [9]:
model = Model(train_x.shape[1], len(np.unique(train_y)))

for i in range(N_EPOCHS):
    print("EPOCH:", i)
    for batch_idx in range(0, len(train_x), BATCH_SIZE):
        y, l = model.forward(train_x[batch_idx], train_y[batch_idx])
        # print("[", *["%.3f " % p for p in y.reshape(-1)], "]", sep="")
        # print("-----------------------------------------------------------")

        model.backward()
        model.step(lr=LR)
        
    print("Loss:", l[0])
    
    test_pred = model.forward(test_x).reshape(-1, 10)
    print("Test accuracy:", np.sum(np.argmax(model.forward(test_x).reshape(-1, 10), axis=1) == test_y) / len(test_x))


(60000, 784, 1)
EPOCH: 0
Loss: 16.11518230369279
Test accuracy: 0.0892
EPOCH: 1


KeyboardInterrupt: 