In [15]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
# Location of the UCI Iris CSV. Column names are passed explicitly via `names=`,
# so pandas treats every row of the file as data.
iris_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_df = pd.read_csv(iris_url, names=iris_columns)

# Plain NumPy copy of the table: four measurement columns followed by the class column.
iris_np = iris_df.values

In [17]:
# Integer id assigned to each species name stored in the last column of iris_np.
map_dict = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
}


def _encode_label(label):
    """Return the integer id for a species name.

    Labels that are already integers are returned unchanged so that this cell,
    which overwrites iris_np in place, can be re-run without a KeyError.
    Unknown species names still raise KeyError.
    """
    if isinstance(label, (int, np.integer)):
        return label
    return map_dict[label]


map_func = np.vectorize(_encode_label)
# In-place replacement of the class column with integer ids.
iris_np[:, -1] = map_func(iris_np[:, -1])

In [18]:
# Hold-out split: 10 rows are drawn without replacement from each of the row
# ranges 0-49, 50-99 and 100-149 (assumes the file lists the three classes in
# consecutive blocks of 50 rows -- verify if the data source changes).
np.random.seed(0)  # fixed seed so Restart & Run All reproduces the split and later random draws
test_indices = np.concatenate([
    np.random.choice(np.arange(start, start + 50), size=10, replace=False)
    for start in (0, 50, 100)
])
# setdiff1d returns the sorted, unique complement, so the training order no
# longer depends on Python set iteration order.
train_indices = np.setdiff1d(np.arange(0, 150), test_indices)

In [19]:
# Select the rows of each partition from the full array by integer index.
train = iris_np[train_indices, :]
test = iris_np[test_indices, :]

def process(dataset, n_classes=None):
    """Split a labelled array into a float feature matrix and one-hot targets.

    Parameters
    ----------
    dataset : 2-D array
        Last column holds integer class ids; all other columns hold values
        convertible to float.
    n_classes : int, optional
        Width of the one-hot encoding. When omitted it is inferred as
        ``max(label) + 1`` (the original behaviour). Pass it explicitly when a
        split might not contain the highest class id, otherwise the encoding
        would come out narrower than the one used for training.

    Returns
    -------
    X : ndarray of float, shape (n_samples, n_features)
    Y_one_hot : ndarray of float, shape (n_samples, n_classes)
    """
    X = dataset[:, :-1].astype(float)
    Y = dataset[:, -1].astype(int)
    if n_classes is None:
        # An empty dataset has no labels to infer the width from.
        n_classes = int(Y.max()) + 1 if Y.size else 0
    Y_one_hot = np.zeros((len(Y), n_classes))
    # Row r gets a 1 in the column given by its class id.
    Y_one_hot[np.arange(len(Y)), Y] = 1
    return X, Y_one_hot

# Training features (float matrix) and one-hot labels; later cells read these globals.
X, Y = process(train)

In [20]:
class Layer:
    """Common interface for the layers in this notebook.

    The base implementations do nothing and return None; subclasses override
    ``forward`` and ``backward``.
    """

    def __init__(self):
        """The base class keeps no state."""

    def forward(self, input):
        """Placeholder forward pass: ignores ``input`` and returns None."""
        return None

    def backward(self, input_gradient):
        """Placeholder backward pass: ignores ``input_gradient`` and returns None."""
        return None

In [21]:
class MultiplyLayer(Layer):
    """Affine layer for a single sample: ``output = weights.T @ input + bias``."""

    def __init__(self, input_size, hidden_units):
        """Draw weights of shape (input_size, hidden_units) and a bias of length
        hidden_units from a standard normal distribution."""
        weight_shape = (input_size, hidden_units)
        self.weights = np.random.normal(size=weight_shape)
        self.bias = np.random.normal(size=hidden_units)

    def forward(self, input):
        """Cache ``input`` and return the affine transform of it."""
        self.input = input
        projected = np.dot(self.weights.T, self.input)
        self.output = projected + self.bias
        return self.output

    def backward(self, input_gradient):
        """Store the gradients w.r.t. weights, bias and input; return the input gradient.

        ``input_gradient`` is the gradient arriving from the next layer, i.e.
        the gradient with respect to this layer's output.
        """
        upstream = input_gradient
        # The bias is added directly to the output, so its gradient is the upstream gradient itself.
        self.bias_gradient = upstream
        # Entry (i, j) is input[i] * upstream[j], matching the shape of self.weights.
        self.weights_gradient = np.outer(self.input, upstream)
        self.input_gradient = np.dot(self.weights, upstream)
        return self.input_gradient

In [22]:
# Smoke test of MultiplyLayer on the first training sample: one forward pass,
# then one backward pass.
x0 = X[0]
mul_layer = MultiplyLayer(4, 6)

mul_layer.forward(x0)
print("Outputs\n", mul_layer.output)

# No loss is attached here, so a random length-6 vector stands in for the
# gradient that would arrive from the next layer.
mul_layer.backward(np.random.normal(size=6))
print("\nWeights Gradient\n", mul_layer.weights_gradient)
print("\nBias Gradient\n", mul_layer.bias_gradient)
print("\nInput Gradient\n", mul_layer.input_gradient)

Outputs
 [ 0.619703   -0.44433063 -0.7612551   1.8444473   9.63271164 -5.92856361]

Weights Gradient
 [[-12.34079642   2.71921685   9.72436205  -3.79048994  -5.17016929
  -10.04597085]
 [ -8.46917402   1.86612921   6.6735818   -2.60131663  -3.5481554
   -6.89429372]
 [ -3.38766961   0.74645168   2.66943272  -1.04052665  -1.41926216
   -2.75771749]
 [ -0.4839528    0.10663595   0.38134753  -0.14864666  -0.20275174
   -0.39395964]]

Bias Gradient
 [-2.419764    0.53317977  1.90673766 -0.74323332 -1.01375868 -1.96979821]

Input Gradient
 [ 0.1965082  -0.62158196 -2.04255726  7.95507526]


In [23]:
class ReLU(Layer):
    """Element-wise rectified linear unit: ``output = max(0, input)``."""

    def __init__(self):
        pass

    def forward(self, input):
        """Cache ``input``, pre-compute the local gradient and return max(0, input).

        The local gradient is 1.0 where ``input > 0`` and 0.0 elsewhere
        (including at exactly 0).
        """
        self.input = input
        # A vectorised comparison replaces np.vectorize(lambda ...), which runs
        # one Python call per element and raises on size-0 input.
        self.local_gradient = (np.asarray(self.input) > 0).astype(float)
        self.output = np.maximum(np.zeros(input.shape), input)
        return self.output

    def backward(self, input_gradient):
        """Return the upstream gradient masked by the local gradient from forward()."""
        # Positions clamped to zero in the forward pass also get a zero gradient.
        self.gradient = self.local_gradient * input_gradient
        return self.gradient

In [24]:
# Feed the output of the MultiplyLayer demo cell through a ReLU and show both
# the activation and the 0/1 local gradient it caches.
z1 = mul_layer.output
relu = ReLU()

relu.forward(z1)
print("Outputs\n", relu.output)
print("Local Gradient\n", relu.local_gradient)

Outputs
 [0.619703   0.         0.         1.8444473  9.63271164 0.        ]
Local Gradient
 [1. 0. 0. 1. 1. 0.]


In [25]:
class Softmax(Layer):
    """Softmax over a single 1-D vector of scores."""

    def __init__(self):
        pass

    def forward(self, input):
        """Cache ``input`` and return ``exp(input) / sum(exp(input))``.

        The maximum score is subtracted before exponentiating. Softmax is
        invariant to adding a constant to every score, so the result is the
        same, but np.exp can no longer overflow to inf (which would give nan).
        """
        self.input = input
        # Guard keeps the degenerate empty vector working (np.max raises on size 0).
        shifted = self.input - np.max(self.input) if np.size(self.input) else self.input
        input_exp = np.exp(shifted)
        self.output = input_exp / np.sum(input_exp)
        return self.output

    def backward(self, input_gradient):
        """Multiply the softmax Jacobian by the upstream gradient and return the result.

        The Jacobian has ``s[i] - s[i]**2`` on the diagonal and ``-s[i] * s[j]``
        off the diagonal, i.e. ``diag(s) - outer(s, s)`` for output ``s``.
        It is stored in ``self.local_gradient``.
        """
        self.local_gradient = np.diag(self.output) - np.outer(self.output, self.output)
        self.gradient = np.dot(self.local_gradient, input_gradient)
        return self.gradient

In [26]:
# Apply softmax to z1, the same pre-activation vector used in the ReLU demo cell.
softmax = Softmax()

softmax.forward(z1)
print("Softmax\n", softmax.output)

Softmax
 [1.21740642e-04 4.20079661e-05 3.05980046e-05 4.14319409e-04
 9.99391160e-01 1.74405242e-07]


In [27]:
class CrossEntropyLoss(Layer):
    """Cross-entropy between a predicted probability vector and a one-hot label."""

    def __init__(self, eps=1e-12):
        """``eps`` is the floor applied to predicted probabilities before log/divide."""
        self.eps = eps

    def forward(self, input, label):
        """Cache the arguments and return ``-sum(label * log(input))``.

        Probabilities are floored at ``self.eps`` first. Without the floor, a
        probability that underflows to exactly 0 gives ``log(0) = -inf``, and
        ``0 * -inf`` is nan even for classes whose label is 0.
        """
        self.input = input
        self.label = label
        self._safe_input = np.maximum(self.input, self.eps)
        self.loss = -1 * np.dot(self.label, np.log(self._safe_input))
        return self.loss

    def backward(self):
        """Return the gradient of the loss w.r.t. the prediction: ``-label / input``.

        Uses the floored probabilities from forward(), so ``0 / 0`` cannot occur.
        """
        self.gradient = -1 * np.divide(self.label, self._safe_input)
        return self.gradient

In [28]:
# Forward pass for one training sample through affine layer -> softmax -> cross-entropy.
# NOTE: this rebinds the global `softmax` created in an earlier cell.
loss = CrossEntropyLoss()
softmax = Softmax()
layer1 = MultiplyLayer(4, 3)

print("X\n", X[0])
print("\ny\n", Y[0])

layer1.forward(X[0])
print("\nLayer 1 Output\n", layer1.output)
print("\nLayer 1 Weights\n", layer1.weights)

softmax.forward(layer1.output)
print("\nPrediction\n", softmax.output)

loss.forward(softmax.output, Y[0])
print("\nLoss\n", loss.loss)

X
 [5.1 3.5 1.4 0.2]

y
 [1. 0. 0.]

Layer 1 Output
 [ 6.31501263 -0.54413039 -9.69303735]

Layer 1 Weights
 [[ 0.93308676 -1.29940751 -0.28834114]
 [ 0.83285756  0.87808795 -1.73947774]
 [-0.88856011  1.17889788 -1.43174667]
 [ 1.73589996  1.71314613  0.04210699]]

Prediction
 [9.98951176e-01 1.04871215e-03 1.11515823e-07]

Loss
 0.0010493740636568147


In [29]:
# Backward pass for the objects created in the previous cell, in reverse order:
# loss -> softmax -> affine layer.
loss.backward()
print("\nGradient of Loss w.r.t Prediction\n", loss.gradient)

softmax.backward(loss.gradient)
print("\nLocal Gradient of Softmax Output w.r.t Layer1 Output\n", softmax.local_gradient)
print("\nGradient of Loss w.r.t Layer1 Output\n", softmax.gradient)

layer1.backward(softmax.gradient)
print("\nWeights Gradient\n", layer1.weights_gradient)
# "\n" replaces the invalid escape "\B", which printed a literal backslash.
print("\nBias Gradient\n", layer1.bias_gradient)


Gradient of Loss w.r.t Prediction
 [-1.00104992 -0.         -0.        ]

Local Gradient of Softmax Output w.r.t Layer1 Output
 [[ 1.04772363e-03 -1.04761223e-03 -1.11398862e-07]
 [-1.04761223e-03  1.04761235e-03 -1.16947998e-10]
 [-1.11398862e-07 -1.16947998e-10  1.11515810e-07]]

Gradient of Loss w.r.t Layer1 Output
 [-1.04882366e-03  1.04871215e-03  1.11515823e-07]

Weights Gradient
 [[-5.34900068e-03  5.34843195e-03  5.68730695e-07]
 [-3.67088282e-03  3.67049252e-03  3.90305379e-07]
 [-1.46835313e-03  1.46819701e-03  1.56122152e-07]
 [-2.09764733e-04  2.09742429e-04  2.23031645e-08]]
\Bias Gradient
 [-1.04882366e-03  1.04871215e-03  1.11515823e-07]


In [30]:
class NeuralNetwork():
    """Fixed 4-6-4-3 classifier that processes and trains on one sample at a time."""

    def __init__(self):
        """Create the layer stack (affine + ReLU twice, then affine + softmax) and the loss."""
        first_hidden = [MultiplyLayer(4, 6), ReLU()]
        second_hidden = [MultiplyLayer(6, 4), ReLU()]
        head = [MultiplyLayer(4, 3), Softmax()]
        self.layers = first_hidden + second_hidden + head
        self.loss_func = CrossEntropyLoss()

    def forward(self, x, y=None, logging=False):
        """Run ``x`` through every layer and return the final output.

        When ``y`` is given, the loss is also computed and stored in
        ``self.loss``. ``logging=True`` prints the input and each layer's output.
        """
        if logging:
            print("Input: ", x, "Label: ", y)

        activation = x
        for layer in self.layers:
            activation = layer.forward(activation)
            if logging:
                print("\n", type(layer), "\nOutput: ", activation, "\nShape: ", activation.shape)

        if y is None:
            return activation

        self.loss = self.loss_func.forward(activation, y)
        if logging:
            print("\nLoss: ", self.loss)
        return activation

    def backward(self, logging=False):
        """Propagate the loss gradient through the layers in reverse order.

        Relies on a preceding ``forward`` call with a label; each layer stores
        its own gradients. Returns None.
        """
        grad = self.loss_func.backward()
        if logging:
            print("dL/dpred: ", grad)

        for layer in reversed(self.layers):
            grad = layer.backward(grad)
            if logging:
                print("\n", type(layer), "\nGradient: ", grad)

    def SGD(self, X, y, epochs, logging=False, lr=0.01):
        """Train with one parameter update per sample, visiting samples in index order.

        After each epoch, ``logging=True`` prints the accuracy measured on the
        same ``X``/``y`` that were used for training.
        """
        n_samples = len(X)
        for e in range(epochs):
            for i in range(n_samples):
                self.forward(X[i], y[i])
                self.backward()
                # Only the affine layers carry trainable parameters.
                trainable = (layer for layer in self.layers if isinstance(layer, MultiplyLayer))
                for layer in trainable:
                    layer.weights -= lr * layer.weights_gradient
                    layer.bias -= lr * layer.bias_gradient

            if not logging:
                continue

            n_accurate = sum(
                1
                for i in range(n_samples)
                if np.argmax(self.forward(X[i])) == np.argmax(y[i])
            )
            print("Epoch ", e, "\n", "-"*50, "\n")
            print("Accuracy: ", n_accurate/len(X))

In [31]:
# Fresh per-sample network; its affine layers draw random initial weights on construction.
nn = NeuralNetwork()

In [32]:
# 100 epochs of per-sample updates with lr=0.001; logging prints the accuracy
# on the training data after every epoch.
nn.SGD(X, Y, 100, logging=True, lr=.001)

Epoch  0 
 -------------------------------------------------- 

Accuracy:  0.325
Epoch  1 
 -------------------------------------------------- 

Accuracy:  0.3333333333333333
Epoch  2 
 -------------------------------------------------- 

Accuracy:  0.3333333333333333
Epoch  3 
 -------------------------------------------------- 

Accuracy:  0.3333333333333333
Epoch  4 
 -------------------------------------------------- 

Accuracy:  0.3333333333333333
Epoch  5 
 -------------------------------------------------- 

Accuracy:  0.3333333333333333
Epoch  6 
 -------------------------------------------------- 

Accuracy:  0.3333333333333333
Epoch  7 
 -------------------------------------------------- 

Accuracy:  0.3333333333333333
Epoch  8 
 -------------------------------------------------- 

Accuracy:  0.3333333333333333
Epoch  9 
 -------------------------------------------------- 

Accuracy:  0.3333333333333333
Epoch  10 
 -------------------------------------------------- 

Accuracy

In [33]:
# Accuracy of the per-sample network on the held-out rows: a prediction counts
# as correct when the arg-max of the output matches the arg-max of the one-hot label.
X_test, Y_test = process(test)
n_accurate = sum(
    1
    for features, target in zip(X_test, Y_test)
    if np.argmax(nn.forward(features)) == np.argmax(target)
)
print("Test Accuracy: ", n_accurate/len(X_test))

Test Accuracy:  0.3333333333333333


In [34]:
class MultiplyLayer2(Layer):
    """Batched affine layer: maps (batch_size, n_features) to (batch_size, n_units)."""

    def __init__(self, n_features, n_units):
        """Draw weights (n_features, n_units) and a bias (n_units,) from a standard normal."""
        weight_shape = (n_features, n_units)
        self.weights = np.random.normal(size=weight_shape)
        self.bias = np.random.normal(size=n_units)  # broadcast onto every row in forward()

    def forward(self, input):
        """Cache ``input`` plus the local derivatives and return ``input @ weights + bias``.

        ``input`` is a matrix of shape (batch_size, n_features); the returned
        matrix has shape (batch_size, n_units).
        """
        self.input = input
        # Local derivatives consumed by backward().
        self.local_weights_gradient = self.input
        self.local_input_gradient = self.weights
        # Scalar stand-in for an identity Jacobian per sample.
        self.local_bias_gradient = 1
        affine = np.dot(self.input, self.weights)
        self.output = affine + self.bias  # NumPy broadcasts the bias across rows
        return self.output

    def backward(self, upstream_gradient):
        """Store per-sample gradients for weights and bias; return the input gradient.

        ``upstream_gradient`` has shape (batch_size, n_units). The stored
        ``weights_gradient`` has shape (batch_size, n_features, n_units) and is
        not reduced over the batch here.
        """
        # Per-sample version of np.dot(identity, upstream row).
        self.bias_gradient = self.local_bias_gradient * upstream_gradient
        # Per-sample version of np.outer(input row, upstream row).
        self.weights_gradient = np.einsum(
            'ij,ik->ijk', self.local_weights_gradient, upstream_gradient
        )
        # Per-sample version of np.dot(weights, upstream row).
        self.input_gradient = np.einsum(
            'jk,ik->ij', self.local_input_gradient, upstream_gradient
        )
        return self.input_gradient

In [35]:
# Shape check for the batched affine layer using the whole training set as one batch.
batch_size = len(X)

batch_layer1 = MultiplyLayer2(4, 6)
batch_layer1.forward(X)
print("Forward Pass Output: \n", batch_layer1.output.shape)

# Random stand-in for the gradient that would arrive from the next layer.
example_upstream = np.random.normal(size=(batch_size, 6))
batch_layer1.backward(example_upstream)
print("Backward Pass Weight Gradient: \n", batch_layer1.weights_gradient.shape)
print("Backward Pass Bias Gradient: \n", batch_layer1.bias_gradient.shape)
print("Backward Pass Input Gradient: \n", batch_layer1.input_gradient.shape)

Forward Pass Output: 
 (120, 6)
Backward Pass Weight Gradient: 
 (120, 4, 6)
Backward Pass Bias Gradient: 
 (120, 6)
Backward Pass Input Gradient: 
 (120, 4)


In [36]:
class ReLU2(Layer):
    """Element-wise rectified linear unit for batched (2-D) inputs."""

    def __init__(self):
        pass

    def forward(self, input):
        """Cache ``input``, pre-compute the local gradient and return max(0, input).

        The local gradient is 1.0 where ``input > 0`` and 0.0 elsewhere
        (including at exactly 0). It has the same shape as ``input``.
        """
        self.input = input
        # The comparison is element-wise for any array rank. It replaces
        # np.vectorize(lambda ...), which runs one Python call per element and
        # raises on size-0 input.
        self.local_gradient = (np.asarray(self.input) > 0).astype(float)
        self.output = np.maximum(np.zeros(input.shape), input)
        return self.output

    def backward(self, input_gradient):
        """Return the upstream gradient multiplied element-wise by the local gradient."""
        self.gradient = self.local_gradient * input_gradient
        return self.gradient

In [37]:
# Shape check for the batched ReLU on the output of the batched affine demo.
z = batch_layer1.output
batch_relu = ReLU2()
batch_relu.forward(z)
print("ReLU Forward Output: \n", batch_relu.output.shape)

# Random stand-in for the gradient that would arrive from the next layer.
example_upstream = np.random.normal(size=(batch_size, 6))
batch_relu.backward(example_upstream)
print("ReLU Backward Gradient: \n", batch_relu.gradient.shape)

ReLU Forward Output: 
 (120, 6)
ReLU Backward Gradient: 
 (120, 6)


In [38]:
class Softmax2(Layer):
    """Row-wise softmax for a batch of score vectors."""

    def __init__(self):
        pass

    def forward(self, input):
        """Return the row-wise softmax of ``input`` and cache each row's Jacobian.

        ``input`` has shape (n, n_units) -- 120 by 3 in this notebook's example.
        ``self.local_gradient`` gets shape (n, n_units, n_units); for row ``i``
        with output ``s`` it holds ``s[j] * (1 - s[j])`` on the diagonal and
        ``-s[j] * s[k]`` elsewhere.
        """
        self.input = input
        # Subtracting each row's maximum leaves the softmax unchanged but keeps
        # np.exp from overflowing to inf (which would give nan after division).
        shifted = self.input - np.max(self.input, axis=1, keepdims=True)
        exp_shifted = np.exp(shifted)
        self.output = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)

        # Start from -outer(s, s) for every row, then add s on the diagonal,
        # giving s[j] - s[j]**2 there. This replaces a triple Python loop.
        self.local_gradient = -np.einsum('ij,ik->ijk', self.output, self.output)
        diag = np.arange(self.output.shape[1])
        self.local_gradient[:, diag, diag] += self.output
        return self.output

    def backward(self, input_gradient):
        """Return each row's Jacobian multiplied by that row's upstream gradient.

        ``input_gradient`` has shape (n, n_units), and so does the result.
        """
        self.gradient = np.einsum('ijk,ik->ij', self.local_gradient, input_gradient)
        return self.gradient

In [39]:
# Shape check for the batched softmax on random scores (3 columns, one per class).
z = np.random.normal(size=(batch_size, 3))
batch_softmax = Softmax2()
batch_softmax.forward(z)
print("Softmax Forward Output: \n", batch_softmax.output.shape)

# Random stand-in for the gradient that would arrive from the loss.
example_upstream = np.random.normal(size=(batch_size, 3))
batch_softmax.backward(example_upstream)
print("Softmax Backward Gradient: \n", batch_softmax.gradient.shape)

Softmax Forward Output: 
 (120, 3)
Softmax Backward Gradient: 
 (120, 3)


In [40]:
class CrossEntropyLoss2(Layer):
    """Per-sample cross-entropy for a batch of predictions and one-hot labels."""

    def __init__(self, eps=1e-12):
        """``eps`` is the floor applied to predicted probabilities before log/divide."""
        self.eps = eps

    def forward(self, input, label):
        """Return one loss value per row and cache the gradient for backward().

        ``input`` is the (batch_size, n_classes) softmax output and ``label``
        the matching one-hot matrix. Probabilities are floored at ``self.eps``
        so that a value underflowing to 0 cannot produce ``0 * log(0)`` or
        ``0 / 0``, both of which are nan.
        """
        self.input = input
        self.label = label
        safe_input = np.maximum(self.input, self.eps)
        # Row-wise dot product of label and log-probability.
        self.loss = -1 * np.einsum('ij,ij->i', self.label, np.log(safe_input))
        self.local_gradient = -1 * (self.label / safe_input)
        return self.loss

    def backward(self):
        """Return the gradient of each row's loss w.r.t. its prediction (computed in forward)."""
        return self.local_gradient

In [41]:
# Shape check for the batched loss, using the softmax demo output as predictions
# and the training labels Y as targets.
pred = batch_softmax.output
batch_cross_entropy = CrossEntropyLoss2()
batch_cross_entropy.forward(pred, Y)
print("Cross Entropy Loss: \n", batch_cross_entropy.loss.shape)
print("Cross Entropy Backward Gradient: \n", batch_cross_entropy.backward().shape)

Cross Entropy Loss: 
 (120,)
Cross Entropy Backward Gradient: 
 (120, 3)


In [54]:
class NeuralNetwork2():
    """Fixed 4-6-4-3 classifier built from the batched layers, trained with mini-batches."""

    def __init__(self):
        """Create the layer stack (affine + ReLU twice, then affine + softmax) and the loss."""
        self.layers = [
            MultiplyLayer2(4, 6),
            ReLU2(),
            MultiplyLayer2(6, 4),
            ReLU2(),
            MultiplyLayer2(4, 3),
            Softmax2()
        ]
        self.loss_func = CrossEntropyLoss2()

    def forward(self, X, Y=None, logging=False):
        """Run the batch ``X`` through every layer and return the final output.

        When ``Y`` is given, the per-sample loss is also computed and stored in
        ``self.loss``. ``logging=True`` prints the shapes seen along the way.
        """
        Z = X
        if logging:
            # .shape is an attribute, not a method, and Y is optional.
            label_shape = None if Y is None else Y.shape
            print("Input Shape: ", X.shape, " Label Shape: ", label_shape)

        for layer in self.layers:
            Z = layer.forward(Z)
            if logging:
                print("\n", type(layer), "\nOutput Shape: ", Z.shape)

        if Y is not None:
            self.loss = self.loss_func.forward(Z, Y)
            if logging:
                print("\nLoss Shape: ", self.loss.shape)

        return Z

    def backward(self, logging=False):
        """Propagate the loss gradient through the layers in reverse order.

        Relies on a preceding ``forward`` call with labels; each layer stores
        its own per-sample gradients. Returns None.
        """
        dZ = self.loss_func.backward()
        if logging:
            print("dL/dpred shape: ", dZ.shape)

        for layer in self.layers[::-1]:
            dZ = layer.backward(dZ)
            if logging:
                print("\n", type(layer), "\nGradient Shape: ", dZ.shape)

    def batch_gd(self, X, Y, epochs, batch_size=None, logging=False, lr=0.01):
        """Train with mini-batch gradient descent.

        Batches are consecutive slices of ``X``/``Y`` in their given order
        (no shuffling). ``batch_size=None`` uses the whole data set as a single
        batch. After each epoch, ``logging=True`` prints the accuracy measured
        on the same ``X``/``Y``.
        """
        n = len(X)
        if batch_size is None:
            batch_size = n
        for e in range(epochs):
            if logging:
                print("Epoch ", e, "\n", "-"*50, "\n")

            for i in range(0, n, batch_size):
                # Slicing clamps at the end of the array, so the last batch may be shorter.
                X_batch, Y_batch = X[i:i + batch_size], Y[i:i + batch_size]
                n_batch = len(X_batch)
                self.forward(X_batch, Y_batch)
                self.backward()
                for layer in self.layers:
                    if isinstance(layer, MultiplyLayer2):
                        # Average over the samples actually in this batch; dividing
                        # by the nominal batch_size would shrink the step for a
                        # short final batch.
                        layer.weights -= lr * np.sum(layer.weights_gradient, axis=0) / n_batch
                        layer.bias -= lr * np.sum(layer.bias_gradient, axis=0) / n_batch

            if logging:
                preds = np.argmax(self.forward(X), axis=1)
                labels = np.argmax(Y, axis=1)
                accuracy = np.mean(np.equal(preds, labels))
                print("Accuracy: ", accuracy)

In [58]:
# Fresh batched network trained for 500 epochs with mini-batches of 32 and lr=0.03;
# logging prints the accuracy on the training data after every epoch.
nn2 = NeuralNetwork2()
nn2.batch_gd(X, Y, 500, batch_size=32, logging=True, lr=0.03)

Epoch  0 
 -------------------------------------------------- 

Accuracy:  0.3333333333333333
Epoch  1 
 -------------------------------------------------- 

Accuracy:  0.43333333333333335
Epoch  2 
 -------------------------------------------------- 

Accuracy:  0.675
Epoch  3 
 -------------------------------------------------- 

Accuracy:  0.6833333333333333
Epoch  4 
 -------------------------------------------------- 

Accuracy:  0.6916666666666667
Epoch  5 
 -------------------------------------------------- 

Accuracy:  0.7
Epoch  6 
 -------------------------------------------------- 

Accuracy:  0.7
Epoch  7 
 -------------------------------------------------- 

Accuracy:  0.725
Epoch  8 
 -------------------------------------------------- 

Accuracy:  0.7416666666666667
Epoch  9 
 -------------------------------------------------- 

Accuracy:  0.7666666666666667
Epoch  10 
 -------------------------------------------------- 

Accuracy:  0.7666666666666667
Epoch  11 
 --------

In [59]:
# Accuracy of the batched network on the held-out rows, computed in one forward pass.
X_test, Y_test = process(test)
test_probabilities = nn2.forward(X_test)
preds = np.argmax(test_probabilities, axis=1)
labels = np.argmax(Y_test, axis=1)
# Fraction of rows whose predicted class equals the labelled class.
accuracy = np.mean(preds == labels)
print("Accuracy: ", accuracy)

Accuracy:  0.9666666666666667
