In [1]:
import numpy as np
import time

In [2]:
def normalise(v):
    return (v - np.mean(v))/np.std(v)

In [3]:
def softmax(v):
    v = v - np.max(v, axis = 1).reshape(-1,1)
    exp_array = np.exp(v)
    return exp_array / np.sum(exp_array, axis = 1).reshape(-1,1)

In [4]:
def create_filter(n, s):
      return np.zeros((n, s, s))

In [5]:
def relu(map):    
    output_tensor = np.maximum(map, 0)
    return output_tensor

In [6]:
def add_pad(sample, pad):
    n_sample, n, h, w = sample.shape
    
    out_sample = np.zeros((n_sample, n,h+2*pad,w+2*pad))
    
    out_sample[:, :, pad: -pad, pad : -pad] = sample
    return out_sample

In [7]:
def pooling(map, dim):
    n_sample = map.shape[0]
    n_channels = map.shape[1]
    map_size = map.shape[2]
    output_size = map.shape[2]//dim

    return map.reshape(n_sample, n_channels, output_size, dim, output_size, dim).max(axis=(3,5))

In [8]:
def convolution(sample, kernel):
    kernel_size = kernel.shape[1]
    pad = kernel_size//2
    n_sample, n, h, w = sample.shape

    sample = add_pad(sample, pad)
    
    size_feature_map = h
    n_out_channels = kernel.shape[0]

    output_tensor = np.zeros((n_sample, n_out_channels, size_feature_map, size_feature_map))

    for i in range(n_out_channels):
        current_kernel = kernel[i]

        for r in range(size_feature_map):
            for c in range(size_feature_map):
                window = sample[:, :, r : r + kernel_size, c : c + kernel_size]
                value = np.sum(window*current_kernel, axis = (1,2,3))
                output_tensor[:, i, r, c] = value

    return output_tensor

In [9]:
def other_convolution(sample, kernel, pad):
    kernel_size = kernel.shape[1]
    n_sample, n, h, w = sample.shape
    sample = add_pad(sample, pad)
    
    size_feature_map = h + 2*pad - kernel_size + 1
    n_out_channels = kernel.shape[0]

    output_tensor = np.zeros((n_sample, n_out_channels, size_feature_map, size_feature_map))

    for i in range(n_out_channels):
        current_kernel = kernel[i]

        for r in range(size_feature_map):
            for c in range(size_feature_map):
                window = sample[:, :, r : r + kernel_size, c : c + kernel_size]
                value = np.sum(window*current_kernel, axis = (1,2,3))
                output_tensor[:, i, r, c] = value

    return output_tensor

In [10]:
def another_convolution(sample, kernel, pad):
    kernel_size = kernel.shape[1]
    n, h, w = sample.shape
    sample = add_pad(sample, pad)
    
    size_feature_map = h + 2*pad - kernel_size + 1
    n_out_channels = kernel.shape[0]

    output_tensor = np.zeros((n_sample, n_out_channels, size_feature_map, size_feature_map))

    for i in range(n_out_channels):
        current_kernel = kernel[i]

        for r in range(size_feature_map):
            for c in range(size_feature_map):
                window = sample[:, :, r : r + kernel_size, c : c + kernel_size]
                value = np.sum(window*current_kernel, axis = (1,2,3))
                output_tensor[:, i, r, c] = value

    return output_tensor

In [11]:
def full_convolve2d(sample, kernel):
    n, h, w = sample.shape
    kernel_size = kernel.shape[1]
    pad = kernel_size//2
    
    out_sample = np.zeros((n, h, w))
    
    for k in range(n):
        curr_sample = sample[k]
        padded_sample = np.pad(curr_sample, ((pad,pad), (pad,pad)), 'constant')
        # print(padded_sample)

        for i in range(h):
            for j in range(w):
                out_sample[k, i, j] = np.sum(kernel * padded_sample[i:i+kernel_size, j:j+kernel_size])
    
    return out_sample

In [12]:
a = np.array([[[1,2], [3,4]], [[1,2], [3,4]], [[1,2], [3,4]]])
b = np.array([[[0,-1], [9,10]], [[0,-1], [9,10]], [[0,-1], [9,10]]])
full_convolve2d(a, b)

array([[[ 30.,  87.],
        [ 87., 195.]],

       [[ 30.,  87.],
        [ 87., 195.]],

       [[ 30.,  87.],
        [ 87., 195.]]])

In [13]:
class conv2d:
    def __init__(self, num, size):
        self.kernel = np.random.randn(num, size, size)/9 # Xavier Initialisation
        self.bias = np.random.rand(num, size, size)/9
        self.layer_input = None
        self.layer_output = None
        self.layer_activated = None
        self.kernel_grad = None
        self.size = size
    
    def forward_pass(self, sample): # DONE
        self.layer_input = sample

        output_tensor = convolution(sample, self.kernel)
    
        self.layer_output = output_tensor
        self.layer_activated = relu(output_tensor) # relu is totally fine
        
        return self.layer_activated # return the result tensor
        
    def backward_pass(self, inp_grad): # DONE
        n_sample, n, h, w = self.layer_input.shape
        pass_grad = np.zeros((n_sample, n, h, w))
        
        relu_mat = self.layer_activated
        relu_mat[np.nonzero(relu_mat)] = 1
        
        inp_grad = inp_grad * relu_mat # must be dimensionally equivalent
        
        # other_grad = np.sum(inp_grad, axis = 0)/32
        nk, hk, wk = self.kernel.shape
        kernel_grad = np.zeros((n_sample, nk, hk, wk))
        for i in range(n_sample):
            curr_grad = inp_grad[i]
            kernel_grad[i] = other_convolution(self.layer_input[i].reshape(1, n, h, w), curr_grad, self.size//2)[0]
        self.kernel_grad = np.sum(kernel_grad, axis = 0)/32
        
        pass_grad = np.zeros((n_sample, n, h, w))
        flip_kernel = np.flip(self.kernel, axis=(1,2))
        
        for i in range(n_sample):
            curr_grad = inp_grad[i]    
            not_final = full_convolve2d(curr_grad, flip_kernel)
            still_not_final = np.sum(not_final, axis = 0)
            for j in range(n):
                pass_grad[i, j] = still_not_final
        
        return pass_grad
        
    def update(self): # DONE
        # print(self.kernel_grad.shape)
        self.kernel -= 0.01*self.kernel_grad # applying gradient descent
        return None
        

In [14]:
class maxpool2d:
    def __init__(self, dim):
        self.dim = dim
        self.layer_input = None
        self.layer_output = None
        
    def forward_pass(self , sample):
        self.layer_input = sample
        self.layer_output = pooling(sample, 2)
        return self.layer_output

    def backward_pass(self, inp_grad):
        n_sample, n, h, w = self.layer_input.shape
        x = self.layer_input
        
        pass_mat = np.zeros((n_sample, n, h, w))
        
        for i in range(h//2):
            for j in range(w//2):
                h_start = i * 2
                h_end = h_start + 2
                w_start = j * 2
                w_end = w_start + 2
                
                X = self.layer_input
                X_pool = X[:, :, h_start:h_end, w_start:w_end]
                mask = (X_pool == np.max(X_pool, axis=(2, 3))[:, :, None, None])
                pass_mat[:, :, h_start:h_end, w_start:w_end] += mask * (inp_grad[:, :, i, j])[:, :, None, None]
            
        return pass_mat

In [25]:
class fc_1:
    def __init__(self, size, next_size, batch_size = 32):
        self.weights = np.random.randn(next_size, size)/4096
        self.bias = np.random.randn(1, next_size)/4096
        self.weights_grad = None
        self.bias_grad = None
        self.layer_input = None
        self.layer_output = None
        self.layer_output_active = None
        self.batch_size = batch_size
        
    def forward_pass(self, sample):
        self.layer_input = sample
        
        output = sample @ self.weights.T + self.bias # this is fine
        self.layer_output = output
        self.layer_output_active = relu(output)
        
        return self.layer_output_active
    
    def backward_pass(self, inp_grad):
        relu_mat = self.layer_output_active
        relu_mat[np.nonzero(relu_mat)] = 1
        relued_grad = inp_grad * relu_mat
        
        # print("here", relued_grad.shape)
        
        pass_grad = relued_grad @ self.weights
        
        weights_grad = np.zeros((self.batch_size, 64, 4096))
        for i in range(self.batch_size):
            weights_grad[i] = relued_grad[i].reshape(-1,1) @ self.layer_input[i].reshape(1,-1)
        self.weights_grad = np.sum(weights_grad, axis = 0)/self.batch_size
        
        self.bias_grad = np.sum(relued_grad, axis = 0)/self.batch_size
        
        return pass_grad
    
    def update(self):
        self.weights -= 0.01*self.weights_grad
        self.bias -= 0.01*self.bias_grad

In [26]:
class fc_2:
    def __init__(self, size, next_size, batch_size = 32):
        self.weights = np.random.randn(next_size, size)/64
        self.bias = np.random.randn(1, next_size)/64
        self.weights_grad = None
        self.bias_grad = None
        self.layer_input = None
        self.layer_output = None
        self.batch_size = batch_size
        
    def forward_pass(self, sample):
        self.layer_input = sample
        output = sample @ self.weights.T + self.bias
        self.layer_output = output
        return self.layer_output
    
    def backward_pass(self, inp_grad):
        pass_grad = inp_grad @ self.weights
        
        weights_grad = np.zeros((self.batch_size, 10, 64))
        for i in range(self.batch_size):
            weights_grad[i] = inp_grad[i].reshape(-1,1) @ self.layer_input[i].reshape(1,-1)
        self.weights_grad = np.sum(weights_grad, axis = 0)/self.batch_size
        
        self.bias_grad = np.sum(inp_grad, axis = 0)/self.batch_size
        
        return pass_grad
    
    def update(self):
        self.weights -= 0.01*self.weights_grad
        self.bias -= 0.01*self.bias_grad
    

In [17]:
weights_grad = np.zeros((2, 4, 3))
inp_grad = np.array([[1,2,3,4], [5,6,7,8]])
layer_input = np.array([[9,8,7], [-2,-3,-4]])
print(inp_grad[0].reshape(-1,1).shape)

for i in range(2):
    weights_grad[i] = inp_grad[i].reshape(-1,1) @ layer_input[i].reshape(1,-1)
alpha = np.sum(weights_grad, axis = 0)
np.sum(inp_grad, axis = 0).reshape(-1,1)

(4, 1)


array([[ 6],
       [ 8],
       [10],
       [12]])

In [18]:
a = np.array([[1,2], [4,5]])
np.sum(a, axis = 0)

array([5, 7])

In [19]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [20]:
set1 = unpickle("./content/data_batch_1")
set2 = unpickle("./content/data_batch_2")
set3 = unpickle("./content/data_batch_3")
set4 = unpickle("./content/data_batch_4")
set5 = unpickle("./content/data_batch_5")
x_train = np.vstack((set1[b'data'], set2[b'data'], set3[b'data'], set4[b'data'], set5[b'data']))
y_train = np.hstack((np.array(set1[b'labels']), np.array(set2[b'labels']), np.array(set3[b'labels']), np.array(set4[b'labels']) ,np.array(set5[b'labels']) ))
x_train.shape, y_train.shape

((50000, 3072), (50000,))

In [21]:
x_trial = []
y_trial = y_train[0:32]
for i in range(32):
    x_trial.append(x_train[i].reshape(3,32,32))
x_trial = np.array(x_trial)
y_trial = np.array(y_trial)

In [231]:
conv1 = conv2d(32, 3)
pool1 = maxpool2d(2)
conv2 = conv2d(64, 5)
pool2 = maxpool2d(2)
conv3 = conv2d(64, 3)
fc1 = fc_1(4096, 64)
fc2 = fc_2(64, 10)

In [213]:
img1 = x_train[0:32].reshape(32,3,32,32)
y = y_train[0:32]

In [214]:
batch_size = 32

a1 = conv1.forward_pass(img1)
print("a1: ", a1.shape)
# print(a1[0])

a2 = pool1.forward_pass(a1)
print("a2: ", a2.shape)
# print(a2[0])

a3 = conv2.forward_pass(a2)
print("a3: ", a3.shape)
# print(a3[0])

a4 = pool2.forward_pass(a3)
print("a4: ", a4.shape)
# print(a4[0])

a5 = conv3.forward_pass(a4).reshape(batch_size, 4096)
print("a5: ", a5.shape)
# print(a5[0])

a6 = fc1.forward_pass(a5)
print("a6: ", a6.shape)
# print(a6[0])

a7 = fc2.forward_pass(a6)
print("a7: ", a7.shape)
# print(a7[0])

out = softmax(a7)
print("softmax: ", out.shape)

onehot = np.zeros((batch_size, 10))
for i in range(batch_size):
    onehot[i][y[i]] = 1
grad_fc2 = out - onehot
print("grad_fc2: ", grad_fc2.shape)

grad_fc1 = fc2.backward_pass(grad_fc2)
print("grad_fc1: ", grad_fc1.shape)

a1:  (32, 32, 32, 32)
a2:  (32, 32, 16, 16)
a3:  (32, 64, 16, 16)
a4:  (32, 64, 8, 8)
a5:  (32, 4096)
a6:  (32, 64)
a7:  (32, 10)
softmax:  (32, 10)
grad_fc2:  (32, 10)
grad_fc1:  (32, 64)


In [232]:
def run(x, y):
    a1 = conv1.forward_pass(x)
    a2 = pool1.forward_pass(a1)
    a3 = conv2.forward_pass(a2)
    a4 = pool2.forward_pass(a3)
    
    # cannot flatten this directly at this position
    a5 = conv3.forward_pass(a4).reshape(32, 4096)
    # print(a5.shape)
    a6 = fc1.forward_pass(a5)
    a7 = fc2.forward_pass(a6)
    out = softmax(a7)
    # print(out.shape)
    # print(i, "actual: ", y[i],  "prediction: ", np.argmax(out))

    onehot = np.zeros(10)
    onehot[y] = 1
    grad_fc2 = out - onehot
    # print(grad_fc2.shape)

    grad_fc1 = fc2.backward_pass(grad_fc2)
    print(grad_fc1.shape)
    grad_conv3 = fc1.backward_pass(grad_fc1).reshape(32, 64, 8, 8)
    grad_pool2 = conv3.backward_pass(grad_conv3)
    grad_conv2 = pool2.backward_pass(grad_pool2)
    grad_pool1 = conv2.backward_pass(grad_conv2)
    grad_conv1 = pool1.backward_pass(grad_pool1)
    init_grad = conv1.backward_pass(grad_conv1)

    fc2.update()
    fc1.update()
    conv3.update()
    conv2.update()
    conv1.update()

In [233]:
y_trial

array([6, 9, 9, 4, 1, 1, 2, 7, 8, 3, 4, 7, 7, 2, 9, 9, 9, 3, 2, 6, 4, 3,
       6, 6, 2, 6, 3, 5, 4, 0, 0, 9])

In [234]:
begin = time.time()
run(x_trial, y_trial)
print(time.time()-begin, " seconds")

(32, 64)
here (32, 64)
18.447256088256836  seconds


In [33]:
conv1 = conv2d(32, 3)
pool1 = maxpool2d(2)
conv2 = conv2d(64, 5)
pool2 = maxpool2d(2)
conv3 = conv2d(64, 3)
fc1 = fc_1(4096, 64)
fc2 = fc_2(64, 10)

In [34]:
def sgd(X, Y):
    begin = time.time()
    idx = np.random.randint(960, size=32)
    
    for i in range(30):
        x = X[idx]
        y = Y[idx]
        
        # print(x.shape)

        x = x.reshape(32, 3, 32, 32)
        
        # print(x.shape)
    
        a1 = conv1.forward_pass(x)
        a2 = pool1.forward_pass(a1)
        a3 = conv2.forward_pass(a2)
        a4 = pool2.forward_pass(a3)

        # cannot flatten this directly at this position
        a5 = conv3.forward_pass(a4).reshape(32, 4096)
        # print(a5.shape)
        a6 = fc1.forward_pass(a5)
        a7 = fc2.forward_pass(a6)
        out = softmax(a7)
        # print(out.shape)
        # print(i, "actual: ", y[i],  "prediction: ", np.argmax(out))

        onehot = np.zeros((32, 10))
        for j in range(32):
            onehot[j][y[j]] = 1
        grad_fc2 = out - onehot
        # print(grad_fc2.shape)

        grad_fc1 = fc2.backward_pass(grad_fc2)
        grad_conv3 = fc1.backward_pass(grad_fc1).reshape(32, 64, 8, 8)
        grad_pool2 = conv3.backward_pass(grad_conv3)
        grad_conv2 = pool2.backward_pass(grad_pool2)
        grad_pool1 = conv2.backward_pass(grad_conv2)
        grad_conv1 = pool1.backward_pass(grad_pool1)
        init_grad = conv1.backward_pass(grad_conv1)

        fc2.update()
        fc1.update()
        conv3.update()
        conv2.update()
        conv1.update()
        
        prediction = np.argmax(out, axis = 1)
        acc = np.sum(prediction == y)/32
        
        print(i, "time: ", time.time()-begin, " seconds")
        print("batch accuracy = ", acc)

In [35]:
sgd(x_train, y_train)

0 time:  17.548564910888672  seconds
batch accuracy =  0.125
1 time:  35.03215408325195  seconds
batch accuracy =  0.0625
2 time:  52.52545619010925  seconds
batch accuracy =  0.09375
3 time:  70.01437592506409  seconds
batch accuracy =  0.125
4 time:  87.49400806427002  seconds
batch accuracy =  0.0625
5 time:  104.96229100227356  seconds
batch accuracy =  0.15625
6 time:  122.4396460056305  seconds
batch accuracy =  0.0625
7 time:  139.91761994361877  seconds
batch accuracy =  0.0625
8 time:  157.99947118759155  seconds
batch accuracy =  0.09375
9 time:  175.62105798721313  seconds
batch accuracy =  0.125
10 time:  193.11893820762634  seconds
batch accuracy =  0.0625
11 time:  210.67724013328552  seconds
batch accuracy =  0.03125
12 time:  228.28853511810303  seconds
batch accuracy =  0.125
13 time:  245.8278729915619  seconds
batch accuracy =  0.0625
14 time:  263.5075452327728  seconds
batch accuracy =  0.0625
15 time:  281.0798919200897  seconds
batch accuracy =  0.0625
16 time:  

KeyboardInterrupt: 

In [36]:
idx = np.random.randint(50000, size=32)
x = x_train[idx]
y = y_train[idx]

# print(x.shape)

x = x.reshape(32, 3, 32, 32)

# print(x.shape)

a1 = conv1.forward_pass(x)
a2 = pool1.forward_pass(a1)
a3 = conv2.forward_pass(a2)
a4 = pool2.forward_pass(a3)

# cannot flatten this directly at this position
a5 = conv3.forward_pass(a4).reshape(32, 4096)
# print(a5.shape)
a6 = fc1.forward_pass(a5)
a7 = fc2.forward_pass(a6)
print(a7)
# out = softmax(a7)

[[6.65051499e+45 6.65069833e+45 6.65063137e+45 6.65071633e+45
  6.65051469e+45 6.65055012e+45 6.65052601e+45 6.65063294e+45
  6.65058867e+45 6.65056834e+45]
 [6.09307724e+45 6.09324522e+45 6.09318387e+45 6.09326171e+45
  6.09307696e+45 6.09310942e+45 6.09308733e+45 6.09318530e+45
  6.09314474e+45 6.09312612e+45]
 [7.40534554e+45 7.40554969e+45 7.40547513e+45 7.40556973e+45
  7.40534520e+45 7.40538465e+45 7.40535781e+45 7.40547687e+45
  7.40542758e+45 7.40540495e+45]
 [3.85659622e+45 3.85670255e+45 3.85666372e+45 3.85671298e+45
  3.85659605e+45 3.85661660e+45 3.85660261e+45 3.85666462e+45
  3.85663895e+45 3.85662716e+45]
 [4.58895481e+45 4.58908133e+45 4.58903512e+45 4.58909375e+45
  4.58895460e+45 4.58897905e+45 4.58896242e+45 4.58903620e+45
  4.58900566e+45 4.58899163e+45]
 [4.93205272e+45 4.93218869e+45 4.93213903e+45 4.93220204e+45
  4.93205250e+45 4.93207877e+45 4.93206089e+45 4.93214019e+45
  4.93210736e+45 4.93209229e+45]
 [5.19660343e+45 5.19674670e+45 5.19669438e+45 5.19676076e