### Illustration of how a convolutional layer works

This example is taken from http://cs231n.github.io/convolutional-networks/; I strongly recommend taking a look at the animated GIF on that page.

In [1]:
import numpy as np

In [2]:
# Input volume (e.g. an image): 5x5 spatial grid with 3 channels.
# Each nested list below is one channel; np.dstack places them along the
# last axis, so X[:, :, k] is the k-th channel. Cast to float to get float64.
X = np.dstack([
    # channel 0
    [[1, 2, 1, 0, 0],
     [0, 1, 1, 0, 1],
     [2, 1, 2, 2, 0],
     [0, 2, 1, 1, 1],
     [2, 2, 2, 2, 1]],
    # channel 1
    [[2, 1, 0, 2, 2],
     [0, 2, 0, 0, 0],
     [0, 2, 1, 2, 2],
     [2, 2, 2, 1, 2],
     [2, 1, 2, 2, 2]],
    # channel 2
    [[0, 1, 1, 0, 2],
     [0, 0, 2, 0, 2],
     [0, 1, 1, 1, 2],
     [2, 2, 0, 1, 0],
     [0, 0, 0, 1, 2]],
]).astype(float)

In [3]:
# First 3x3x3 filter. Each nested list is the 3x3 kernel for one input
# channel; np.dstack puts them along the last axis (w0[:, :, k] is the
# kernel applied to channel k). Cast to float to get float64.
w0 = np.dstack([
    # kernel for channel 0
    [[1, -1, -1],
     [-1, 0, -1],
     [-1, 1, -1]],
    # kernel for channel 1
    [[-1, 1, 1],
     [1, -1, 0],
     [0, 1, 1]],
    # kernel for channel 2
    [[0, 1, -1],
     [-1, -1, 0],
     [1, 0, 1]],
]).astype(float)

In [4]:
# Second filter: same shape and dtype as w0, filled in a single assignment.
# Each nested list is the 3x3 kernel for one input channel; np.dstack puts
# them along the last axis so w1[:, :, k] is the kernel for channel k.
w1 = np.zeros_like(w0)
w1[...] = np.dstack([
    # kernel for channel 0
    [[1, -1, 0],
     [1, 0, 1],
     [1, 1, -1]],
    # kernel for channel 1
    [[-1, -1, -1],
     [-1, 1, 1],
     [1, 0, -1]],
    # kernel for channel 2
    [[0, 0, -1],
     [-1, 1, 0],
     [0, -1, 1]],
])

In [5]:
# One bias per filter, each stored as a 1x1x1 array.
b0, b1 = np.zeros((1, 1, 1)), np.zeros((1, 1, 1))
b0[..., 0] = 1  # bias added to the w0 response
b1[..., 0] = 0  # bias added to the w1 response

In [6]:
# Zero-padding: add pad_width rows/columns of zeros on every side of the two
# spatial axes. The channel axis (last) is left unpadded via the (0, 0) entry,
# so a single np.pad call handles all channels at once and no per-channel
# loop (or pre-sized buffer) is needed.
pad_width = 1
X_pad = np.pad(
    X,
    ((pad_width, pad_width), (pad_width, pad_width), (0, 0)),
    mode="constant",
    constant_values=0
)

# Show each channel before and after padding.
print("Input without padding:\n\n{}\n\n{}\n\n{}".format(X[:, :, 0], X[:, :, 1], X[:, :, 2]))
print("Input with padding:\n\n{}\n\n{}\n\n{}".format(X_pad[:, :, 0], X_pad[:, :, 1], X_pad[:, :, 2]))

Input without padding:

[[ 1.  2.  1.  0.  0.]
 [ 0.  1.  1.  0.  1.]
 [ 2.  1.  2.  2.  0.]
 [ 0.  2.  1.  1.  1.]
 [ 2.  2.  2.  2.  1.]]

[[ 2.  1.  0.  2.  2.]
 [ 0.  2.  0.  0.  0.]
 [ 0.  2.  1.  2.  2.]
 [ 2.  2.  2.  1.  2.]
 [ 2.  1.  2.  2.  2.]]

[[ 0.  1.  1.  0.  2.]
 [ 0.  0.  2.  0.  2.]
 [ 0.  1.  1.  1.  2.]
 [ 2.  2.  0.  1.  0.]
 [ 0.  0.  0.  1.  2.]]
Input with padding:

[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  2.  1.  0.  0.  0.]
 [ 0.  0.  1.  1.  0.  1.  0.]
 [ 0.  2.  1.  2.  2.  0.  0.]
 [ 0.  0.  2.  1.  1.  1.  0.]
 [ 0.  2.  2.  2.  2.  1.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]]

[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  2.  1.  0.  2.  2.  0.]
 [ 0.  0.  2.  0.  0.  0.  0.]
 [ 0.  0.  2.  1.  2.  2.  0.]
 [ 0.  2.  2.  2.  1.  2.  0.]
 [ 0.  2.  1.  2.  2.  2.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]]

[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  1.  0.  2.  0.]
 [ 0.  0.  0.  2.  0.  2.  0.]
 [ 0.  0.  1.  1.  1.  2.  0.]
 [ 0.  2.  2.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  1.  2.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]]

In [7]:
# Spatial output size of the conv layer: (W - F + 2P) / S + 1

W = X.shape[0]    # input size (first spatial dimension of X)
F = w0.shape[0]   # filter size (first spatial dimension of w0)
P = pad_width     # pad width
S = 2             # stride step

# true division, so the result is printed as a float
print("Output size:", (W + 2 * P - F) / S + 1)

Output size: 3.0


In [8]:
# Strided convolution of the padded input with both filters.
#
# Number of window positions along each spatial axis. This uses the stride S
# (rather than a literal 2) so the loop bounds always agree with the i*S / j*S
# slicing below, and the output buffer is sized from the same value.
n_out = (W - F + 2 * P) // S + 1

# one output channel per filter (w0 -> channel 0, w1 -> channel 1)
output = np.zeros((n_out, n_out, 2))

for i in range(n_out):
    for j in range(n_out):
        # F x F window over all input channels, top-left corner at (i*S, j*S)
        window = X_pad[(i*S):(i*S+F), (j*S):(j*S+F), :]
        # elementwise product with the filter, summed, plus the filter's bias;
        # the bias is read as a scalar (b[0, 0, 0]) so no size-1 array has to
        # be implicitly converted when assigning to a single output element
        output[i, j, 0] = np.sum(window * w0) + b0[0, 0, 0]
        output[i, j, 1] = np.sum(window * w1) + b1[0, 0, 0]

In [9]:
# Show the two output channels (last axis of `output`), one per filter.
print("Ouput 1:\n{}".format(output[..., 0]))
print("Ouput 2:\n{}".format(output[..., 1]))

Ouput 1:
[[-2. -2.  0.]
 [ 5.  1.  0.]
 [-1. -4. -3.]]
Ouput 2:
[[ 2.  5.  1.]
 [-3.  6.  5.]
 [-1.  2.  0.]]
