In [6]:
import torch 

Convolutional Neural Networks (CNN) use the convolutional layer to extract features given an input. The output shape of the convolution is affected by the shape of the kernel, whether zero padding and strides is applied. This short tutorial illustrates such issues via pytorch examples.

**See**. V. Dumoulin and F. Visin, <i>A guide to convolution arithmetic for deep learning</i> <b>2016</b>. doi: 10.48550/ARXIV.1603.07285.

In [7]:
# First let’s create a fixed size inverted index. Each line corresponds to a words embedding
# 
# padding_idx is a reserved word that is used as wildcard when an unknown word occurs in text.
embedding = torch.nn.Embedding(10, 5, padding_idx=9)
embedding.weight

Parameter containing:
tensor([[ 1.1468, -0.4076, -1.3490,  1.3407, -0.4617],
        [ 0.2554, -0.4200, -0.9022, -0.3080, -1.4300],
        [-0.2234,  1.1453,  3.5321, -0.7680,  0.5610],
        [ 1.9435,  0.8450,  0.8677,  2.0463, -0.6149],
        [-1.2287, -0.9967,  0.4377,  0.2498, -2.8136],
        [-0.1270,  0.5649, -0.6316, -0.6323,  0.1887],
        [ 1.6070,  0.8327, -1.4601,  0.7082,  1.0071],
        [ 0.2133,  2.2039, -0.5650,  0.2053,  0.1076],
        [ 0.4367,  1.4659, -0.4419, -0.5802,  1.8514],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]], requires_grad=True)

In [10]:
# For debuggin purposes I'll create a synthetic embedding table 
for i in range(1,11):
    embedding.weight[i-1,:] = i
embedding.weight

Parameter containing:
tensor([[ 1.,  1.,  1.,  1.,  1.],
        [ 2.,  2.,  2.,  2.,  2.],
        [ 3.,  3.,  3.,  3.,  3.],
        [ 4.,  4.,  4.,  4.,  4.],
        [ 5.,  5.,  5.,  5.,  5.],
        [ 6.,  6.,  6.,  6.,  6.],
        [ 7.,  7.,  7.,  7.,  7.],
        [ 8.,  8.,  8.,  8.,  8.],
        [ 9.,  9.,  9.,  9.,  9.],
        [10., 10., 10., 10., 10.]], grad_fn=<CopySlices>)

**1D Convolution**

In [11]:
# Assume a sequence of words labeled as integers, i.e. [1,2,3]. The invertex
# index is used transform each integer to its vector form
_1DT = embedding(torch.tensor([1, 2, 3]))

# Estimate the transpose
_1DT = torch.transpose(embedding(torch.tensor([1, 2, 3])),0,1)
print(_1DT)

# The tensor shape
print(_1DT.shape)

tensor([[2., 3., 4.],
        [2., 3., 4.],
        [2., 3., 4.],
        [2., 3., 4.],
        [2., 3., 4.]], grad_fn=<TransposeBackward0>)
torch.Size([5, 3])


In [16]:
# 1-D Zero padding, unit stride (on one dimension), 1 input/output channel
# The convolution output is defined as o = ( i - k ) + 1

# Let d be the number of input channels. In our case, the embedding dimension.
# Let r be the number of output channels. For each channel a kernel of size h is created. Each with different coefficients
# Let h be the size of the kernel.
# Let s be the stride.

d = _1DT.shape[0]
r = 1
h = 2
s = 1

conv = torch.nn.Conv1d(in_channels=d, out_channels=r, kernel_size=h, stride=s,  bias=False)

# To make the convolution effect more visual. Bias terms are disabled from Conv1d and weight 
# coeficients are made 1 as follows
with torch.no_grad():
    conv.weight[:,:,:] = 1.

# 
conv(_1DT.unsqueeze(0))
    
# Shure enough the convolution produced the expected output

tensor([[[25., 35.]]], grad_fn=<SqueezeBackward1>)

Now lets assume two texts of different lengths. The first is padded so that the both have equal lenght.

In [24]:
# Assume a sequence of words labeled as integers, i.e. [1,2,3]. The invertex
# index is used transform each integer to its vector form
_2DT = embedding(torch.tensor([ [0, 1, 2, 9, 9],
                                [3, 4, 5, 7, 8] ]))
# Estimate the transpose
_2DT = torch.transpose(_2DT,1,2)
print(_1DT)

# The tensor shape
print(_2DT.shape)

tensor([[[ 1.,  2.,  3., 10., 10.],
         [ 1.,  2.,  3., 10., 10.],
         [ 1.,  2.,  3., 10., 10.],
         [ 1.,  2.,  3., 10., 10.],
         [ 1.,  2.,  3., 10., 10.]],

        [[ 4.,  5.,  6.,  8.,  9.],
         [ 4.,  5.,  6.,  8.,  9.],
         [ 4.,  5.,  6.,  8.,  9.],
         [ 4.,  5.,  6.,  8.,  9.],
         [ 4.,  5.,  6.,  8.,  9.]]], grad_fn=<TransposeBackward0>)
torch.Size([2, 5, 5])


In [25]:
conv = torch.nn.Conv1d(in_channels=5, out_channels=1, kernel_size=2, stride=1,  bias=False)

# Para propósito de debug. Inicializo la matriz cero a unos. Si el número de 
# canales es mayor que 1. Entocnes es necesario inicializar las matrices de 
# coeficientes adicionales que se crean.
with torch.no_grad():
    conv.weight[0,:,:] = 1.

# Se imprime los coeficientes de la primera matriz.
print("The weight matrix weights are ", conv.weight)

# Se aplica la convolución y se imprime los coeficientes resultantes.
print("Result : ", conv(_2DT))

# Every thing worked as expected.

The weight matrix weights are  Parameter containing:
tensor([[[1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.]]], requires_grad=True)
Result :  tensor([[[ 15.,  25.,  65., 100.]],

        [[ 45.,  55.,  70.,  85.]]], grad_fn=<SqueezeBackward1>)


Max polling provides invariance to small translations of a given input. It reduce the size
of feature maps by using some function to summarize subregions, such as taking the average or the maximum value. Let test it out. First, lets take the maximum value. 

In [None]:
# kernel_size, the size of the sliding window.
# stride, the stride of the sliding window, must be > 0. Default value is kernel_size.
# padding, Implicit negative infinity padding to be added on both sides, must be >= 0 and <= kernel_size / 2.
nn.MaxPool1d(