In [2]:
import torch 

Convolutional Neural Networks (CNN) use the convolutional layer to extract features given an input. The output shape of the convolution is affected by the shape of the kernel, whether zero padding and strides is applied. This short tutorial illustrates such issues via pytorch examples.

**See**. V. Dumoulin and F. Visin, <i>A guide to convolution arithmetic for deep learning</i> <b>2016</b>. doi: 10.48550/ARXIV.1603.07285.

In [3]:
# First let’s create a fixed size inverted index. Each line corresponds to a words embedding
# 
# padding_idx is a reserved word that is used as wildcard when an unknown word occurs in text.
embedding = torch.nn.Embedding(10, 5, padding_idx=9)
embedding.weight

Parameter containing:
tensor([[ 0.2813, -0.7602,  0.4495,  1.2502, -0.9426],
        [ 0.0784, -1.4935, -0.2362, -0.3786,  0.9867],
        [-0.0498, -0.3142,  0.1057,  0.2097,  0.9843],
        [-0.6823, -0.4586, -0.1509,  0.4517, -0.6122],
        [ 0.7627,  0.1686, -0.9021,  0.4643, -0.2864],
        [-0.2242, -2.1846, -0.7038, -0.1789, -0.2455],
        [-0.4982, -1.6962,  1.6620, -0.2756,  0.9716],
        [ 1.0995,  1.8333, -0.3567,  0.6955, -0.5738],
        [-0.2587,  0.2063, -0.6878, -0.7413,  1.2303],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]], requires_grad=True)

In [4]:
# For debuggin purposes lets ,make 
for i in range(1,11):
    embedding.weight[i-1,:] = i
embedding.weight

Parameter containing:
tensor([[ 1.,  1.,  1.,  1.,  1.],
        [ 2.,  2.,  2.,  2.,  2.],
        [ 3.,  3.,  3.,  3.,  3.],
        [ 4.,  4.,  4.,  4.,  4.],
        [ 5.,  5.,  5.,  5.,  5.],
        [ 6.,  6.,  6.,  6.,  6.],
        [ 7.,  7.,  7.,  7.,  7.],
        [ 8.,  8.,  8.,  8.,  8.],
        [ 9.,  9.,  9.,  9.,  9.],
        [10., 10., 10., 10., 10.]], grad_fn=<CopySlices>)

**1D Convolution**

In [5]:
# Assume a sequence of words labeled as integers, i.e. [1,2,3]. The invertex
# index is used transform each integer to its vector form
_1DT = embedding(torch.tensor([1, 2, 3]))

# Estimate the transpose
_1DT = torch.transpose(embedding(torch.tensor([1, 2, 3])),0,1)
print(_1DT)

# The tensor shape
print(_1DT.shape)

tensor([[2., 3., 4.],
        [2., 3., 4.],
        [2., 3., 4.],
        [2., 3., 4.],
        [2., 3., 4.]], grad_fn=<TransposeBackward0>)
torch.Size([5, 3])


In [6]:
# 1-D Zero padding, unit stride (on one dimension), 1 input/output channel
# The convolution output is defined as o = ( i - k ) + 1

# let i be the text length
# Let d be the number of input channels. In our case, it corresponds to the embedding dimension.
# Let r be the number of output channels. For each channel a kernel of size h is created. Each with different coefficients
# Let h be the size of the kernel.
# Let s be the stride.

q = _1DT.shape[0]
r = 1
h = 2
s = 1

conv = torch.nn.Conv1d(in_channels=q, out_channels=r, kernel_size=h, stride=s,  bias=False)

# To make the convolution effect more visual. Bias terms are disabled from Conv1d and weight 
# coeficients are made 1 as follows
with torch.no_grad():
    conv.weight[:,:,:] = 1.

# 
conv(_1DT.unsqueeze(0))
    
# Shure enough the convolution produced the expected output

tensor([[[25., 35.]]], grad_fn=<SqueezeBackward1>)

In [30]:
# 1-D convolution
# in_channels (d), correspon to the embedding dimension.
# out_channels (r), the number of kernels that are applied.
# kernel_size (h)
# stride (s)


conv = torch.nn.Conv1d(in_channels=5, out_channels=1, kernel_size=2, stride=1,  bias=False)


#T_1D = torch.tensor([1, 2, 3, 4, 5, 6], dtype = torch.float)
#print(T_1D.shape)

# El tensor debe ser transformado a una matriz donde:
# - la primera columna corresponda al número de canales de entrada
# - la segunda columna corresponda al número de canales de salida
# - la tercera columna corresponda al tamaño de la señal 
T_1D = T_1D.unsqueeze(0).unsqueeze(0)
print(T_1D.shape)

# En este ejemplo, aplico un núcleo de tamaño 3. Hize que el número de salidas
# sea igual a 1. Si el número de canales de salida es 3, internamente se crean 
# tres kernels los cuales se aplican durante la fase de convolución.
conv = torch.nn.Conv1d(in_channels=1, out_channels=1, kernel_size=3, stride=1,  bias=False)

# Para propósito de debug. Inicializo la matriz cero a unos. Si el número de 
# canales es mayor que 1. Entocnes es necesario inicializar las matrices de 
# coeficientes adicionales que se crean.
with torch.no_grad():
    conv.weight[0,:,:] = 1.

# Se imprime los coeficientes de la primera matriz.
print("The weight matrix weights are ", conv.weight)

# Se aplica la convolución y se imprime los coeficientes resultantes.
print("Result : ", conv(T_1D))

# Every thing worked as expected.

torch.Size([6])
torch.Size([1, 1, 6])
The weight matrix weights are  Parameter containing:
tensor([[[1., 1., 1.]]], requires_grad=True)
Result :  tensor([[[ 6.,  9., 12., 15.]]], grad_fn=<SqueezeBackward1>)


In [53]:
# 2-D tensor + convolution
T_2D = torch.tensor([
    [1,1,1,1,1],
    [2,2,2,2,2],
    [3,3,3,3,3],
    [4,4,4,4,4]
], dtype = torch.float)


# El tensor debe ser transformado a una matriz donde:
# - la primera columna corresponda al número de canales de entrada (tamaño del embedding)
# - la segunda columna corresponda al número de canales de salida (el número de núcleos)
# - la tercera columna corresponda al tamaño de la señal 
T_2D = T_2D.unsqueeze(0)
print("El contenido del tensor es : ", T_2D)
print("La dimensión del tensor es : ", T_2D.shape)

# En este ejemplo. Cada canal representa una palabra codificada en su forma vectorial
# Deseo un kernel bidimensional unitario con un stride de 1 
conv = torch.nn.Conv1d(in_channels=5, out_channels=1, kernel_size=5, stride=1,  bias=False)

# Para propósito de debug. Inicializo la matriz cero a unos. Si el número de 
# canales es mayor que 1. Entocnes es necesario inicializar las matrices de 
# coeficientes adicionales que se crean.
with torch.no_grad():
    conv.weight[:,:,:] = 1.

# Se imprime los coeficientes de la primera matriz.
print("The weight matrix weights are ", conv.weight)

# Se aplica la convolución y se imprime los coeficientes resultantes.
print("Result : ", conv(T_2D))

El contenido del tensor es :  tensor([[[1., 1., 1., 1., 1.],
         [2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3.],
         [4., 4., 4., 4., 4.]]])
La dimensión del tensor es :  torch.Size([1, 4, 5])
The weight matrix weights are  Parameter containing:
tensor([[[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]], requires_grad=True)


RuntimeError: Given groups=1, weight of size 1 5 5, expected input[1, 4, 5] to have 5 channels, but got 4 channels instead

In [7]:
# 2-D tensor, 3 channels
T_2D = torch.tensor([  
    [[1,1,0,1,1],[2,2,0,2,2],[3,3,0,3,3],[4,4,0,4,4],[5,5,0,5,5]], 
    [[1,0,1,0,1],[2,0,2,0,2],[3,0,3,0,3],[4,0,4,0,4],[5,0,5,0,5]],
    [[0,1,0,1,0],[0,2,0,2,0],[0,3,0,3,0],[0,4,0,4,0],[0,5,0,5,0]]
], dtype = torch.float)
T_2D

tensor([[[1., 1., 0., 1., 1.],
         [2., 2., 0., 2., 2.],
         [3., 3., 0., 3., 3.],
         [4., 4., 0., 4., 4.],
         [5., 5., 0., 5., 5.]],

        [[1., 0., 1., 0., 1.],
         [2., 0., 2., 0., 2.],
         [3., 0., 3., 0., 3.],
         [4., 0., 4., 0., 4.],
         [5., 0., 5., 0., 5.]],

        [[0., 1., 0., 1., 0.],
         [0., 2., 0., 2., 0.],
         [0., 3., 0., 3., 0.],
         [0., 4., 0., 4., 0.],
         [0., 5., 0., 5., 0.]]])