In [2]:
import torch
from torch import nn
import numpy as np

## Explore

### Common Functions

In [4]:
# Draw a 10x2 tensor of normally distributed values (mean 0, std 0.2)
torch.normal(mean=0.0, std=0.2, size=(10, 2))

tensor([[ 0.0439,  0.2717],
        [-0.2739, -0.3007],
        [-0.0572, -0.3011],
        [ 0.2190,  0.1893],
        [-0.0176,  0.1996],
        [-0.0494, -0.0703],
        [-0.0410,  0.0712],
        [ 0.0198,  0.2211],
        [ 0.3262,  0.2361],
        [ 0.1248, -0.1574]])

In [14]:
# Slicing tensors: a range index (1:2) keeps that axis, an integer index drops it
X = torch.normal(mean=0.0, std=1.0, size=(3, 2, 1))
X[1, 1:2], X[0, 1]

(tensor([[-0.1294]]), tensor([0.0873]))

### Linear Layer
* It takes any (n1, n2, ..., nk, in_dim) tensor and maps it to an (n1, n2, ..., nk, out_dim) tensor.

In [5]:
in_dim = 4
out_dim = 2
# Linear map from in_dim to out_dim features, without a bias term
linear_layer = nn.Linear(in_dim, out_dim, bias=False)

In [6]:
batch_size = 3
# Uniform random input with two extra leading axes in front of (batch_size, in_dim)
X = torch.rand(1, 2, batch_size, in_dim)
# Show the tensor followed by its shape
display(X, X.shape)

tensor([[[[0.6020, 0.7538, 0.4929, 0.6953],
          [0.5153, 0.4116, 0.6765, 0.0383],
          [0.0749, 0.8834, 0.7991, 0.8148]],

         [[0.2680, 0.3155, 0.4103, 0.2258],
          [0.4591, 0.5276, 0.2865, 0.3702],
          [0.3893, 0.3579, 0.8499, 0.0464]]]])

torch.Size([1, 2, 3, 4])

In [7]:
# Apply the layer to X and show the result as the cell output
linear_layer(X)

tensor([[[[-0.8013, -0.3974],
          [-0.5385, -0.1659],
          [-0.7853, -0.0920]],

         [[-0.3974, -0.0973],
          [-0.5067, -0.3278],
          [-0.5550, -0.0122]]]], grad_fn=<UnsafeViewBackward>)

### Softmax
Takes any tensor of shape (d1, d2, ..., dk, D) and returns a tensor of the same shape.
Softmax is applied along the last dimension.

In [13]:
# A (4, 3, 2) tensor of uniform random values used as the softmax input.
# (An earlier 5-D assignment was dropped: it was overwritten immediately and never used.)
X = torch.rand(4, 3, 2)

In [14]:
# Show the shape of X before applying softmax
X.shape

torch.Size([4, 3, 2])

In [15]:
# Normalise over the last axis so that every length-2 slice sums to 1
torch.softmax(X, dim=-1)

tensor([[[0.4340, 0.5660],
         [0.2812, 0.7188],
         [0.5198, 0.4802]],

        [[0.2769, 0.7231],
         [0.5045, 0.4955],
         [0.5857, 0.4143]],

        [[0.2973, 0.7027],
         [0.6497, 0.3503],
         [0.5015, 0.4985]],

        [[0.6732, 0.3268],
         [0.5544, 0.4456],
         [0.3603, 0.6397]]])

### Torch.repeat_interleave

* What do `torch.repeat_interleave` and `torch.tile` do? How are they different?
    - `torch.tile` is available only in PyTorch version >= 1.8
* How are they different from `numpy.repeat` and `numpy.tile`?
* `numpy.repeat` ~ `torch.repeat_interleave`
* `numpy.tile` ~ `torch.tile` ( >= v1.8)
* `numpy.tile` ~ `torch.Tensor.repeat` ( <= v1.7)

In [16]:
# Small 2x2 integer tensor used in the repeat / tile examples
y = torch.tensor(
    [
        [1, 2],
        [3, 4],
    ]
)
y

tensor([[1, 2],
        [3, 4]])

In [17]:
# Repeat each row of y three times in a row (element-wise repetition along axis 0)
y.repeat_interleave(3, dim=0)

tensor([[1, 2],
        [1, 2],
        [1, 2],
        [3, 4],
        [3, 4],
        [3, 4]])

In [18]:
# DON'T USE in new code:
# Tensor.repeat tiles the whole tensor (2 copies along each axis here);
# torch.tile / Tensor.tile replaces it on PyTorch >= 1.8
y.repeat(2, 2)

tensor([[1, 2, 1, 2],
        [3, 4, 3, 4],
        [1, 2, 1, 2],
        [3, 4, 3, 4]])

In [22]:
# 1-D NumPy array used for the np.repeat / np.tile comparisons
yNP = np.array((1, 2))
yNP

array([1, 2])

In [23]:
# Repeat every element three times; the function form np.repeat(yNP, 3) is equivalent
yNP.repeat(3)

array([1, 1, 1, 2, 2, 2])

In [24]:
# np.repeat is the NumPy counterpart of torch.repeat_interleave.
# A list of counts gives one repeat count per element: 1 -> 3 times, 2 -> 7 times.
np.repeat(yNP, repeats=[3, 7])

array([1, 1, 1, 2, 2, 2, 2, 2, 2, 2])

In [25]:
# Tile the whole array: 2 copies stacked as rows, each row holding 2 copies side by side
np.tile(yNP, reps=(2, 2))

array([[1, 2, 1, 2],
       [1, 2, 1, 2]])

### Unsqueeze and BMM 

In [27]:
N, M = 5, 7
# Two independent (N, M) standard-normal tensors; keys is drawn first, then values
keys, values = torch.randn(N, M), torch.randn(N, M)

In [28]:
# Display the (N, M) keys tensor
keys

tensor([[-0.8522,  0.3205,  0.9211, -0.6583,  0.4858, -1.7299, -1.1148],
        [-0.7381,  0.7560, -1.5419,  0.9316,  0.3500,  0.3181,  0.1209],
        [-1.8701, -0.7071, -1.8924, -0.1258, -0.2905, -0.0929,  0.1500],
        [ 1.4270,  0.5234, -0.7496,  0.6419,  0.6141,  0.2950,  0.8062],
        [ 0.3389, -0.8555, -2.3870, -1.8503,  0.9953,  1.0388,  2.7027]])

In [29]:
# Display the (N, M) values tensor
values

tensor([[ 5.2002e-01, -8.5105e-01, -3.6576e-01, -8.2473e-01, -7.8378e-01,
         -9.2171e-02,  9.3114e-01],
        [-2.4213e+00, -1.0258e+00,  1.2937e+00, -1.3050e+00,  1.4876e+00,
         -8.6324e-02, -8.6592e-01],
        [-1.8177e+00,  4.6732e-01, -1.6856e-01,  5.2153e-01,  6.0094e-01,
          7.8206e-01,  8.6765e-01],
        [ 1.3875e+00, -1.5491e-01, -9.0206e-01,  1.8910e-03, -1.8728e+00,
          8.4011e-01,  3.3396e-01],
        [-6.3718e-02,  1.3403e+00, -6.0417e-01,  1.9313e+00,  1.0785e+00,
          9.9741e-01, -8.9084e-01]])

We want to compute the row-wise dot product of `keys` and `values`, i.e. the following:

In [30]:
# Row-wise dot product: multiply element-wise, then sum across each row (axis 1)
torch.sum(keys * values, dim=1)

tensor([-1.7692, -1.8104,  3.2051,  1.9434, -3.5976])

In [31]:
# unsqueeze(1) inserts an additional axis of size 1 at position 1.
# squeeze(1) removes exactly that axis again; a bare squeeze() would remove
# *every* size-1 axis, so the round trip would change the shape if N or M were 1.
keys.shape, keys.unsqueeze(1).shape, keys.unsqueeze(1).squeeze(1).shape

(torch.Size([5, 7]), torch.Size([5, 1, 7]), torch.Size([5, 7]))

With batch multiplication there is a lot of reshaping

In [32]:
# Batched (1 x M) @ (M x 1) product per row, flattened back to one value per row
(
    keys.unsqueeze(1)           # (N, 1, M)
    .bmm(values.unsqueeze(-1))  # (N, 1, M) @ (N, M, 1) -> (N, 1, 1)
    .flatten()                  # -> (N,)
)

tensor([-1.7692, -1.8104,  3.2051,  1.9434, -3.5976])

With einsum it's elegant and simple

In [33]:
# "ij,ij->i": multiply matching entries and sum over j, giving one value per row i.
# Operands are passed as (keys, values), matching the other cells in this section.
torch.einsum("ij,ij->i", keys, values)

tensor([-1.7692, -1.8104,  3.2051,  1.9434, -3.5976])