In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#default_exp dl.utils.relative_positional
from nbdev.showdoc import show_doc

This notebook is about implementing relative positional encoding introduced in [Shaw et al (2018)](https://arxiv.org/pdf/1803.02155.pdf) and refined by [Huang et al (2018)](https://arxiv.org/pdf/1809.04281.pdf)

Shaw et al. originally introduce two sets of learned parameters, $a_{ij}^K$ and $a_{ij}^V$ (each of size $L^2 D$, where $L$ is the sequence length and $D$ is the hidden dimension), which are added to the keys and to the values, respectively, when computing attention.

$$
e_{ij} = \frac{x_i W^Q(x_jW^K + a_{ij}^K)^T}{\sqrt{d}} \tag{1}
$$

$$
\alpha_{ij} = \frac{e^{e_{ij}}}{\sum_{k=1}^{k=n}e^{e_{ik}}}
$$

The relative position terms are also added to the values:

$$
z_i = \sum_{j=1}^{j=n}\alpha_{ij}(x_j W^V + a_{ij}^V) \tag{2}
$$

Equation 1 is equivalent to the following matrix form:

$$
\text{RelativeAttention} = \text{Softmax} \left( \frac{Q K^\top + S_{rel}}{\sqrt{D_h}} \right) V \tag{3}
$$

where $R$ holds the relative position embeddings ($a_{ij}^K$ in equation 1) and
$$
S_{rel} = Q R^\top
$$

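To address the memory concerns, Huang et al. proposed a skewing algorithm that computes $S_{rel}$ directly, without materializing the intermediate $R$ at all, which cuts the memory footprint down to that of the position embeddings alone. While the idea of not materializing $R$ is sound, I think the paper made a mistake in how $S_{rel}$ is computed: I don't think it can be obtained by adding one column and then shifting. (Caveat: the paper states its procedure for causal attention, where $E^r$ only covers the relative distances $-L+1, \dots, 0$ and the upper triangle of $S_{rel}$ is masked out; the counter-example below instead feeds it the distances $0, \dots, L-1$ and compares the full matrix, so it should be re-checked under the paper's own assumptions.)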

![Skew algorithm ](img/skew_alg.png)


Here is an illustration of why I think the paper's procedure is wrong.


In [6]:
import numpy as np

In [72]:
# L: sequence length
L = 5

# D: hidden dimension
D = 1

# All-ones queries, so each row of Q @ Er.T simply reads the embedding values back
Q = np.ones((L, D))
# Er: relative positional embedding, one row per relative distance -(L-1)..(L-1)
Er = np.arange(1 - L, L).reshape(-1, 1)

In [73]:
Q,Er

(array([[1.],
        [1.],
        [1.],
        [1.],
        [1.]]),
 array([[-4],
        [-3],
        [-2],
        [-1],
        [ 0],
        [ 1],
        [ 2],
        [ 3],
        [ 4]]))

In [74]:
# Score every query against every relative-position embedding: shape (L, 2L-1)
QEr = Q @ Er.T

In [75]:
QEr

array([[-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.],
       [-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.],
       [-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.],
       [-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.],
       [-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.]])

In [76]:
def naive_skew(QEr):
    """Reference skew: slice each query row's window out of QEr, one row at a time.

    Args:
        QEr: 2-D array of shape (seq_len, n_cols) with n_cols >= 2 * seq_len - 1.
            Column c is assumed to hold the score for relative distance
            c - (seq_len - 1).

    Returns:
        Array of shape (seq_len, seq_len) whose entry (q, k) equals
        QEr[q, k - q + seq_len - 1].

    Raises:
        ValueError: if QEr is not 2-D, or if it has fewer than
            2 * seq_len - 1 columns.
    """
    seq_len, n_cols = QEr.shape
    needed = 2 * seq_len - 1
    if n_cols < needed:
        # Without this check the slices below come back shorter than seq_len
        # and the stacked rows are ragged.
        raise ValueError(
            f"QEr needs at least {needed} columns for {seq_len} rows, got {n_cols}"
        )
    # For query q, key 0 sits at relative distance -q, i.e. column seq_len - 1 - q.
    return np.array(
        [QEr[q, seq_len - 1 - q:needed - q] for q in range(seq_len)]
    )

In [77]:
# Reference result from the explicit per-row slicing; used below as ground truth
ideal_sel = naive_skew(QEr)
ideal_sel

array([[ 0.,  1.,  2.,  3.,  4.],
       [-1.,  0.,  1.,  2.,  3.],
       [-2., -1.,  0.,  1.,  2.],
       [-3., -2., -1.,  0.,  1.],
       [-4., -3., -2., -1.,  0.]])

In [79]:
# Walk through the paper's pad-and-reshape procedure to compare it with ideal_sel.
# NOTE(review): Er[-seq_len:] keeps the embeddings for distances 0..L-1. The
# paper's procedure appears to assume the causal case, i.e. distances -(L-1)..0
# (Er[:seq_len]) with the upper triangle masked -- confirm against the paper
# before concluding that the procedure itself is wrong.
# This also overwrites QEr with an (L, L) matrix.
seq_len = QEr.shape[0]
QEr = Q @ Er[-seq_len:].T

# pad a dummy column vector of length L before the leftmost column
# (the fill value -1 only marks the padded cells in the printed output)
seq_len = QEr.shape[0]
QEr_pad = np.pad(QEr, ((0, 0), (1, 0)), constant_values=(-1, -1))
QEr_pad

array([[-1.,  0.,  1.,  2.,  3.,  4.],
       [-1.,  0.,  1.,  2.,  3.,  4.],
       [-1.,  0.,  1.,  2.,  3.,  4.],
       [-1.,  0.,  1.,  2.,  3.,  4.],
       [-1.,  0.,  1.,  2.,  3.,  4.]])

In [80]:
QEr_pad.shape

(5, 6)

In [81]:
# Reshape the (seq_len, seq_len+1) padded matrix to (seq_len+1, seq_len); the
# row-major reflow moves the padded cell one column to the right on each row
QEr_shaped = QEr_pad.reshape((seq_len+1, -1))
QEr_shaped

array([[-1.,  0.,  1.,  2.,  3.],
       [ 4., -1.,  0.,  1.,  2.],
       [ 3.,  4., -1.,  0.,  1.],
       [ 2.,  3.,  4., -1.,  0.],
       [ 1.,  2.,  3.,  4., -1.],
       [ 0.,  1.,  2.,  3.,  4.]])

In [70]:
# Drop the first row so that an (L, L) matrix remains
Sel = QEr_shaped[1:]
Sel

array([[ 4., -1.,  0.,  1.,  2.],
       [ 3.,  4., -1.,  0.,  1.],
       [ 2.,  3.,  4., -1.,  0.],
       [ 1.,  2.,  3.,  4., -1.],
       [ 0.,  1.,  2.,  3.,  4.]])

In [84]:
ideal_sel

array([[ 0.,  1.,  2.,  3.,  4.],
       [-1.,  0.,  1.,  2.,  3.],
       [-2., -1.,  0.,  1.,  2.],
       [-3., -2., -1.,  0.,  1.],
       [-4., -3., -2., -1.,  0.]])

In [85]:
np.allclose(ideal_sel, Sel)

False

In [107]:
# Rebuild the full (L, 2L-1) matrix; QEr was overwritten with an (L, L) slice above
QEr = Q @ Er.T

def skew(QEr):
    """Gather S_rel from QEr with a single vectorised fancy-indexing lookup.

    With seq_len = QEr.shape[0], the result's entry (q, k) is
    QEr[q, k - q + seq_len - 1] for q, k in range(seq_len). This assumes
    column c of QEr holds relative distance c - (seq_len - 1).
    """
    seq_len = QEr.shape[0]
    positions = np.arange(seq_len)
    rows = positions.reshape(-1, 1)
    # Relative distance (k - q) lives at column (k - q) + (seq_len - 1).
    cols = (positions.reshape(1, -1) - rows) + seq_len - 1
    return QEr[rows, cols]


In [108]:
sel = skew(QEr)
sel

array([[ 0.,  1.,  2.,  3.,  4.],
       [-1.,  0.,  1.,  2.,  3.],
       [-2., -1.,  0.,  1.,  2.],
       [-3., -2., -1.,  0.,  1.],
       [-4., -3., -2., -1.,  0.]])

In [111]:
np.allclose(ideal_sel, sel)

True

In [112]:
# torch version
import torch

In [117]:
# L: sequence length
L = 5

# D: hidden dimension
D = 1

# All-ones queries, so each row of Q @ Er.T simply reads the embedding values back
Q = torch.ones(L, D, dtype=torch.float32)
# Er: relative positional embedding, one row per relative distance -(L-1)..(L-1)
Er = torch.arange(1 - L, L, dtype=torch.float32).view(-1, D)
Q, Er

(tensor([[1.],
         [1.],
         [1.],
         [1.],
         [1.]]),
 tensor([[-4.],
         [-3.],
         [-2.],
         [-1.],
         [ 0.],
         [ 1.],
         [ 2.],
         [ 3.],
         [ 4.]]))

In [118]:
def skew(QEr):
    """Rearrange relative-position scores into absolute (query, key) order.

    Args:
        QEr: 2-D torch tensor of shape (seq_len, 2 * seq_len - 1); column c is
            assumed to hold the score for relative distance c - (seq_len - 1).

    Returns:
        Tensor of shape (seq_len, seq_len) whose entry (q, k) is
        QEr[q, k - q + seq_len - 1].
    """
    seq_len = QEr.shape[0]
    # Build the index tensors on QEr's own device so the gather needs no
    # host-to-device index copy when QEr lives on an accelerator.
    positions = torch.arange(seq_len, device=QEr.device)
    q_ind = positions[:, None]
    k_ind = positions[None, :]
    # Relative distance (k - q) is stored at column (k - q) + (seq_len - 1).
    col_ind = k_ind - q_ind + seq_len - 1
    return QEr[q_ind, col_ind]

In [120]:
# Same (L, 2L-1) score matrix as in the NumPy walkthrough, now as a torch tensor
QEr = Q @ Er.T
QEr

tensor([[-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.],
        [-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.],
        [-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.],
        [-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.],
        [-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.]])

In [121]:
skew(QEr)

tensor([[ 0.,  1.,  2.,  3.,  4.],
        [-1.,  0.,  1.,  2.,  3.],
        [-2., -1.,  0.,  1.,  2.],
        [-3., -2., -1.,  0.,  1.],
        [-4., -3., -2., -1.,  0.]])