# Rotary PE
This notebook is the implementation of [RoFormer]
https://arxiv.org/pdf/2104.09864v5

This implementation assumes d_model as embedding dimension for a single head

In [131]:
import tensorflow as tf
import torch
import numpy as np

In [132]:
block_size=4  #also called as context window or seq length
d_model=8      # embedding dimension
batch_size=4
num_heads=2

In [133]:
#calculating sinusoidal positition
positions=torch.arange(block_size)[:,np.newaxis] # adds dimension 
positions

tensor([[0],
        [1],
        [2],
        [3]])

In [134]:
# = {θi = 10000−2(i−1)/d, i ∈ [1, 2, ..., d/2]}. 
assert d_model%2==0
i=torch.arange(1,d_model//2+1)
exp_term=2*(i-1)/d_model
theta=100000**(-exp_term)

In [135]:
pos_emb=np.zeros([block_size,d_model])
pos_emb[:,0::2]=torch.cos(positions*theta)
pos_emb[:,1::2]=torch.sin(positions*theta)
pos_emb=pos_emb[np.newaxis,:,:]

In [136]:
pos_emb.shape

(1, 4, 8)

# Implementation of 3.4.2 from RoFormer

In [143]:

qw=torch.randn(batch_size,block_size,num_heads,d_model)#qw is token_embedding for single head
cosine_term=tf.repeat(pos_emb[...,None,0::2],repeats=2,axis=-1)# None is to add extra dimension for broadcasting 
sine_term=tf.repeat(pos_emb[...,None,1::2],repeats=2,axis=-1)
print(pos_emb)
print('cosine',cosine_term[:,1,:,:])
print('sine',sine_term[:,1,:,:])


[[[ 1.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
    1.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00]
  [ 5.40302277e-01  8.41470957e-01  9.98419285e-01  5.62044978e-02
    9.99994993e-01  3.16227227e-03  1.00000000e+00  1.77827940e-04]
  [-4.16146845e-01  9.09297407e-01  9.93682086e-01  1.12231314e-01
    9.99979973e-01  6.32451288e-03  9.99999940e-01  3.55655880e-04]
  [-9.89992499e-01  1.41120002e-01  9.85803485e-01  1.67903304e-01
    9.99954998e-01  9.48669016e-03  9.99999881e-01  5.33483806e-04]]]
cosine tf.Tensor(
[[[0.54030228 0.54030228 0.99841928 0.99841928 0.99999499 0.99999499
   1.         1.        ]]], shape=(1, 1, 8), dtype=float64)
sine tf.Tensor(
[[[8.41470957e-01 8.41470957e-01 5.62044978e-02 5.62044978e-02
   3.16227227e-03 3.16227227e-03 1.77827940e-04 1.77827940e-04]]], shape=(1, 1, 8), dtype=float64)


In [138]:
qw2=torch.stack([-qw[...,1::2],qw[...,0::2]],axis=-1)
qw2=qw2.view(qw.shape)

In [139]:
print(qw[1,1,1,:])# this is matrix of (x1,x2,..) which is multiplied to cosine_term
print(qw2[1,1,1,:])# this is matrix of (-x2,x1,..) which is multiplied to sine_term

tensor([ 0.3123, -0.2228,  1.2253, -0.1009, -0.6112, -0.5076,  0.2906,  0.5408])
tensor([ 0.2228,  0.3123,  0.1009,  1.2253,  0.5076, -0.6112, -0.5408,  0.2906])


In [142]:
qw=qw*cosine_term+qw2*sine_term