In [1]:
from einops import einsum   
import numpy as np
import torch
from typing import List

In [2]:
torch.arange(36).reshape(6, 6)

tensor([[ 0,  1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10, 11],
        [12, 13, 14, 15, 16, 17],
        [18, 19, 20, 21, 22, 23],
        [24, 25, 26, 27, 28, 29],
        [30, 31, 32, 33, 34, 35]])

In [4]:
def make_block_diag(n:int, values:List):
    assert n % 2 == 0, "n must be even"
    assert len(values) == n // 2, "values must be of length n/2"

    blocks = [np.full((2, 2), val) for val in values]
    return np.block([
        [blocks[i] if i == j else np.zeros((2, 2)) for j in range(n // 2)]
        for i in range(n // 2)
    ])

In [5]:
make_block_diag(10, [i for i in range(5)])

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 2., 2., 0., 0., 0., 0.],
       [0., 0., 0., 0., 2., 2., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 3., 3., 0., 0.],
       [0., 0., 0., 0., 0., 0., 3., 3., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 4., 4.],
       [0., 0., 0., 0., 0., 0., 0., 0., 4., 4.]])

In [6]:
def precompute_theta(head_dim: int, seq_len: int, device: str=None, theta:float=10000.0):
    assert head_dim%2==0
    theta_num=torch.arange(0, head_dim, 2).float()

    theta=1.0/(theta**(theta_num/head_dim)).to(device)
    m=torch.arange(seq_len)
    freqs=torch.outer(m, theta).float()
    
    return freqs

precompute_theta(head_dim=10, seq_len=2)[1]


tensor([1.0000e+00, 1.5849e-01, 2.5119e-02, 3.9811e-03, 6.3096e-04])

In [7]:
np.set_printoptions(linewidth=300, threshold=np.inf)
matrix=make_block_diag(10, precompute_theta(head_dim=10, seq_len=2)[1])
matrix
# for row in make_block_diag(10, precompute_theta(head_dim=10, seq_len=2)[1]):
#     print(row)

array([[1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.58489317e-01, 1.58489317e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.58489317e-01, 1.58489317e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.51188632e-02, 2.51188632e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.51188632e-02, 2.51188632e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.0

In [8]:
2*matrix

array([[2.00000000e+00, 2.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.00000000e+00, 2.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 3.16978633e-01, 3.16978633e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 3.16978633e-01, 3.16978633e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.02377264e-02, 5.02377264e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.02377264e-02, 5.02377264e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.0

In [9]:
import math
import numpy as np
matrix[range(10), range(10)]=np.cos(matrix[range(10), range(10)])
matrix

array([[5.40302306e-01, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 5.40302306e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 9.87466836e-01, 1.58489317e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.58489317e-01, 9.87466836e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.99684538e-01, 2.51188632e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.51188632e-02, 9.99684538e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.0

In [10]:
n_dim=10
a=[e for e in range(0,n_dim,2)]
b=[e for e in range(1,n_dim,2)]
for i in zip(a,b):
    print(i)

(0, 1)
(2, 3)
(4, 5)
(6, 7)
(8, 9)


In [11]:
matrix[a,b]
matrix[a,b]=-np.sin(matrix[a,b])

In [12]:
matrix

array([[ 5.40302306e-01, -8.41470985e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.00000000e+00,  5.40302306e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  9.87466836e-01, -1.57826638e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.58489317e-01,  9.87466836e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  9.99684538e-01, -2.51162218e-02,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  2.51188632e-02,  9.99684538e

In [13]:
n_dim=10
c=[e for e in range(1,n_dim,2)]
d=[e for e in range(0,n_dim,2)]
for value in zip(c,d):
    print(value)

(1, 0)
(3, 2)
(5, 4)
(7, 6)
(9, 8)


In [14]:
matrix[c,d]=np.sin(matrix[c,d])
matrix

array([[ 5.40302306e-01, -8.41470985e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 8.41470985e-01,  5.40302306e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  9.87466836e-01, -1.57826638e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.57826638e-01,  9.87466836e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  9.99684538e-01, -2.51162218e-02,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  2.51162218e-02,  9.99684538e

In [23]:
class myRotaryPositionalEmbedding():
    def __init__(self, theta:float, d_k:int, max_seq_len:int, device:str=None):
        self.theta=theta
        self.d_k=d_k
        self.max_seq_len=max_seq_len
        self.device=device
        self.ks=2*torch.arange(1, d_k//2+1).float()
        self.theta=1.0/(theta**(self.ks/d_k)).to(device)
        self.num_positions=torch.arange(max_seq_len)
        self.freqs=torch.outer(self.num_positions, self.theta).float()
        self.blocks=[torch.tensor(self.make_block_diag(n=d_k, values=self.freqs[i])) for i in range(len(self.freqs))]
        
        self.block_diagonal_matrix=torch.stack(self.blocks, dim=0)
        range_d_k=range(d_k)
        for matrix in self.block_diagonal_matrix:  matrix[range_d_k, range_d_k]=np.cos(matrix[range_d_k, range_d_k])

        a=[e for e in range(0,d_k,2)]
        b=[e for e in range(1,d_k,2)]
        for matrix in self.block_diagonal_matrix: matrix[a,b]=-np.sin(matrix[a,b])

        e=[e for e in range(1,d_k,2)]
        f=[e for e in range(0,d_k,2)]
        for matrix in self.block_diagonal_matrix: matrix[e,f]=np.sin(matrix[e,f])

        c.save_block_diagonal_matrix("printsdebug/block_diag_theta10000_dk20_seqlen10.pt")
    
    
    def save_block_diagonal_matrix(self, filepath: str):
        """
        Save the block diagonal matrix to a file using torch.save
        
        Args:
            filepath: Path where to save the matrix
        """
        torch.save(self.block_diagonal_matrix, filepath)


    def make_block_diag(self, n:int, values:List):
        assert n % 2 == 0, "n must be even"
        assert len(values) == n // 2, "values must be of length n/2"

        blocks = [np.full((2, 2), val) for val in values]
        return np.block([[blocks[i] if i == j else np.zeros((2, 2)) for j in range(n // 2)] for i in range(n // 2)])
       

    def forward(self, x:torch.Tensor, token_positions:torch.Tensor):
        """
        x: shape (batch_size, seq_len, d_k)
        tokens_positions: shape (batch_size, seq_len)
        """
        assert x.shape[-2]==token_positions.shape[-1], "x and tokens_positions must have the same sequence length"
        assert x.shape[-1]==self.d_k, "x must have the same number of columns as d_k"
        assert token_positions.max()<self.max_seq_len, "tokens_positions must be less than max_seq_len"

        rotated_vectors=einsum(x, self.block_diagonal_matrix,  "batch seq_len d_k, seq_len d_k d_k -> batch seq_len d_k")
        return rotated_vectors

In [79]:
torch.set_printoptions(precision=4)
torch.set_printoptions(linewidth=100000)

In [183]:
%reload_ext autoreload
%autoreload 2
from my_rope import myRotaryPositionalEmbedding


In [184]:
batch_size=4
n_queries=12
d_model=64
token_positions=torch.arange(0, n_queries)
torch.manual_seed(4)
in_embeddings=torch.randn(batch_size, n_queries, d_model, dtype=torch.float32)
observed=myRotaryPositionalEmbedding(theta=10000, d_k=d_model, max_seq_len=n_queries, device=None)

hola


In [185]:
import inspect
print(inspect.getsource(myRotaryPositionalEmbedding))

class myRotaryPositionalEmbedding(nn.Module):
    def __init__(self, theta:float, d_k:int, max_seq_len:int, device:str=None):
        super().__init__()
        print('hola')
        self.theta=theta
        self.d_k=d_k
        self.max_seq_len=max_seq_len
        self.device=device
        self.ks=2*torch.arange(0, d_k/2).float()
        self.one_over_theta=1.0/(theta**(self.ks/d_k)).to(device)
        self.num_positions=torch.arange(max_seq_len)
        self.angles=torch.outer(self.num_positions, self.one_over_theta).float()
        self.blocks=[torch.tensor(self.make_block_diag(n=d_k, values=self.angles[i])) for i in range(len(self.angles))]
        
        self.block_diagonal_matrix=torch.stack(self.blocks, dim=0)
        range_d_k=range(d_k)
        for matrix in self.block_diagonal_matrix:  matrix[range_d_k, range_d_k]=np.cos(matrix[range_d_k, range_d_k])

        a=[e for e in range(0,d_k,2)]
        b=[e for e in range(1,d_k,2)]
        for matrix in self.block_diagonal_matri

In [186]:
torch.set_printoptions(precision=4,linewidth=100000)
np.set_printoptions(linewidth=300, threshold=np.inf, precision=4)

# Load and print contents of .npz file
import numpy as np
import os
# Load the .npz file
expected = np.load(r'c:\Users\Andres.DESKTOP-D77KM25\Documents\assignment1-basics\tests\_snapshots\test_rope.npz')
expected=expected['array']
# # Print all the arrays in the file
# print("Arrays in the .npz file:")
# for key in expected.files:
#     print(f"\nArray name: {key}")
#     print(expected[key])


In [187]:
observedforward=observed.forward(x=in_embeddings, token_positions=token_positions)


In [188]:
print(expected.shape)
print(observedforward.shape)

(4, 12, 64)
torch.Size([4, 12, 64])


In [189]:
print(observed.block_diagonal_matrix[1].shape)
print(in_embeddings[0,1].shape)

torch.Size([64, 64])
torch.Size([64])


In [190]:
for batch in range(0,4):
    for tok_pos in range(0,12):
        comparison=np.allclose(expected[batch,tok_pos], observed.block_diagonal_matrix[tok_pos]@in_embeddings[batch,tok_pos], rtol=1e-5, atol=1e-8)
        if comparison:
            print(batch, tok_pos, "OK", expected[batch,tok_pos], "\n", observed.block_diagonal_matrix[tok_pos]@in_embeddings[batch,tok_pos])
        if not comparison:
            print(batch, tok_pos,"NOT OK",	expected[batch,tok_pos], "\n", observed.block_diagonal_matrix[tok_pos]@in_embeddings[batch,tok_pos] )
        print('\n')

0 0 OK [-0.9414  1.2632 -0.1838  0.1505  0.1075 -0.278  -2.6021  0.6245 -0.8684 -0.2051  0.3976  0.6699 -0.0537  0.0467 -1.7671 -2.1205  1.5191 -0.6682  0.0031 -0.1535  1.1396 -0.2302  1.1877  0.7677 -0.7588 -0.1853 -0.8558 -0.2346 -0.4215  0.8488 -0.6776 -0.9445 -0.4815  1.2434  2.3693  0.2829 -0.2345
  1.6892  0.2716 -0.1365 -0.6948 -1.3186 -0.9694  0.6403  0.8201 -0.9151 -2.1437  1.4072 -0.0263  2.7204 -0.5955  0.9871  1.0861  0.061   0.0417  0.6783 -0.8952 -1.0143 -0.2429 -1.5727  1.394  -0.1941  0.0048 -1.3165] 
 tensor([-0.9414,  1.2632, -0.1838,  0.1505,  0.1075, -0.2780, -2.6021,  0.6245, -0.8684, -0.2051,  0.3976,  0.6699, -0.0537,  0.0467, -1.7671, -2.1205,  1.5191, -0.6682,  0.0031, -0.1535,  1.1396, -0.2302,  1.1877,  0.7677, -0.7588, -0.1853, -0.8558, -0.2346, -0.4215,  0.8488, -0.6776, -0.9445, -0.4815,  1.2434,  2.3693,  0.2829, -0.2345,  1.6892,  0.2716, -0.1365, -0.6948, -1.3186, -0.9694,  0.6403,  0.8201, -0.9151, -2.1437,  1.4072, -0.0263,  2.7204, -0.5955,  0.9871, 

In [191]:
# Convert tensors to numpy arrays for element-wise comparison
for batch in range(0,4):
    for tok_pos in range(0,12):
        expected_np = expected[batch,tok_pos]
        observed_np = observed.block_diagonal_matrix[tok_pos]@in_embeddings[batch,tok_pos]
        observed_np=observed_np.numpy()

        # Calculate absolute differences
        abs_diff = np.abs(expected_np - observed_np)

        # Find indices where differences exceed tolerance
        tolerance = 1e-5
        diff_mask = abs_diff > tolerance
        diff_indices = np.where(diff_mask)

        # Print differences
        if len(diff_indices[0]) > 0:
            print(f"Found {len(diff_indices[0])} differences exceeding tolerance {tolerance}:")
            for i in range(len(diff_indices[0])):
                idx = tuple(dim[i] for dim in diff_indices)
                print(f"\nPosition {idx}:")
                print(f"Expected: {expected_np[idx]:.8f}")
                print(f"Observed: {observed_np[idx]:.8f}")
                print(f"Absolute difference: {abs_diff[idx]:.8f}")
        else:
            print("No differences found exceeding tolerance")


No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences found exceeding tolerance
No differences f

In [192]:
batch=1
tok_pos=11
print(observed.block_diagonal_matrix[tok_pos]@in_embeddings[batch,tok_pos])
print(expected[batch,tok_pos])
print(observedforward[batch,tok_pos])

tensor([-0.5721, -0.8464,  1.7298, -0.1258, -0.4639, -0.1365,  0.8258, -0.8532,  0.2417,  0.4189,  0.4127, -1.1398,  0.0350,  0.9064,  0.3114, -1.0474, -0.3789,  1.2010, -1.8642,  0.0030, -1.2934, -0.2406, -0.6246, -0.1201, -0.3377, -0.1415, -2.1240, -1.8255, -0.1678, -0.5337,  0.0711,  0.5797,  0.3517,  0.3302,  1.7418, -0.5356, -1.9819, -0.7246, -1.6991,  1.1723, -0.1447,  1.7485,  0.3765,  1.1016,  0.4646, -0.9489,  0.1967, -1.5416,  0.8429,  0.2660, -0.9534, -1.1004,  0.1667,  0.3337,  0.4235, -0.4065, -0.0131,  1.1603, -0.0853, -0.6048, -0.8750, -0.1789, -0.9855,  0.5126])
[-0.5721 -0.8464  1.7298 -0.1258 -0.4639 -0.1365  0.8258 -0.8532  0.2417  0.4189  0.4127 -1.1398  0.035   0.9064  0.3114 -1.0474 -0.3789  1.201  -1.8642  0.003  -1.2934 -0.2406 -0.6246 -0.1201 -0.3377 -0.1415 -2.124  -1.8255 -0.1678 -0.5337  0.0711  0.5797  0.3517  0.3302  1.7418 -0.5356 -1.9819
 -0.7246 -1.6991  1.1723 -0.1447  1.7485  0.3765  1.1016  0.4646 -0.9489  0.1967 -1.5416  0.8429  0.266  -0.9534 -1.10

In [193]:
# Convert torch tensor to numpy array for comparison
observed_np = observedforward.numpy()

# Compare all elements
comparison = np.allclose(expected, observed_np, rtol=1e-5, atol=1e-8)
print("\nAll elements match within tolerance:", comparison)

# If they don't match, print where they differ
if not comparison:
    # Find indices where elements differ beyond tolerance
    diff_mask = ~np.isclose(expected, observed_np, rtol=1e-5, atol=1e-8)
    diff_indices = np.where(diff_mask)
    
    print("\nDifferences found at:")
    for idx in zip(*diff_indices):
        print(f"Index {idx}:")
        print(f"Expected: {expected[idx]}")
        print(f"Observed: {observed_np[idx]}")
        print(f"Absolute difference: {abs(expected[idx] - observed_np[idx])}\n")



All elements match within tolerance: False

Differences found at:
Index (np.int64(1), np.int64(11), np.int64(19)):
Expected: 0.0030042529106140137
Observed: 0.003004312515258789
Absolute difference: 5.960464477539063e-08



In [196]:
batch=1
tok_pos=11
pos=19  
torch.set_printoptions(precision=8,linewidth=100000)
# np.set_printoptions(linewidth=300, threshold=np.inf, precision=4)
print(expected[batch,tok_pos, pos])
print(observedforward[batch,tok_pos, pos])

0.003004253
tensor(0.00300431)
