In [15]:
import torch
from torch.nn import functional as F

In [16]:
#
#  sum on the first dim, not keeping dimensionality
#

aa = torch.tensor(range(27)).reshape(3,3,3)
ss = torch.zeros(3,3)
print(aa.numpy(),'\n')
for j in range(3):
    for k in range(3):
        ss[j,k] = torch.sum(aa[:, j, k])
print(f"manually compute sum() on the first dimension:\n{ss.numpy()}\n")
print(f"torch.sum(,0):\n{torch.sum(aa, 0).numpy()}")


[[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]]

 [[ 9 10 11]
  [12 13 14]
  [15 16 17]]

 [[18 19 20]
  [21 22 23]
  [24 25 26]]] 

manually compute sum() on the first dimension:
[[27. 30. 33.]
 [36. 39. 42.]
 [45. 48. 51.]]

torch.sum(,0):
[[27 30 33]
 [36 39 42]
 [45 48 51]]


In [17]:
#
#  sum on the second dim, keeping dimensionality
#

aa = torch.tensor(range(27)).reshape(3,3,3)
ss = torch.zeros(3,3).reshape(3,1,3)
print(aa.numpy(),'\n')
for i in range(3):
    for k in range(3):
        ss[i,0,k] = torch.sum(aa[i,:, k])
print(f"manually compute sum() on the first dimension:\n{ss.numpy()}\n")
print(f"torch.sum(,1):\n{torch.sum(aa, 1, keepdim=True).numpy()}")

[[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]]

 [[ 9 10 11]
  [12 13 14]
  [15 16 17]]

 [[18 19 20]
  [21 22 23]
  [24 25 26]]] 

manually compute sum() on the first dimension:
[[[ 9. 12. 15.]]

 [[36. 39. 42.]]

 [[63. 66. 69.]]]

torch.sum(,1):
[[[ 9 12 15]]

 [[36 39 42]]

 [[63 66 69]]]


In [18]:
torch.manual_seed(42)

a = torch.tril(torch.ones(3, 3))
print(f"a=\n{a.numpy()}\n")
print(f"torch.sum(a, 1, keepdim=True).shape: {torch.sum(a, 1, keepdim=True).shape}")
print(f"torch.sum(a, 1, keepdim=True:\n{torch.sum(a, 1, keepdim=True)}")
print("This is sum over rows as expected\n")

a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
[[1. 0. 0.]
 [1. 1. 0.]
 [1. 1. 1.]]

torch.sum(a, 1, keepdim=True).shape: torch.Size([3, 1])
torch.sum(a, 1, keepdim=True:
tensor([[1.],
        [2.],
        [3.]])
This is sum over rows as expected

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [19]:
B,T,C = 4,4,4

wei = torch.tril(torch.ones(T,T))
wei_keepdim = wei/torch.sum(wei, dim=1, keepdim=True)
wei_no_keepdim = wei/torch.sum(wei, dim=1)

print("IMPORTANT: broadcasting at work!")
print(wei_keepdim, '\n')
div = torch.tensor([[1,1,1,1], [2,2,2,2], [3,3,3,3], [4,4,4,4]])
print(f"SAME as if div = \n{div}\n")
print(f"wei/div =\n{wei/div}\n")
div1=torch.tensor([[1,2,3,4]]).reshape(4,1)
print(f"SAME result if div1 = {div1}\n")
print(f"wei/div1 = {wei/div1}\n")
print(wei/div, '\n')
print(f"COMPARE: {wei_no_keepdim}\n")

print(f"torch.sum(wei, dim=1, keepdim=True) =\n{torch.sum(wei, dim=1, keepdim=True)}\n")
print(f"torch.sum(wei, dim=1, keepdim=True).shape = {torch.sum(wei, dim=1, keepdim=True).shape}\n")

print(f"torch.sum(wei, dim=1, keepdim=False) =\n{torch.sum(wei, dim=1, keepdim=False)}\n")
print(f"torch.sum(wei, dim=1, keepdim=False).shape = {torch.sum(wei, dim=1, keepdim=False).shape}\n")

IMPORTANT: broadcasting at work!
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]]) 

SAME as if div = 
tensor([[1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4]])

wei/div =
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]])

SAME result if div1 = tensor([[1],
        [2],
        [3],
        [4]])

wei/div1 = tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]])

tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]]) 

COMPARE: tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [1.0000, 0.5000, 0.00

In [20]:
div = torch.tensor([[1,1,1,1], [2,2,2,2], [3,3,3,3], [4,4,4,4]])
print(wei/div)

tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]])


In [21]:
import numpy as np
from numpy import matrix

In [22]:
a=np.matrix([[1, 2], [3, 4]])
b=np.matrix([[1, 2], [3, 4]])

print(f"\nThis is the actual matrix multiplication: a*b = \n{a*b}\n")


This is the actual matrix multiplication: a*b = 
[[ 7 10]
 [15 22]]



In [23]:
a = torch.tensor([[1, 2], [3, 4]])
b = torch.tensor([[1, 2], [3, 4]])
print(f"This is element-wise multiplication a.b =\n{(a*b).numpy()}\n")

This is element-wise multiplication a.b =
[[ 1  4]
 [ 9 16]]



In [24]:
a = torch.tensor([[1, 2], [3, 4]])
b = torch.tensor([[1, 2], [3, 4]])
print(f"This is Pytorch matrix multiplication a@b =\n{(a@b).numpy()}\n")

This is Pytorch matrix multiplication a@b =
[[ 7 10]
 [15 22]]



In [25]:
B,T,C = 4,3,3

x = torch.randn(B,T,C)
print(f"x.shape = {x.shape}")

wei = torch.tril(torch.ones(T,T))
wei = wei/torch.sum(wei, 1, keepdim=True)
print(f"wei.shape = {wei.shape}")

x.shape = torch.Size([4, 3, 3])
wei.shape = torch.Size([3, 3])


In [26]:
br = torch.tensor([])
w = torch.unsqueeze(wei, dim=0)
for b in range(B):
    br = torch.cat((br,w),0)
print(f"broadcasted on the first argument wei shape: {br.shape}")

broadcasted on the first argument wei shape: torch.Size([4, 3, 3])


In [27]:
w1=wei @ x
w1.shape

torch.Size([4, 3, 3])

In [28]:
w2=w @ x
w2.shape

torch.Size([4, 3, 3])

In [29]:
print(f"FLOATING ERROR .... \nw1 - w2 =\n{w1 - w2}")

FLOATING ERROR .... 
w1 - w2 =
tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 1.1921e-07, 1.3039e-08]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00]]])


In [30]:
print(f"FLOATING ERROR .... \nw1 == w2:\n{w1 == w2}\n")
print(f"But they are close: {torch.allclose(w1,w2)}")

FLOATING ERROR .... 
w1 == w2:
tensor([[[ True,  True,  True],
         [ True,  True,  True],
         [ True, False, False]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]]])

But they are close: True


In [31]:
x = torch.randint(100, (B,T,C))
wei = torch.randint(100, (T,T))
w = torch.unsqueeze(wei,0)
print(f"Works just fine with integer-valued tensors:\ntorch.equal(w @ x, wei @ x): {torch.equal(w @ x, wei @ x)}")

Works just fine with integer-valued tensors:
torch.equal(w @ x, wei @ x): True


In [32]:
#x = torch.tensor(range(12)).reshape(3,4).to(torch.float64)
x = torch.tril(torch.ones(3,4))
print(f"data:\n{x}\n")

print(f"F.softmax(x, dim=0) = \n{F.softmax(x, dim=0)}\n")

res = []
for j in range(x.shape[1]):
    c = x[:,j]
#     print(c)
#     print(F.softmax(c, dim=0).T, '\n')
    res.append(F.softmax(c, dim=0))

r = torch.stack(res, 1)
print(f"Same assembled per column (dim=0):\n{r}")

data:
tensor([[1., 0., 0., 0.],
        [1., 1., 0., 0.],
        [1., 1., 1., 0.]])

F.softmax(x, dim=0) = 
tensor([[0.3333, 0.1554, 0.2119, 0.3333],
        [0.3333, 0.4223, 0.2119, 0.3333],
        [0.3333, 0.4223, 0.5761, 0.3333]])

Same assembled per column (dim=0):
tensor([[0.3333, 0.1554, 0.2119, 0.3333],
        [0.3333, 0.4223, 0.2119, 0.3333],
        [0.3333, 0.4223, 0.5761, 0.3333]])


In [33]:
#x = torch.tensor(range(12)).reshape(3,4).to(torch.float64)
x = torch.tril(torch.ones(3,4))
print(f"data:\n{x}\n")

print(f"F.softmax(x, dim=1) = \n{F.softmax(x, dim=1)}\n")

res = []
for i in range(x.shape[0]):
    c = x[i,:]
#     print(c)
#     print(F.softmax(c, dim=0).T, '\n')
    res.append(F.softmax(c, dim=0))

r = torch.stack(res, 0)
print(f"Same assembled per row (dim=1):\n{r}")

data:
tensor([[1., 0., 0., 0.],
        [1., 1., 0., 0.],
        [1., 1., 1., 0.]])

F.softmax(x, dim=1) = 
tensor([[0.4754, 0.1749, 0.1749, 0.1749],
        [0.3655, 0.3655, 0.1345, 0.1345],
        [0.2969, 0.2969, 0.2969, 0.1092]])

Same assembled per row (dim=1):
tensor([[0.4754, 0.1749, 0.1749, 0.1749],
        [0.3655, 0.3655, 0.1345, 0.1345],
        [0.2969, 0.2969, 0.2969, 0.1092]])


In [34]:
t = torch.tensor(range(9)).reshape(3,3)
print(t)

indices = torch.tensor([0,2])

torch.index_select(t, 0, indices)

tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]])


tensor([[0, 1, 2],
        [6, 7, 8]])

In [35]:
t = torch.tensor(range(27)).reshape(3,3,3)
print(t)

indices = torch.tensor([0,2])

torch.index_select(t, 1, indices)

tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17]],

        [[18, 19, 20],
         [21, 22, 23],
         [24, 25, 26]]])


tensor([[[ 0,  1,  2],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [15, 16, 17]],

        [[18, 19, 20],
         [24, 25, 26]]])

In [36]:
#
# sum: checking the obvious
#

a  = torch.tril(torch.ones(3,3))
a[2,2] = 10
print(f"original:\n{a}\n")


print(a[:,0])
print(a[:,1])
print(a[:,2],'\n')
print(f"sum(..,0):\n{torch.sum(a, 0)}\n\n----------------------\n")

print(f"sum(..,1):\n{torch.sum(a, 1)}\n")


original:
tensor([[ 1.,  0.,  0.],
        [ 1.,  1.,  0.],
        [ 1.,  1., 10.]])

tensor([1., 1., 1.])
tensor([0., 1., 1.])
tensor([ 0.,  0., 10.]) 

sum(..,0):
tensor([ 3.,  2., 10.])

----------------------

sum(..,1):
tensor([ 1.,  2., 12.])



In [37]:
#
# sum: checking the obvious
#

a  = torch.tril(torch.ones(3,3,3))
a[2,2,2] = 10
print(f"original:\n{a}\n")

print(a[:,0,0])
print(a[:,0,1])
print(a[:,0,2])
print(a[:,1,0])
print(a[:,1,1])
print(a[:,2,2])
print(f"sum(..,0):\n{torch.sum(a, 0)}\n\n----------------------\n")

print(f"sum(..,1):\n{torch.sum(a, 1)}\n")
print(f"sum(..,2):\n{torch.sum(a, 2)}\n")

original:
tensor([[[ 1.,  0.,  0.],
         [ 1.,  1.,  0.],
         [ 1.,  1.,  1.]],

        [[ 1.,  0.,  0.],
         [ 1.,  1.,  0.],
         [ 1.,  1.,  1.]],

        [[ 1.,  0.,  0.],
         [ 1.,  1.,  0.],
         [ 1.,  1., 10.]]])

tensor([1., 1., 1.])
tensor([0., 0., 0.])
tensor([0., 0., 0.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([ 1.,  1., 10.])
sum(..,0):
tensor([[ 3.,  0.,  0.],
        [ 3.,  3.,  0.],
        [ 3.,  3., 12.]])

----------------------

sum(..,1):
tensor([[ 3.,  2.,  1.],
        [ 3.,  2.,  1.],
        [ 3.,  2., 10.]])

sum(..,2):
tensor([[ 1.,  2.,  3.],
        [ 1.,  2.,  3.],
        [ 1.,  2., 12.]])



In [38]:
# Diagonal operation:
T=5
wei =torch.tril(torch.ones(T,T), diagonal=-1).to(torch.int16) + \
    torch.tril(torch.ones(T,T), diagonal=-1).mT.to(torch.int16)
print(f"wei =\n{wei}\n")
print(f"t =\n{t}\n")
print('\nFor example:\n')

t = torch.tensor(range(T**2)).reshape(T,T)

print(f"t*wei =\n{t*wei}")

wei =
tensor([[0, 1, 1, 1, 1],
        [1, 0, 1, 1, 1],
        [1, 1, 0, 1, 1],
        [1, 1, 1, 0, 1],
        [1, 1, 1, 1, 0]], dtype=torch.int16)

t =
tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17]],

        [[18, 19, 20],
         [21, 22, 23],
         [24, 25, 26]]])


For example:

t*wei =
tensor([[ 0,  1,  2,  3,  4],
        [ 5,  0,  7,  8,  9],
        [10, 11,  0, 13, 14],
        [15, 16, 17,  0, 19],
        [20, 21, 22, 23,  0]])


In [39]:
#
# perhaps this is faster as it doesn't need to be initialized in comparison with
# torch.zeros((2,2))
#
e = torch.empty((0,))
print(e.shape)
e1 = torch.empty((2,2))
print(e1.shape)
print(e1)

torch.Size([0])
torch.Size([2, 2])
tensor([[-6.1229e-27,  4.5789e-41],
        [ 5.2510e+01,  3.0777e-41]])


In [40]:
#
# Apparently broadcasting to more than 1 dimension works just fine
#    all that's required is that the data can be expanded without
#    copying the data
#

a=torch.ones(    8,1,2)
b=torch.ones(2,4,8,9,1)

(a+b).shape

torch.Size([2, 4, 8, 9, 2])

In [41]:
#
# Not sure what's the advantage of using masked_fill vs just [condition] operation:
#
T=5
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)
wei[tril == 0] = float('-inf')
wei = torch.softmax(wei, dim=1)
print(wei)

print('\nVS\n')
tril = torch.tril(torch.ones(T,T))
wei1 = torch.zeros(T,T)
wei1 = wei.masked_fill(tril == 0, float('-inf'))
wei1 = torch.softmax(wei1, dim=1)
print(wei1)

print(f"\nwei == wei1: {torch.allclose(wei1, wei)}")

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]])

VS

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]])

wei == wei1: True


In [42]:
#
# Tuples are immutable, of course
#    some syntax picularities:
#
a=('a','b')
b=('c')
c=tuple(list(a)+list(b))
print(c)

# somehow things like
c=([a]+[b])
print(c)
# or even
c=(list(a)+list(b))
print(c)
#don't work

# it looks like one has to use 'list' and 'tuple'

('a', 'b', 'c')
[('a', 'b'), 'c']
['a', 'b', 'c']


In [43]:
print(*['a','b'])
print("".join(['a','b']))
print(['a','b'])

a b
ab
['a', 'b']


In [49]:
#
#  torch.sum() again (!!!)
#  ROW normalizaton in particular
#

t = torch.Tensor(range(4)).reshape(2,2)
print(t)
# add an empty column dim converting the result from a 1-d row to a 2-d column:
s1=torch.unsqueeze(torch.sum(t,dim=1), dim=1)
print(f"s1=\n{s1}")
print(f"VS.\ns1 =\n{torch.sum(t,dim=1)}\n")
print(t/s1)

#
#  produces the same result as
#

s1 = torch.sum(t, dim=1, keepdim=True)
print(f"s1=\n{s1}")
print(t/s1)

tensor([[0., 1.],
        [2., 3.]])
s1=
tensor([[1.],
        [5.]])
VS
s1 =
tensor([1., 5.])

tensor([[0.0000, 1.0000],
        [0.4000, 0.6000]])
s1=
tensor([[1.],
        [5.]])
tensor([[0.0000, 1.0000],
        [0.4000, 0.6000]])


In [59]:
#
# this is about broadcasting
#
print(f"t=\n{t}\n")
s1 = torch.sum(t, dim=1, keepdim=False) # keepdim is False by default
print(f"s1=\n{s1}\n")
print('shapes:\n  ', s1.shape)
print(t.shape)

# so tensors align as 
#     [1,2] and
#     [2,2]
# 
#  so after broadcasting 
#  s1 = [[x,y]
#        [x,y]]
#
#  and element-wise division results in 
#      the first column divided by x and second by y:
#
print(f"\nt/s1=\n{t/s1}")

t=
tensor([[0., 1.],
        [2., 3.]])

s1=
tensor([1., 5.])

shapes:
   torch.Size([2])
torch.Size([2, 2])

t/s1=
tensor([[0.0000, 0.2000],
        [2.0000, 0.6000]])
