In [72]:
import torch
from torch.nn import functional as F

from torch import tanh, exp

import numpy as np

In [73]:
#
#  sum on the first dim, not keeping dimensionality
#

aa = torch.tensor(range(27)).reshape(3,3,3)
ss = torch.zeros(3,3)
print(aa.numpy(),'\n')
for j in range(3):
    for k in range(3):
        ss[j,k] = torch.sum(aa[:, j, k])
print(f"manually compute sum() on the first dimension:\n{ss.numpy()}\n")
print(f"torch.sum(,0):\n{torch.sum(aa, 0).numpy()}")


[[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]]

 [[ 9 10 11]
  [12 13 14]
  [15 16 17]]

 [[18 19 20]
  [21 22 23]
  [24 25 26]]] 

manually compute sum() on the first dimension:
[[27. 30. 33.]
 [36. 39. 42.]
 [45. 48. 51.]]

torch.sum(,0):
[[27 30 33]
 [36 39 42]
 [45 48 51]]


In [74]:
#
#  sum on the second dim, keeping dimensionality
#

aa = torch.tensor(range(27)).reshape(3,3,3)
ss = torch.zeros(3,3).reshape(3,1,3)
print(aa.numpy(),'\n')
for i in range(3):
    for k in range(3):
        ss[i,0,k] = torch.sum(aa[i,:, k])
print(f"manually compute sum() on the first dimension:\n{ss.numpy()}\n")
print(f"torch.sum(,1):\n{torch.sum(aa, 1, keepdim=True).numpy()}")

[[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]]

 [[ 9 10 11]
  [12 13 14]
  [15 16 17]]

 [[18 19 20]
  [21 22 23]
  [24 25 26]]] 

manually compute sum() on the first dimension:
[[[ 9. 12. 15.]]

 [[36. 39. 42.]]

 [[63. 66. 69.]]]

torch.sum(,1):
[[[ 9 12 15]]

 [[36 39 42]]

 [[63 66 69]]]


In [75]:
torch.manual_seed(42)

a = torch.tril(torch.ones(3, 3))
print(f"a=\n{a.numpy()}\n")
print(f"torch.sum(a, 1, keepdim=True).shape: {torch.sum(a, 1, keepdim=True).shape}")
print(f"torch.sum(a, 1, keepdim=True:\n{torch.sum(a, 1, keepdim=True)}")
print("This is sum over rows as expected\n")

a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
[[1. 0. 0.]
 [1. 1. 0.]
 [1. 1. 1.]]

torch.sum(a, 1, keepdim=True).shape: torch.Size([3, 1])
torch.sum(a, 1, keepdim=True:
tensor([[1.],
        [2.],
        [3.]])
This is sum over rows as expected

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [76]:
B,T,C = 4,4,4

wei = torch.tril(torch.ones(T,T))
wei_keepdim = wei/torch.sum(wei, dim=1, keepdim=True)
wei_no_keepdim = wei/torch.sum(wei, dim=1)

print("IMPORTANT: broadcasting at work!")
print(wei_keepdim, '\n')
div = torch.tensor([[1,1,1,1], [2,2,2,2], [3,3,3,3], [4,4,4,4]])
print(f"SAME as if div = \n{div}\n")
print(f"wei/div =\n{wei/div}\n")
div1=torch.tensor([[1,2,3,4]]).reshape(4,1)
print(f"SAME result if div1 = {div1}\n")
print(f"wei/div1 = {wei/div1}\n")
print(wei/div, '\n')
print(f"COMPARE: {wei_no_keepdim}\n")

print(f"torch.sum(wei, dim=1, keepdim=True) =\n{torch.sum(wei, dim=1, keepdim=True)}\n")
print(f"torch.sum(wei, dim=1, keepdim=True).shape = {torch.sum(wei, dim=1, keepdim=True).shape}\n")

print(f"torch.sum(wei, dim=1, keepdim=False) =\n{torch.sum(wei, dim=1, keepdim=False)}\n")
print(f"torch.sum(wei, dim=1, keepdim=False).shape = {torch.sum(wei, dim=1, keepdim=False).shape}\n")

IMPORTANT: broadcasting at work!
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]]) 

SAME as if div = 
tensor([[1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4]])

wei/div =
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]])

SAME result if div1 = tensor([[1],
        [2],
        [3],
        [4]])

wei/div1 = tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]])

tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]]) 

COMPARE: tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [1.0000, 0.5000, 0.00

In [77]:
div = torch.tensor([[1,1,1,1], [2,2,2,2], [3,3,3,3], [4,4,4,4]])
print(wei/div)

tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]])


In [78]:
import numpy as np
from numpy import matrix

In [79]:
a=np.matrix([[1, 2], [3, 4]])
b=np.matrix([[1, 2], [3, 4]])

print(f"\nThis is the actual matrix multiplication: a*b = \n{a*b}\n")


This is the actual matrix multiplication: a*b = 
[[ 7 10]
 [15 22]]



In [80]:
a = torch.tensor([[1, 2], [3, 4]])
b = torch.tensor([[1, 2], [3, 4]])
print(f"This is element-wise multiplication a.b =\n{(a*b).numpy()}\n")

This is element-wise multiplication a.b =
[[ 1  4]
 [ 9 16]]



In [81]:
a = torch.tensor([[1, 2], [3, 4]])
b = torch.tensor([[1, 2], [3, 4]])
print(f"This is Pytorch matrix multiplication a@b =\n{(a@b).numpy()}\n")

This is Pytorch matrix multiplication a@b =
[[ 7 10]
 [15 22]]



In [82]:
B,T,C = 4,3,3

x = torch.randn(B,T,C)
print(f"x.shape = {x.shape}")

wei = torch.tril(torch.ones(T,T))
wei = wei/torch.sum(wei, 1, keepdim=True)
print(f"wei.shape = {wei.shape}")

x.shape = torch.Size([4, 3, 3])
wei.shape = torch.Size([3, 3])


In [83]:
br = torch.tensor([])
w = torch.unsqueeze(wei, dim=0)
for b in range(B):
    br = torch.cat((br,w),0)
print(f"broadcasted on the first argument wei shape: {br.shape}")

broadcasted on the first argument wei shape: torch.Size([4, 3, 3])


In [84]:
w1=wei @ x
w1.shape

torch.Size([4, 3, 3])

In [85]:
w2=w @ x
w2.shape

torch.Size([4, 3, 3])

In [86]:
print(f"FLOATING ERROR .... \nw1 - w2 =\n{w1 - w2}")

FLOATING ERROR .... 
w1 - w2 =
tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 1.1921e-07, 1.3039e-08]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00]]])


In [87]:
print(f"FLOATING ERROR .... \nw1 == w2:\n{w1 == w2}\n")
print(f"But they are close: {torch.allclose(w1,w2)}")

FLOATING ERROR .... 
w1 == w2:
tensor([[[ True,  True,  True],
         [ True,  True,  True],
         [ True, False, False]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]]])

But they are close: True


In [88]:
x = torch.randint(100, (B,T,C))
wei = torch.randint(100, (T,T))
w = torch.unsqueeze(wei,0)
print(f"Works just fine with integer-valued tensors:\ntorch.equal(w @ x, wei @ x): {torch.equal(w @ x, wei @ x)}")

Works just fine with integer-valued tensors:
torch.equal(w @ x, wei @ x): True


In [89]:
#x = torch.tensor(range(12)).reshape(3,4).to(torch.float64)
x = torch.tril(torch.ones(3,4))
print(f"data:\n{x}\n")

print(f"F.softmax(x, dim=0) = \n{F.softmax(x, dim=0)}\n")

res = []
for j in range(x.shape[1]):
    c = x[:,j]
#     print(c)
#     print(F.softmax(c, dim=0).T, '\n')
    res.append(F.softmax(c, dim=0))

r = torch.stack(res, 1)
print(f"Same assembled per column (dim=0):\n{r}")

data:
tensor([[1., 0., 0., 0.],
        [1., 1., 0., 0.],
        [1., 1., 1., 0.]])

F.softmax(x, dim=0) = 
tensor([[0.3333, 0.1554, 0.2119, 0.3333],
        [0.3333, 0.4223, 0.2119, 0.3333],
        [0.3333, 0.4223, 0.5761, 0.3333]])

Same assembled per column (dim=0):
tensor([[0.3333, 0.1554, 0.2119, 0.3333],
        [0.3333, 0.4223, 0.2119, 0.3333],
        [0.3333, 0.4223, 0.5761, 0.3333]])


In [90]:
#x = torch.tensor(range(12)).reshape(3,4).to(torch.float64)
x = torch.tril(torch.ones(3,4))
print(f"data:\n{x}\n")

print(f"F.softmax(x, dim=1) = \n{F.softmax(x, dim=1)}\n")

res = []
for i in range(x.shape[0]):
    c = x[i,:]
#     print(c)
#     print(F.softmax(c, dim=0).T, '\n')
    res.append(F.softmax(c, dim=0))

r = torch.stack(res, 0)
print(f"Same assembled per row (dim=1):\n{r}")

data:
tensor([[1., 0., 0., 0.],
        [1., 1., 0., 0.],
        [1., 1., 1., 0.]])

F.softmax(x, dim=1) = 
tensor([[0.4754, 0.1749, 0.1749, 0.1749],
        [0.3655, 0.3655, 0.1345, 0.1345],
        [0.2969, 0.2969, 0.2969, 0.1092]])

Same assembled per row (dim=1):
tensor([[0.4754, 0.1749, 0.1749, 0.1749],
        [0.3655, 0.3655, 0.1345, 0.1345],
        [0.2969, 0.2969, 0.2969, 0.1092]])


In [91]:
t = torch.tensor(range(9)).reshape(3,3)
print(t,'\n')

indices = torch.tensor([0,2])

print(torch.index_select(t, 0, indices), '\n')
print(torch.index_select(t, 1, indices))

tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]]) 

tensor([[0, 1, 2],
        [6, 7, 8]]) 

tensor([[0, 2],
        [3, 5],
        [6, 8]])


In [92]:
t = torch.tensor(range(27)).reshape(3,3,3)
print(t)

indices = torch.tensor([0,2])

torch.index_select(t, 1, indices)

tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17]],

        [[18, 19, 20],
         [21, 22, 23],
         [24, 25, 26]]])


tensor([[[ 0,  1,  2],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [15, 16, 17]],

        [[18, 19, 20],
         [24, 25, 26]]])

In [93]:
#
# sum: checking the obvious
#

a  = torch.tril(torch.ones(3,3))
a[2,2] = 10
print(f"original:\n{a}\n")


print(a[:,0])
print(a[:,1])
print(a[:,2],'\n')
print(f"sum(..,0):\n{torch.sum(a, 0)}\n\n----------------------\n")

print(f"sum(..,1):\n{torch.sum(a, 1)}\n")


original:
tensor([[ 1.,  0.,  0.],
        [ 1.,  1.,  0.],
        [ 1.,  1., 10.]])

tensor([1., 1., 1.])
tensor([0., 1., 1.])
tensor([ 0.,  0., 10.]) 

sum(..,0):
tensor([ 3.,  2., 10.])

----------------------

sum(..,1):
tensor([ 1.,  2., 12.])



In [94]:
#
# sum: checking the obvious
#

a  = torch.tril(torch.ones(3,3,3))
a[2,2,2] = 10
print(f"original:\n{a}\n")

print(a[:,0,0])
print(a[:,0,1])
print(a[:,0,2])
print(a[:,1,0])
print(a[:,1,1])
print(a[:,2,2])
print(f"sum(..,0):\n{torch.sum(a, 0)}\n\n----------------------\n")

print(f"sum(..,1):\n{torch.sum(a, 1)}\n")
print(f"sum(..,2):\n{torch.sum(a, 2)}\n")

original:
tensor([[[ 1.,  0.,  0.],
         [ 1.,  1.,  0.],
         [ 1.,  1.,  1.]],

        [[ 1.,  0.,  0.],
         [ 1.,  1.,  0.],
         [ 1.,  1.,  1.]],

        [[ 1.,  0.,  0.],
         [ 1.,  1.,  0.],
         [ 1.,  1., 10.]]])

tensor([1., 1., 1.])
tensor([0., 0., 0.])
tensor([0., 0., 0.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([ 1.,  1., 10.])
sum(..,0):
tensor([[ 3.,  0.,  0.],
        [ 3.,  3.,  0.],
        [ 3.,  3., 12.]])

----------------------

sum(..,1):
tensor([[ 3.,  2.,  1.],
        [ 3.,  2.,  1.],
        [ 3.,  2., 10.]])

sum(..,2):
tensor([[ 1.,  2.,  3.],
        [ 1.,  2.,  3.],
        [ 1.,  2., 12.]])



In [95]:
# Diagonal operation:
T=5
wei =torch.tril(torch.ones(T,T), diagonal=-1).to(torch.int16) + \
    torch.tril(torch.ones(T,T), diagonal=-1).mT.to(torch.int16)
print(f"wei =\n{wei}\n")
print(f"t =\n{t}\n")
print('\nFor example:\n')

t = torch.tensor(range(T**2)).reshape(T,T)

print(f"t*wei =\n{t*wei}")

wei =
tensor([[0, 1, 1, 1, 1],
        [1, 0, 1, 1, 1],
        [1, 1, 0, 1, 1],
        [1, 1, 1, 0, 1],
        [1, 1, 1, 1, 0]], dtype=torch.int16)

t =
tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17]],

        [[18, 19, 20],
         [21, 22, 23],
         [24, 25, 26]]])


For example:

t*wei =
tensor([[ 0,  1,  2,  3,  4],
        [ 5,  0,  7,  8,  9],
        [10, 11,  0, 13, 14],
        [15, 16, 17,  0, 19],
        [20, 21, 22, 23,  0]])


In [96]:
#
# perhaps this is faster as it doesn't need to be initialized in comparison with
# torch.zeros((2,2))
#
e = torch.empty((0,))
print(e.shape)
e1 = torch.empty((2,2))
print(e1.shape)
print(e1)

torch.Size([0])
torch.Size([2, 2])
tensor([[5.6520e-02, 3.0730e-41],
        [5.7994e-02, 3.0730e-41]])


In [97]:
#
# Apparently broadcasting to more than 1 dimension works just fine
#    all that's required is that the data can be expanded without
#    copying the data
#

a=torch.ones(    8,1,2)
b=torch.ones(2,4,8,9,1)

(a+b).shape

torch.Size([2, 4, 8, 9, 2])

In [98]:
#
# Not sure what's the advantage of using masked_fill vs just [condition] operation:
#
T=5
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)
wei[tril == 0] = float('-inf')
wei = torch.softmax(wei, dim=1)
print(wei)

print('\nVS\n')
tril = torch.tril(torch.ones(T,T))
wei1 = torch.zeros(T,T)
wei1 = wei.masked_fill(tril == 0, float('-inf'))
wei1 = torch.softmax(wei1, dim=1)
print(wei1)

print(f"\nwei == wei1: {torch.allclose(wei1, wei)}")

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]])

VS

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]])

wei == wei1: True


In [99]:
#
#  torch.sum() again (!!!)
#  ROW normalizaton in particular
#

t = torch.Tensor(range(4)).reshape(2,2)
print(f'\nt=\n{t}\n')
print('\n==================================\n')

# add an empty column dim converting the result from a 1-d row to a 2-d column:
s1=torch.unsqueeze(torch.sum(t,dim=1), dim=1)
print(f"For unsqueeze s{s1.shape=}\n")
print(f"s1=\n{s1}\n")
print('\n==================================\n')

s1=torch.sum(t,dim=1)
print(f"For keepdim=False, {s1.shape=}\n")
print(f"s1 =\n{s1}\n")
print(f"t/s1 =\n{t/s1}\n")

#
#  produces the same result as
#
print('\n==================================\n')
s1 = torch.sum(t, dim=1, keepdim=True)
print(f"For keepdim=True, {s1.shape=}\n")
print(f"s1=\n{s1}\n")
print(f"t/s1 =\n{t/s1}")


t=
tensor([[0., 1.],
        [2., 3.]])



For unsqueeze ss1.shape=torch.Size([2, 1])

s1=
tensor([[1.],
        [5.]])



For keepdim=False, s1.shape=torch.Size([2])

s1 =
tensor([1., 5.])

t/s1 =
tensor([[0.0000, 0.2000],
        [2.0000, 0.6000]])



For keepdim=True, s1.shape=torch.Size([2, 1])

s1=
tensor([[1.],
        [5.]])

t/s1 =
tensor([[0.0000, 1.0000],
        [0.4000, 0.6000]])


In [100]:
#
# this is about broadcasting
#
print(f"t=\n{t}\n")
s1 = torch.sum(t, dim=1, keepdim=False) # keepdim is False by default
print(f"s1=\n{s1}\n")
print('shapes:\n  ', s1.shape)
print(t.shape)
#
# so tensors align as 
#     [2,1] and
#     [2,2]
# 
#  so after broadcasting 
#
#  if s1 = [[x],
#           [y]]
#
#  it actually can be thought of as:
#
#  s1 = [[x,x]
#        [x,y]]
#
#  and element-wise division results in 
#      the row column divided by x and second by y:
#
print(f"\nt/s1=\n{t/s1}")
#

t=
tensor([[0., 1.],
        [2., 3.]])

s1=
tensor([1., 5.])

shapes:
   torch.Size([2])
torch.Size([2, 2])

t/s1=
tensor([[0.0000, 0.2000],
        [2.0000, 0.6000]])


In [101]:
# RN generator
g = torch.Generator().manual_seed(2147483647)
print(f"torch.Generator().manual_seed(2147483647): {torch.randn(3, generator=g)}")

# same as
torch.manual_seed(2147483647)
print(f"torch.manual_seed(2147483647):             {torch.randn(3)}")

# but generator allows for parallel RN generation:
g1 = torch.Generator().manual_seed(2147483647)
g2 = torch.Generator().manual_seed(2147483647)
print(f"\ntorch.randn(3, generator=g1): {torch.randn(3, generator=g1)}")
print(f"torch.randn(3, generator=g2): {torch.randn(3, generator=g2)}")

torch.Generator().manual_seed(2147483647): tensor([-0.9800, -1.6578, -0.0572])
torch.manual_seed(2147483647):             tensor([-0.9800, -1.6578, -0.0572])

torch.randn(3, generator=g1): tensor([-0.9800, -1.6578, -0.0572])
torch.randn(3, generator=g2): tensor([-0.9800, -1.6578, -0.0572])


In [102]:
#
# used in model smoothing:
#

N = torch.Tensor(range(4)).reshape(2,2)
print(N)
print(N+1) #elementwise addition to avoid log(p) becoming -inf

tensor([[0., 1.],
        [2., 3.]])
tensor([[1., 2.],
        [3., 4.]])


In [103]:
x = torch.tensor([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]])
x_ind = [range(3)]
y_ind = [0,2,1]
print(f"x[[range(3)], [0,2,1]] = {x[x_ind, y_ind]}")
x[:, [0,2,1]]

x[[range(3)], [0,2,1]] = tensor([[1, 6, 8]])


tensor([[1, 3, 2],
        [4, 6, 5],
        [7, 9, 8]])

In [104]:
D = torch.tensor(range(16)).reshape(8,2)

print(f"{D[torch.tensor([4*[1,2,3]])]}\n")
print("x-ind and y-ind need to be the same length:")
print(f"D[range(1,3),range(0,2)] = {D[range(1,3),range(0,2)]} <--- \n")
print("This doesn't work: D[range(1,4),range(0,2)]")
#print(f"{D[range(1,4),range(0,2)]}")
print("\nBut this does:")
print(f"D[1:5, :] = \n{D[1:5,:]}\n")
print(f"D[1:5,range(0,2)] = \n{D[1:5,range(0,2)]}\n")
#torch.tensor([range(1,6), range(0,2)])

tensor([[[2, 3],
         [4, 5],
         [6, 7],
         [2, 3],
         [4, 5],
         [6, 7],
         [2, 3],
         [4, 5],
         [6, 7],
         [2, 3],
         [4, 5],
         [6, 7]]])

x-ind and y-ind need to be the same length:
D[range(1,3),range(0,2)] = tensor([2, 5]) <--- 

This doesn't work: D[range(1,4),range(0,2)]

But this does:
D[1:5, :] = 
tensor([[2, 3],
        [4, 5],
        [6, 7],
        [8, 9]])

D[1:5,range(0,2)] = 
tensor([[2, 3],
        [4, 5],
        [6, 7],
        [8, 9]])



In [105]:
print(f"{D.shape = }")
print(f"torch.tensor(([[0,1],[1,0]])).shape = {torch.tensor(([[0,1],[1,0]])).shape}")
print("\n========================================\n")

print(f"D[[[0,1],[1,0]]].shape = {D[[[0,1],[1,0]]].shape}")
print(f"D[torch.tensor(([[0,1],[1,0]]))].shape = {D[torch.tensor(([0,1],[1,0]))].shape}")
print(f"D[torch.tensor([3*[0,1]])].shape = {D[torch.tensor([3*[0,1]])].shape}")
print(f"D[torch.tensor([3*[[0,1]]])].shape = {D[torch.tensor([3*[[0,1]]])].shape}")
print(f"D[torch.tensor([3*[0,1]])].shape = {D[torch.tensor([3*[0,1]])].shape}")
print("\n========================================\n")

print(f"D[torch.tensor(([[0,1],[1,0]]))] =\n{D[torch.tensor(([0,1],[1,0]))]}\n")
print(f"D[torch.tensor([3*[0,1]])] =\n{D[torch.tensor([3*[0,1]])]}\n")

print("\n=====================\n")
print(f"D[torch.tensor([3*[[0,1]]])] =\n{D[torch.tensor([3*[[0,1]]])]}\n")
print(f"{[3*[[0,1]]] = }")
print("\n=====================\n")

print(f"D[torch.tensor([3*[0,1]])] =\n{D[torch.tensor([3*[0,1]])]}\n")
print("\n========================================\n")

print(f"torch.tensor(([[0,1],[1,0]])) =\n{torch.tensor(([0,1],[1,0]))}\n")
print(f"D[torch.tensor(([[0,1],[1,0]]))] =\n{D[torch.tensor(([0,1],[1,0]))]}\n")
print(f"torch.tensor(([[0,1])) =\n{torch.tensor(([0,1]))}\n")
#print(f"torch.tensor(([[0,1])).shape =\n{torch.tensor(([0,1])).shape}\n")
print(f"D[torch.tensor(([[0,1]))] =\n{D[torch.tensor(([0,1]))]}\n")
print("\n========================================\n")


print(f"D[[[0,1],[1,0]]] =\n{D[[[0,1],[1,0]]]}\n")
print(f"D[[[0,1]]] =\n{D[[[0,1]]]}\n")
print(f"D[[0,1]] =\n{D[[0,1]]}\n")
print(f"D[0,1] = {D[0,1]}")

D.shape = torch.Size([8, 2])
torch.tensor(([[0,1],[1,0]])).shape = torch.Size([2, 2])


D[[[0,1],[1,0]]].shape = torch.Size([2])
D[torch.tensor(([[0,1],[1,0]]))].shape = torch.Size([2, 2, 2])
D[torch.tensor([3*[0,1]])].shape = torch.Size([1, 6, 2])
D[torch.tensor([3*[[0,1]]])].shape = torch.Size([1, 3, 2, 2])
D[torch.tensor([3*[0,1]])].shape = torch.Size([1, 6, 2])


D[torch.tensor(([[0,1],[1,0]]))] =
tensor([[[0, 1],
         [2, 3]],

        [[2, 3],
         [0, 1]]])

D[torch.tensor([3*[0,1]])] =
tensor([[[0, 1],
         [2, 3],
         [0, 1],
         [2, 3],
         [0, 1],
         [2, 3]]])



D[torch.tensor([3*[[0,1]]])] =
tensor([[[[0, 1],
          [2, 3]],

         [[0, 1],
          [2, 3]],

         [[0, 1],
          [2, 3]]]])

[3*[[0,1]]] = [[[0, 1], [0, 1], [0, 1]]]


D[torch.tensor([3*[0,1]])] =
tensor([[[0, 1],
         [2, 3],
         [0, 1],
         [2, 3],
         [0, 1],
         [2, 3]]])



torch.tensor(([[0,1],[1,0]])) =
tensor([[0, 1],
        [1, 

In [106]:
DDD = torch.tensor(range(27)).reshape(3,3,3)
print(DDD, '\n')
print(DDD[1,1,1])
print(DDD[1])
print(DDD[1,1,])
print(DDD[1,1],'\n')
print(DDD[1,1,:])
print(DDD[:,1,1])


tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17]],

        [[18, 19, 20],
         [21, 22, 23],
         [24, 25, 26]]]) 

tensor(13)
tensor([[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]])
tensor([12, 13, 14])
tensor([12, 13, 14]) 

tensor([12, 13, 14])
tensor([ 4, 13, 22])


In [107]:
D1 = D.view(2,-1)
print(D1)
ind = torch.tensor(range(3))
print(f"{D1[:,ind] = }")
print(f"{D[ind] = }")
print(f"{D[range(3)] = }")

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7],
        [ 8,  9, 10, 11, 12, 13, 14, 15]])
D1[:,ind] = tensor([[ 0,  1,  2],
        [ 8,  9, 10]])
D[ind] = tensor([[0, 1],
        [2, 3],
        [4, 5]])
D[range(3)] = tensor([[0, 1],
        [2, 3],
        [4, 5]])


In [108]:
C=torch.tensor(range(54)).reshape(27,2)
X=torch.tensor(range(300)).reshape(100,3)

print(f"\n{X.shape = }")
print(f"{C.shape = }")

print(f"{X[0].shape = }\n\n")
print(f"{X[0] = }\n")
print("C[C[:,0]<0,0] = \n{C[C[:,0]<0,0]}\n")
print(f"C[X[0]] =\n{C[X[0]]}")
print(f"C[[[1,2,3,4], [1]]] =\n{C[[[1,2,3,4], [1]]]}\n")
print(f"C[[[1,2,3,4]]] =\n{C[[[1,2,3,4]]]}\n")
print(f"C[[range(1,5)], [range(1,2)]] = \n{C[[range(1,5)], [range(1,2)]]}\n")
print(f"C[1:5, range(1,2)] = \n{C[1:5,range(1,2)]}\n")
print(f"C[1:5, [range(1,2)]] = \n{C[1:5, [range(1,2)]]}\n")
print(f"C[1:5, 1:2] = \n{C[1:5, 1:2]}\n")

print(f"{C[torch.tensor([[1,2,3],[4,5,6]])] = }")
print(f"{C[X[1,2]] = }")


X.shape = torch.Size([100, 3])
C.shape = torch.Size([27, 2])
X[0].shape = torch.Size([3])


X[0] = tensor([0, 1, 2])

C[C[:,0]<0,0] = 
{C[C[:,0]<0,0]}

C[X[0]] =
tensor([[0, 1],
        [2, 3],
        [4, 5]])
C[[[1,2,3,4], [1]]] =
tensor([3, 5, 7, 9])

C[[[1,2,3,4]]] =
tensor([[2, 3],
        [4, 5],
        [6, 7],
        [8, 9]])

C[[range(1,5)], [range(1,2)]] = 
tensor([[3, 5, 7, 9]])

C[1:5, range(1,2)] = 
tensor([[3],
        [5],
        [7],
        [9]])

C[1:5, [range(1,2)]] = 
tensor([[[3]],

        [[5]],

        [[7]],

        [[9]]])

C[1:5, 1:2] = 
tensor([[3],
        [5],
        [7],
        [9]])

C[torch.tensor([[1,2,3],[4,5,6]])] = tensor([[[ 2,  3],
         [ 4,  5],
         [ 6,  7]],

        [[ 8,  9],
         [10, 11],
         [12, 13]]])
C[X[1,2]] = tensor([10, 11])


In [109]:
print(f"{X[[1,2,3],:] = }\n")
print(f"{X[[1,2,3]] = }\n")
print(f"{X[torch.tensor([1,2,3]),:] = }")


print("\n====================================\n")
print(f"\n{X[[1,2,3],0:2] = }\n")
print(f"{X[torch.tensor([1,2,3]),0:2] = }\n")

# not clear, this wants broadcasting:
# --> IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [3], [2]
#print(f"{X[torch.tensor([1,2,3]),torch.tensor(range(2))] = }")

print(f"{torch.tensor(range(2)).shape = }")

X[[1,2,3],:] = tensor([[ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11]])

X[[1,2,3]] = tensor([[ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11]])

X[torch.tensor([1,2,3]),:] = tensor([[ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11]])



X[[1,2,3],0:2] = tensor([[ 3,  4],
        [ 6,  7],
        [ 9, 10]])

X[torch.tensor([1,2,3]),0:2] = tensor([[ 3,  4],
        [ 6,  7],
        [ 9, 10]])

torch.tensor(range(2)).shape = torch.Size([2])


In [110]:
X = torch.tensor(range(9)).reshape(3,3).float()
O = torch.ones(3,3)
O_str = torch.ones(3)
print(f"{X = }")
print(f"{O = }")
print(f"{O_str = }\n")

X = tensor([[0., 1., 2.],
        [3., 4., 5.],
        [6., 7., 8.]])
O = tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
O_str = tensor([1., 1., 1.])



In [111]:
print(f"{X+O = }")
print(f"{X+O_str = }")
print(f"{torch.equal(X + O, X + O_str) = }")
print(f"{X + torch.ones(3,3) = }")
print(f"{torch.equal(X+O, X + torch.ones(3,3)) = }")

X+O = tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])
X+O_str = tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])
torch.equal(X + O, X + O_str) = True
X + torch.ones(3,3) = tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])
torch.equal(X+O, X + torch.ones(3,3)) = True


In [112]:
#
# this works but looks bad:
#
X1 = torch.stack(*[[*torch.unbind(X, dim=1)] + [torch.ones(3)]], dim=1)
O1 = torch.stack(*[[*torch.unbind(torch.eye(3), dim=1)] + [torch.ones(3)]], dim=0)
print(f"{X1 = }")
print(f"{O1 = }")

X1 = tensor([[0., 1., 2., 1.],
        [3., 4., 5., 1.],
        [6., 7., 8., 1.]])
O1 = tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 1., 1.]])


In [113]:
#
# a variant with constant padding:
#
E = torch.eye(3)
print(E)
f10 = torch.nn.ConstantPad1d((1,0), 5)
f01 = torch.nn.ConstantPad1d((0,1), 5)
print(f10(E))
print(f01(E))

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])
tensor([[5., 1., 0., 0.],
        [5., 0., 1., 0.],
        [5., 0., 0., 1.]])
tensor([[1., 0., 0., 5.],
        [0., 1., 0., 5.],
        [0., 0., 1., 5.]])


In [114]:
#
# this looks much better, just add a fake dimension to ones making them 1x3 on creation
#
X1 = torch.cat([X, torch.ones(3,1)], dim=1)
print(f"{X1 = }")

X1 = tensor([[0., 1., 2., 1.],
        [3., 4., 5., 1.],
        [6., 7., 8., 1.]])


In [115]:
#
# this also looks much better, just add a fake dimension to ones making them 1x3 using unsqueeze
#
OU = torch.unsqueeze(torch.ones(3), 0)
print(f"{OU.shape = }")
X1 = torch.cat([X, OU], dim=0)
print(f"{X1 = }")

OU = torch.unsqueeze(torch.ones(3), 1)
print(f"{OU.shape = }")
X1 = torch.cat([X, OU], dim=1)
print(f"{X1 = }")

OU.shape = torch.Size([1, 3])
X1 = tensor([[0., 1., 2.],
        [3., 4., 5.],
        [6., 7., 8.],
        [1., 1., 1.]])
OU.shape = torch.Size([3, 1])
X1 = tensor([[0., 1., 2., 1.],
        [3., 4., 5., 1.],
        [6., 7., 8., 1.]])


In [116]:
print(f"X:\n{X.numpy()}\n")
print(f"O:\n{O.numpy()}\n")
print(f"X1:\n{X1.numpy()}\n")
print(f"O1:\n{O1.numpy()}\n")
print(f"X1 @ O1:\n{(X1 @ O1).numpy()}\n")
print(f"X+O:\n{(X+O).numpy()}\n")
print(torch.equal(X1 @ O1, X + O))

X:
[[0. 1. 2.]
 [3. 4. 5.]
 [6. 7. 8.]]

O:
[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]

X1:
[[0. 1. 2. 1.]
 [3. 4. 5. 1.]
 [6. 7. 8. 1.]]

O1:
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 1. 1.]]

X1 @ O1:
[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]

X+O:
[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]

True


In [117]:
R = torch.tensor(range(1,4)).float()
X1 = torch.cat((X, torch.unsqueeze(torch.ones(3), dim=1)), dim=1)
print('X1:\n', X1.numpy(), '\n', sep='')

M = torch.cat((torch.eye(3), torch.unsqueeze(R, dim=0)), dim=0)
print(M.numpy(), '\n')

X1 @ M

M1 = torch.cat((torch.zeros(3,3), torch.unsqueeze(R, dim=0)), dim=0)
print(M1.numpy(), '\n')

X1 @ M1

X1:
[[0. 1. 2. 1.]
 [3. 4. 5. 1.]
 [6. 7. 8. 1.]]

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 2. 3.]] 

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [1. 2. 3.]] 



tensor([[1., 2., 3.],
        [1., 2., 3.],
        [1., 2., 3.]])

In [118]:
torch.unsqueeze(R, 1)

tensor([[1.],
        [2.],
        [3.]])

In [119]:
#
# duplicate a row:
#

print(torch.zeros(3,3)+R)

#
# duplicate a column:
#

print(torch.zeros(3,3) + torch.unsqueeze(R, dim=1))

tensor([[1., 2., 3.],
        [1., 2., 3.],
        [1., 2., 3.]])
tensor([[1., 1., 1.],
        [2., 2., 2.],
        [3., 3., 3.]])


In [120]:
T1 = torch.arange(16).reshape(4,4)
#
#  so much for this: :)
#
T2 = torch.tensor(range(16)).reshape(4,4)
torch.equal(T1,T2)

True

In [121]:
print(T1)
print(T1[range(4),range(4)])

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15]])
tensor([ 0,  5, 10, 15])


In [122]:
T3=torch.arange(27).reshape(3,3,3)
print(T3)
print(T3[range(3), range(3), range(3)])

tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17]],

        [[18, 19, 20],
         [21, 22, 23],
         [24, 25, 26]]])
tensor([ 0, 13, 26])


In [123]:
T2 = torch.tensor(range(16)).reshape(4,4)
print(f"{T2.numpy()}\n")
Center = T2[:, torch.tensor([1,2])][torch.tensor([1,2]),:]
print(f"T2[:, torch.tensor([1,2])][torch.tensor([1,2]),:] =\n{Center.numpy()}\n", sep='')


[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]

T2[:, torch.tensor([1,2])][torch.tensor([1,2]),:] =
[[ 5  6]
 [ 9 10]]



In [124]:
#
# Because of broadcasting, addition isn't commutative in pytorch :) 
#
T2 + torch.arange(4) - T2

tensor([[0, 1, 2, 3],
        [0, 1, 2, 3],
        [0, 1, 2, 3],
        [0, 1, 2, 3]])

In [125]:
#
#  division isn't commutative either:
#
T2F = T2.float()
print(f"{torch.equal(T2F/np.exp(1)*np.exp(1), T2F) = }")
print(f"{torch.allclose(T2F/np.exp(1)*np.exp(1), T2F) = }")

torch.equal(T2F/np.exp(1)*np.exp(1), T2F) = False
torch.allclose(T2F/np.exp(1)*np.exp(1), T2F) = True


In [126]:
#
#   logits seem to be interpreted so that exp(logit) ~ "count", which is 
#          strange as exp(logit) quickly approaches "e"
#
for i in range(10):
    logit = tanh(torch.tensor(i))
    print(exp(logit))

tensor(1.)
tensor(2.1417)
tensor(2.6222)
tensor(2.7049)
tensor(2.7165)
tensor(2.7180)
tensor(2.7182)
tensor(2.7183)
tensor(2.7183)
tensor(2.7183)


In [127]:
logits = torch.ones(10).float()
counts = logits.exp()

print(counts)
print(logits)
loss = F.cross_entropy(logits, logits)
print(f"{loss = }")

tensor([2.7183, 2.7183, 2.7183, 2.7183, 2.7183, 2.7183, 2.7183, 2.7183, 2.7183,
        2.7183])
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
loss = tensor(23.0259)


In [128]:
n_samp = 3

for mult in range(1,10):
    
    # 0-th column is 1, while the rest become more and more negative
    logits = -mult*torch.ones(n_samp,5)
    logits[:,0] = 1
    
    print(f"logits =\n{logits.numpy()}\n", sep='')
    prob = F.softmax(logits, dim=1)
    prob = torch.round(F.softmax(logits, dim=1), decimals = 2)
    print(f"prob = \n{prob.numpy()}\n", sep='')
    g_truth = torch.zeros(n_samp).to(torch.int64)
    print(f"ground truth: {g_truth.numpy()}")
    loss = F.cross_entropy(logits, g_truth)
    print(f"loss = {loss.numpy():.5f}\n\n======================\n")

logits =
[[ 1. -1. -1. -1. -1.]
 [ 1. -1. -1. -1. -1.]
 [ 1. -1. -1. -1. -1.]]

prob = 
[[0.65 0.09 0.09 0.09 0.09]
 [0.65 0.09 0.09 0.09 0.09]
 [0.65 0.09 0.09 0.09 0.09]]

ground truth: [0 0 0]
loss = 0.43265


logits =
[[ 1. -2. -2. -2. -2.]
 [ 1. -2. -2. -2. -2.]
 [ 1. -2. -2. -2. -2.]]

prob = 
[[0.83 0.04 0.04 0.04 0.04]
 [0.83 0.04 0.04 0.04 0.04]
 [0.83 0.04 0.04 0.04 0.04]]

ground truth: [0 0 0]
loss = 0.18161


logits =
[[ 1. -3. -3. -3. -3.]
 [ 1. -3. -3. -3. -3.]
 [ 1. -3. -3. -3. -3.]]

prob = 
[[0.93 0.02 0.02 0.02 0.02]
 [0.93 0.02 0.02 0.02 0.02]
 [0.93 0.02 0.02 0.02 0.02]]

ground truth: [0 0 0]
loss = 0.07070


logits =
[[ 1. -4. -4. -4. -4.]
 [ 1. -4. -4. -4. -4.]
 [ 1. -4. -4. -4. -4.]]

prob = 
[[0.97 0.01 0.01 0.01 0.01]
 [0.97 0.01 0.01 0.01 0.01]
 [0.97 0.01 0.01 0.01 0.01]]

ground truth: [0 0 0]
loss = 0.02659


logits =
[[ 1. -5. -5. -5. -5.]
 [ 1. -5. -5. -5. -5.]
 [ 1. -5. -5. -5. -5.]]

prob = 
[[0.99 0.   0.   0.   0.  ]
 [0.99 0.   0.   0.   0.  ]
 [0.

In [129]:
n_samp = 3

for pluser in range(1,10):
    
    # ones except for col #0 which gradually increases
    logits = torch.ones(n_samp,5)
    logits[:,0] = pluser
    
    print(f"logits =\n{logits.numpy()}\n", sep='')
    prob = F.softmax(logits, dim=1)
    prob = torch.round(F.softmax(logits, dim=1), decimals = 2)
    print(f"prob = \n{prob.numpy()}\n", sep='')
    g_truth = torch.zeros(n_samp).to(torch.int64)
    print(f"ground truth: {g_truth.numpy()}")
    loss = F.cross_entropy(logits, g_truth)
    print(f"loss = {loss.numpy():.5f}\n\n======================\n")

logits =
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]

prob = 
[[0.2 0.2 0.2 0.2 0.2]
 [0.2 0.2 0.2 0.2 0.2]
 [0.2 0.2 0.2 0.2 0.2]]

ground truth: [0 0 0]
loss = 1.60944


logits =
[[2. 1. 1. 1. 1.]
 [2. 1. 1. 1. 1.]
 [2. 1. 1. 1. 1.]]

prob = 
[[0.4  0.15 0.15 0.15 0.15]
 [0.4  0.15 0.15 0.15 0.15]
 [0.4  0.15 0.15 0.15 0.15]]

ground truth: [0 0 0]
loss = 0.90483


logits =
[[3. 1. 1. 1. 1.]
 [3. 1. 1. 1. 1.]
 [3. 1. 1. 1. 1.]]

prob = 
[[0.65 0.09 0.09 0.09 0.09]
 [0.65 0.09 0.09 0.09 0.09]
 [0.65 0.09 0.09 0.09 0.09]]

ground truth: [0 0 0]
loss = 0.43265


logits =
[[4. 1. 1. 1. 1.]
 [4. 1. 1. 1. 1.]
 [4. 1. 1. 1. 1.]]

prob = 
[[0.83 0.04 0.04 0.04 0.04]
 [0.83 0.04 0.04 0.04 0.04]
 [0.83 0.04 0.04 0.04 0.04]]

ground truth: [0 0 0]
loss = 0.18161


logits =
[[5. 1. 1. 1. 1.]
 [5. 1. 1. 1. 1.]
 [5. 1. 1. 1. 1.]]

prob = 
[[0.93 0.02 0.02 0.02 0.02]
 [0.93 0.02 0.02 0.02 0.02]
 [0.93 0.02 0.02 0.02 0.02]]

ground truth: [0 0 0]
loss = 0.07070


logits =
[[6. 1. 1. 1. 1.

In [130]:
n_samp = 5
g_truth = torch.arange(n_samp)

def diag_logits(i):
    logits = i*torch.eye(n_samp) - torch.ones(n_samp)
    loss = F.cross_entropy(logits, g_truth)
    print(f"{loss = }")
    return logits

logits = diag_logits(2)
print('\nfirst logits:\n', logits.numpy(), '\n', sep='')

for mult in range(10,15):
    logits = diag_logits(mult)
print('last logits:\n', logits.numpy(), sep='')

loss = tensor(0.4327)

first logits:
[[ 1. -1. -1. -1. -1.]
 [-1.  1. -1. -1. -1.]
 [-1. -1.  1. -1. -1.]
 [-1. -1. -1.  1. -1.]
 [-1. -1. -1. -1.  1.]]

loss = tensor(0.0002)
loss = tensor(6.6755e-05)
loss = tensor(2.4700e-05)
loss = tensor(9.0599e-06)
loss = tensor(3.3379e-06)
last logits:
[[13. -1. -1. -1. -1.]
 [-1. 13. -1. -1. -1.]
 [-1. -1. 13. -1. -1.]
 [-1. -1. -1. 13. -1.]
 [-1. -1. -1. -1. 13.]]


In [131]:
#
# this gives the same result as adding a scalar to the softmax input doesn't change the outcome
#
n_samp = 5
g_truth = torch.arange(n_samp)

def diag_logits(i):
    logits = i*torch.eye(n_samp) 
    loss = F.cross_entropy(logits, g_truth)
    print(f"{loss = }")
    return logits

logits = diag_logits(2)
print('\nfirst logits:\n', logits.numpy(), '\n', sep='')

for mult in range(10,15):
    logits = diag_logits(mult)
print('last logits:\n', logits.numpy(), sep='')

loss = tensor(0.4327)

first logits:
[[2. 0. 0. 0. 0.]
 [0. 2. 0. 0. 0.]
 [0. 0. 2. 0. 0.]
 [0. 0. 0. 2. 0.]
 [0. 0. 0. 0. 2.]]

loss = tensor(0.0002)
loss = tensor(6.6755e-05)
loss = tensor(2.4700e-05)
loss = tensor(9.0599e-06)
loss = tensor(3.3379e-06)
last logits:
[[14.  0.  0.  0.  0.]
 [ 0. 14.  0.  0.  0.]
 [ 0.  0. 14.  0.  0.]
 [ 0.  0.  0. 14.  0.]
 [ 0.  0.  0.  0. 14.]]


In [132]:
#
#  batch indices!!!
#
N = 2
k = 3
d = 2

L = torch.arange(N * k * d * d).view(N, 3, 2, 2)
print(L)

tensor([[[[ 0,  1],
          [ 2,  3]],

         [[ 4,  5],
          [ 6,  7]],

         [[ 8,  9],
          [10, 11]]],


        [[[12, 13],
          [14, 15]],

         [[16, 17],
          [18, 19]],

         [[20, 21],
          [22, 23]]]])


In [133]:
index = torch.tensor([0, 1, 0, 0], dtype=torch.long)
batch_index = torch.tensor([0, 0, 1, 1])
print(f"L[batch_index, index]:\n{L[batch_index, index].numpy()}\n")
print(f"L[{batch_index[1]}, {index[1]}]:\n{L[batch_index[1], index[1]].numpy()}")

L[batch_index, index]:
[[[ 0  1]
  [ 2  3]]

 [[ 4  5]
  [ 6  7]]

 [[12 13]
  [14 15]]

 [[12 13]
  [14 15]]]

L[0, 1]:
[[4 5]
 [6 7]]


In [134]:
index = torch.tensor([0, 1, 0, 0]).view(N, -1)
# => tensor([[0, 1],
#            [0, 0]])

# Every batch gets its index and is repeated across dim=1
batch_index = torch.arange(N).view(N, 1).expand_as(index)
print(batch_index)
# => tensor([[0, 0],
#            [1, 1]])
# ALSO:
#
print(torch.arange(N).view(1,N).expand_as(index))
#
# or:
#
print(torch.arange(N).view(2,1).expand_as(torch.arange(8).reshape(2,-1)))

tensor([[0, 0],
        [1, 1]])
tensor([[0, 1],
        [0, 1]])
tensor([[0, 0, 0, 0],
        [1, 1, 1, 1]])


In [135]:
print(f"batch_index:\n{index.numpy()}\n")
print(f"index:\n{batch_index.numpy()}\n=================================\n")

for b,i in zip(batch_index, index):
    print(f"{L[b,i,:,:].numpy()}\n")

print(f"\n=================================\n\n{L[batch_index, index].numpy()}")

batch_index:
[[0 1]
 [0 0]]

index:
[[0 0]
 [1 1]]

[[[0 1]
  [2 3]]

 [[4 5]
  [6 7]]]

[[[12 13]
  [14 15]]

 [[12 13]
  [14 15]]]



[[[[ 0  1]
   [ 2  3]]

  [[ 4  5]
   [ 6  7]]]


 [[[12 13]
   [14 15]]

  [[12 13]
   [14 15]]]]


In [136]:
t1 = torch.arange(9)
t2 = torch.arange(9).reshape(3,3)

# because the dimensions are different:
print(torch.equal(t1, t2))

# views, however are the same (!!)
print(torch.equal(t1.view(3,3), t2))

# this gives size error:
#print(torch.allclose(t1, t2))

False
True


In [137]:
g = torch.Generator().manual_seed(1)
inp = torch.randn(1000, 9, generator=g)
w = torch.randn(9, 200, generator=g)

h = inp @ w

print(f"{inp.std()=}, {inp.mean()=}")
print(f"{inp.view(1,-1).std()=}, {inp.mean()=}")
print(f"{h.std()=}, {h.mean()=}\n")

print(f"{(h - h.mean()).std()=}")
print(f"{h.view(1,-1).std()=}")
print(f"{h.view(-1,1).std()=}")
print(f"{(h - h.mean(dim=1, keepdim=True)).std()=}")
print(f"{(h - h.mean(dim=0, keepdim=True)).std()=}")
print(f"{(h - h.mean(dim=0)).std()=}\n")


print(f"{h.view(-1,1).mean()=}")
print(f"{h.mean(dim=0).mean()=}")
print(f"{h.mean(dim=1).mean()=}")

inp.std()=tensor(1.0037), inp.mean()=tensor(-0.0031)
inp.view(1,-1).std()=tensor(1.0037), inp.mean()=tensor(-0.0031)
h.std()=tensor(3.0720), h.mean()=tensor(0.0054)

(h - h.mean()).std()=tensor(3.0720)
h.view(1,-1).std()=tensor(3.0720)
h.view(-1,1).std()=tensor(3.0720)
(h - h.mean(dim=1, keepdim=True)).std()=tensor(3.0631)
(h - h.mean(dim=0, keepdim=True)).std()=tensor(3.0701)
(h - h.mean(dim=0)).std()=tensor(3.0701)

h.view(-1,1).mean()=tensor(0.0054)
h.mean(dim=0).mean()=tensor(0.0054)
h.mean(dim=1).mean()=tensor(0.0054)


In [140]:
g = torch.Generator().manual_seed(1)
inp = torch.randn(100, 90, generator=g)
w = torch.randn(90, 20, generator=g)

h = inp @ w

print(f"{h.std()}")
print(f"{h.view(1,-1).std()}")
print(f"{h.view(-1,1).std()}")

9.951111793518066
9.951111793518066
9.951111793518066


In [141]:
g = torch.Generator().manual_seed(1)
inp = torch.randn(100, 9, generator=g)
w = torch.randn(9, 200, generator=g)

h = inp @ w

print(f"{inp.std()=}, {inp.mean()=}")
print(f"{h.std()=}, {h.mean()=}\n")

print(f"{(h - h.mean()).std()=}")
print(f"{h.view(1,-1).std()=}")
print(f"{h.view(-1,1).std()=}")
print(f"{(h - h.mean(dim=1, keepdim=True)).std()=}")
print(f"{(h - h.mean(dim=0, keepdim=True)).std()=}")
print(f"{(h - h.mean(dim=0)).std()=}\n")


print(f"{h.view(-1,1).mean()=}")
print(f"{h.mean(dim=0).mean()=}")
print(f"{h.mean(dim=1).mean()=}")

inp.std()=tensor(1.0196), inp.mean()=tensor(0.0216)
h.std()=tensor(3.1137), h.mean()=tensor(0.0078)

(h - h.mean()).std()=tensor(3.1137)
h.view(1,-1).std()=tensor(3.1137)
h.view(-1,1).std()=tensor(3.1137)
(h - h.mean(dim=1, keepdim=True)).std()=tensor(3.1103)
(h - h.mean(dim=0, keepdim=True)).std()=tensor(3.0867)
(h - h.mean(dim=0)).std()=tensor(3.0867)

h.view(-1,1).mean()=tensor(0.0078)
h.mean(dim=0).mean()=tensor(0.0078)
h.mean(dim=1).mean()=tensor(0.0078)


In [142]:
#
# randn(Nsamples, shape) (Gaussian random) 
#     is equivalent to a bunch of Gaussian samples of dim 1xN (N is "size" of shape (product of shape components))
#
g = torch.Generator().manual_seed(1)
inp1 = torch.randn(1000, 9, generator=g)

g = torch.Generator().manual_seed(1)
inp2 = torch.randn(9000, generator=g)

print(torch.allclose(inp1.view(1,-1), inp2))
print(torch.max((inp1.view(1,-1) - inp2).abs()))
torch.equal(inp1.reshape(9000), inp2)

True
tensor(0.)


True