In [52]:
import numpy as np , pandas as pd
import torch


In [6]:
# Plain assignment (`y = x`) only binds a second name to the same tensor object,
# whereas an arithmetic expression such as `x * 2` allocates a brand-new tensor.
x = torch.arange(12, dtype=torch.float32)  # defined here so the cell runs on a fresh kernel
y = x
assert y is x  # alias: same object, hence the same id
y = x * 2
assert y is not x  # a new result tensor; x itself is left untouched
print(x,id(x))
print(y,id(y))


tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.]) 4391781536
tensor([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20., 22.]) 4827927136


In [7]:
# clone() copies the values into a new tensor, so the two names refer to
# distinct objects (hence the two different ids printed below).
y = torch.clone(x)
print(id(x), id(y))

4391781536 4827205920


In [9]:
# Sum of the integers 0..9.
x = torch.arange(0, 10)
x.sum()

tensor(45)

In [20]:
# 3x4 grid holding 0..11; keepdim=True leaves the reduced axis in place with size 1.
x = torch.arange(12).reshape(3, 4)
# NOTE: the names describe the *shape* of the result, not what is being added up.
col_sum = torch.sum(x, dim=1, keepdim=True)       # (3, 1) column vector: one total per row
row_sum = torch.sum(x, dim=0, keepdim=True)       # (1, 4) row vector: one total per column
tot_sum = torch.sum(x, dim=(0, 1), keepdim=True)  # (1, 1): total over every element
tot_sum

tensor([[66]])

In [21]:
# Show the axis-0 reduction: a (1, 4) row with one total per column of x.
row_sum

tensor([[12, 15, 18, 21]])

In [22]:
# Show the axis-1 reduction: a (3, 1) column with one total per row of x.
col_sum

tensor([[ 6],
        [22],
        [38]])

In [24]:
# Dot product of two column vectors via matrix multiplication: (1, 10) @ (10, 1) -> (1, 1).
# A dedicated, seeded generator makes the random draws (and therefore the result)
# identical on every run without touching torch's global RNG state.
rng = torch.Generator().manual_seed(0)
x = torch.randint(low=1,high=10,size=(10,1),generator=rng)  # integers in [1, 9]
y = torch.randint(low=1,high=10,size=(10,1),generator=rng)
x.T @ y

tensor([[350]])

In [28]:
# ...which is nicer than torch.dot here: torch.dot rejects the 2-D column
# vectors x and y with a RuntimeError, which is caught and reported below.
dot_failure_note = "torch thinks this as a 2d tensor. so it would not do the dot product on it 🤦"
try:
    torch.dot(x, y)
except RuntimeError:
    print(dot_failure_note)

torch thinks this as a 2d tensor. so it would not do the dot product on it 🤦


In [44]:
# Takeaway: prefer the @ operator for linear algebra.
# Euclidean (L2) norm of the vector [1, 2, ..., 11].
x = torch.arange(1, 12).to(torch.float32)
torch.linalg.vector_norm(x, ord=2)

tensor(22.4944)

In [47]:
# Frobenius norm of a matrix straight from its definition:
# ||X||_F = sqrt(<X, X>) = sqrt(trace(X^H X)), with X^H the conjugate transpose.
x = torch.arange(1, 16)
X = x.view(3, 5)
gram = X.conj().T @ X  # X^H X, a 5x5 matrix
torch.sqrt(torch.trace(gram))

tensor(35.2136)

In [48]:
# Same 3x5 matrix in double precision, this time using the library routine
# (the Frobenius norm is requested explicitly; it is also the default).
X = torch.arange(1, 16, dtype=torch.float64).reshape(3, 5)
torch.linalg.matrix_norm(X, ord="fro")

tensor(35.2136, dtype=torch.float64)

In [53]:
# Automatic differentiation: start from the float vector [0., 1., 2., 3.].
x = torch.arange(0.0, 4.0)
x

tensor([0., 1., 2., 3.])

In [54]:
# Equivalent at creation time: x = torch.arange(4.0, requires_grad=True)
x.requires_grad_()  # the flag argument defaults to True
x.grad  # None for now: nothing has been backpropagated yet

In [55]:
# y = 2 * <x, x> = 2 * sum_i x_i^2, so dy/dx_i = 4 * x_i: the gradient is 4 * x.
y = torch.dot(x, x) * 2
y

tensor(28., grad_fn=<MulBackward0>)

In [56]:
# Backpropagate from the scalar y; this is what fills in x.grad.
y.backward()

In [62]:
# Check the analytic gradient element by element. The builtin all() collapses
# each tensor to a single bool, so `all(a) == all(b)` passes for almost any
# pair of tensors; torch.allclose compares the actual values instead.
assert torch.allclose(x.grad, 4 * x.detach()), "(d/dx 2x'x) = 4x"
x.grad

tensor([ 0.,  4.,  8., 12.])

In [63]:
# Now let’s calculate another function of x and take its gradient. Note that PyTorch does not automatically reset the gradient buffer when we record a new gradient.

In [65]:
# The gradient buffer is not reset automatically, so clear it first.
x.grad.zero_()

# y = sum_i x_i, hence dy/dx_i = 1 for every i: the gradient is a vector of ones.
y = torch.sum(x)
y.backward()
x.grad

tensor([1., 1., 1., 1.])

In [68]:
# Non-scalar outputs:
# backward() on a vector y needs a vector v (the `gradient` argument) and
# computes the vector-Jacobian product v^T (dy/dx) rather than the full Jacobian.

# Reset the accumulated gradient.
x.grad.zero_()

y = x * x  # elementwise square; the output is not reduced to a scalar
# With v = ones this is the gradient of y.sum(): d/dx sum(x^2) = 2x.
# ones_like gives v the same shape, dtype and device as y.
y.backward(gradient=torch.ones_like(y))  # Equivalent to: y.sum().backward()
x.grad

tensor([0., 2., 4., 6.])

In [72]:
# Detaching, part 1: the baseline without detach.
# With u = x^2 and z = u * x = x^3, ordinary backprop gives dz/dx = 3x^2.
# (If u were detached it would act as a constant, and dz/dx would be just u.)

x.grad.zero_()
u = torch.mul(x, x)
z = torch.mul(u, x)
obj = torch.sum(z)  # reduce to a scalar objective so backward() needs no argument

obj.backward()
x.grad  # equals 3x^2

tensor([ 0.,  3., 12., 27.])

In [74]:
# Detaching, part 2: cut u out of the computation graph.
x.grad.zero_()

u = torch.mul(x, x)
y = u.detach()  # same values as u, but autograd treats it as a constant
z = torch.mul(y, x)
obj = torch.sum(z)

obj.backward()
x.grad  # dz/dx = y = x^2, since no gradient flows back through u

tensor([0., 1., 4., 9.])