# Linear Algebra

In [2]:
import torch

In [23]:
# Dot product of two vectors: <x, y> = x^T y = sum_i x_i * y_i (a scalar).
# torch.dot only accepts 1-D tensors, so the leading size-1 axis of each
# (1, 4) tensor is dropped before the call.
X = torch.randn(1, 4)
Y = torch.randn(1, 4)
print(X, Y)
torch.dot(X.squeeze(0), Y.squeeze(0))

tensor([[-1.1612, -0.1066,  0.7112, -2.0103]]) tensor([[ 0.7529, -0.9706, -0.0561,  1.1510]])


tensor(-3.1245)

In [None]:
# When both vectors are normalized to unit length, their dot product equals the cosine of the angle between them.
# When the weights are non-negative and sum to 1, the dot product of the weights with a vector of values is a weighted average of those values.

In [30]:
# Matrix-vector product: a (2 x 3) matrix times a length-3 vector.
A = torch.randn(2, 3, dtype=torch.float32)
x = torch.arange(3, dtype=torch.float32)
print(A, x, sep="\n")
y = A @ x  # for a 2-D matrix and a 1-D vector, @ is the matrix-vector product
print(y, y.shape)

# Shapes: A = [2, 3], x = [3]  ----->  y = [2]
# Multiplying by A maps a 3-dimensional vector to a 2-dimensional one.

tensor([[-0.2197, -0.5776,  0.1494],
        [ 0.1942, -0.7180, -1.3790]])
tensor([0., 1., 2.])
tensor([-0.2787, -3.4760]) torch.Size([2])


In [31]:
# Matrix-matrix product: (3 x 3) times (3 x 4) gives a (3 x 4) result.
# The inner dimensions (3 and 3) must agree.
A = torch.randn((3, 3))
B = torch.randn((3, 4))
A.mm(B)

tensor([[ 3.8165,  0.5862,  2.6685, -2.8223],
        [ 0.3464, -0.7907, -0.0780,  1.9494],
        [ 2.4999, -1.1593,  3.1194,  0.7055]])

In [34]:
# A norm measures the magnitude of a vector's components. A norm is never
# negative, and it is zero only for the zero vector. Euclidean distance is an L2 norm.
# The L2 norm of x is the square root of the sum of the squares of its elements.
# The L1 norm of x is the sum of the absolute values of its elements.
a = torch.tensor([2.0, -5.0])
# torch.norm is deprecated; torch.linalg.vector_norm is its documented replacement.
l2_norm = torch.linalg.vector_norm(a)          # ord defaults to 2
l1_norm = torch.linalg.vector_norm(a, ord=1)
print(l2_norm)  # L2 norm
print(l1_norm)  # L1 norm
# The L1 norm is less influenced by outliers than the L2 norm, because it does
# not square the components.
# Both are special cases of the Lp norm: (sum_i |x_i|^p)^(1/p).

tensor(5.3852)
tensor(7.)


In [35]:
# Frobenius norm
# The Frobenius norm of a matrix X ∈ R^(m×n) is the square root of the sum of
# the squares of its elements.
# It satisfies all the properties of vector norms: it behaves as if it were the
# L2 norm of the matrix flattened into a vector.
# A is the matrix bound to that name by an earlier cell.
# torch.norm is deprecated; torch.linalg.matrix_norm computes the Frobenius norm by default.
torch.linalg.matrix_norm(A)

tensor(3.2320)

In [None]:
# Objective functions are often expressed as norms.

# Calculus

Automatic differentiation: Deep learning libraries calculate derivatives automatically. As the code runs, the system builds a computational graph that tracks which data were combined through which operations to produce the output. With that graph, the system can subsequently backpropagate gradients. Backpropagation simply means tracing back through the computational graph and filling in the partial derivative with respect to each parameter.

In [39]:
# Mark x as requiring gradients so that autograd records the operations applied to it.
x = torch.arange(4.0).requires_grad_(True)
# y = 2 * x^T x is a scalar function of x.
y = 2 * x.dot(x)
# x.grad stays None until a backward pass has been run.
print(x, x.grad, y, sep="\n")

tensor([0., 1., 2., 3.], requires_grad=True)
None
tensor(28., grad_fn=<MulBackward0>)


In [40]:
# Calling backward() on y backpropagates through the recorded computation graph
# and stores the gradient of y with respect to each component of x in x.grad.
y.backward()
x.grad
# Since y = 2 * dot(x, x), the gradient is 4 * x, so the displayed tensor
# should equal 4 * x element by element.

tensor([ 0.,  4.,  8., 12.])

In [48]:
# PyTorch accumulates gradients: each backward pass adds onto whatever is already
# stored in x.grad, so the previous values must be cleared before the next pass.
# zero_() resets the gradient tensor in place.
x.grad.zero_()

tensor([0., 0., 0., 0.])

When y is not a scalar, the derivative of a vector y with respect to a vector x is a matrix, and for
higher-order and higher-dimensional y and x the differentiation result is a higher-order tensor. In deep learning,
however, we usually calculate the derivatives of the loss function for each constituent of a batch of training
examples. Here, our intent is not to calculate the differentiation matrix but rather the sum of the partial
derivatives computed individually for each example in the batch.


In [None]:
# Detaching computation: To move some computations outside of computation graph, we detach it.
x.grad.zero_()
y = x * x
u = y.detach() # Detaching y
z = u * x