In [1]:
import torch
import numpy as np
import matplotlib
import pandas

import time

In [2]:
device = torch.device("cpu")
if torch.backends.mps.is_available():
    device = torch.device("mps")

In [3]:
tensor = torch.rand(3, 4)

print(f"Shape of tensor: {tensor.shape}")
print(f"Datatype of tensor: {tensor.dtype}")
print(f"Device tensor is stored on: {tensor.device}")
tensor = tensor.to(device)
print(f"Device tensor is stored on: {tensor.device}")

Shape of tensor: torch.Size([3, 4])
Datatype of tensor: torch.float32
Device tensor is stored on: cpu
Device tensor is stored on: mps:0


In [4]:
n = np.ones(5)
t = torch.from_numpy(n)
t

tensor([1., 1., 1., 1., 1.], dtype=torch.float64)

In [7]:
device = 'mps'

torch.manual_seed(1234)
TENSOR_A_CPU = torch.rand(5000, 50000)
TENSOR_B_CPU = torch.rand(50000, 5000)

torch.manual_seed(1234)
TENSOR_A_MPS = torch.rand(5000, 50000).to(device)
TENSOR_B_MPS = torch.rand(50000, 5000).to(device)

# Warm-up
for _ in range(100):
    torch.matmul(torch.rand(500,500).to(device), torch.rand(500,500).to(device))
    
start_time = time.time()
torch.matmul(TENSOR_A_CPU, TENSOR_B_CPU)
cpu = (time.time() - start_time)
print("CPU : --- %s seconds ---" % cpu)

start_time = time.time()
torch.matmul(TENSOR_A_MPS, TENSOR_B_MPS)
mps = (time.time() - start_time)
print("MPS : --- %s seconds ---" % mps)

print(f"Speedup on MPS :{cpu/mps:.1f}")


CPU : --- 2.8122100830078125 seconds ---
MPS : --- 0.00020503997802734375 seconds ---
Speedup on MPS :13715.4


In [8]:
from torchvision.models import resnet18, ResNet18_Weights
model = resnet18(weights=ResNet18_Weights.DEFAULT)
data = torch.rand(1, 3, 64, 64)
labels = torch.rand(1, 1000)

In [9]:
prediction = model(data) # forward pass

In [10]:
loss = (prediction - labels).sum()
loss.backward() # backward pass

In [11]:
optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

In [12]:
optim.step() #gradient descent

# Differentiation in pytorch

Let’s take a look at how autograd collects gradients. We create two tensors a and b with requires_grad=True. This signals to autograd that every operation on them should be tracked.

In [19]:
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)

We create another tensor `Q` from `a` and `b`.

$$Q = 3a^3 - b^2$$


In [20]:
Q = 3*a**3 - b**2

Let\'s assume `a` and `b` to be parameters of an NN, and `Q` to be the
error. In NN training, we want gradients of the error w.r.t. parameters,
i.e.

$$\frac{\partial Q}{\partial a} = 9a^2$$

$$\frac{\partial Q}{\partial b} = -2b$$

When we call `.backward()` on `Q`, autograd calculates these gradients
and stores them in the respective tensors\' `.grad` attribute.

We need to explicitly pass a `gradient` argument in `Q.backward()`
because it is a vector. `gradient` is a tensor of the same shape as `Q`,
and it represents the gradient of Q w.r.t. itself, i.e.

$$\frac{dQ}{dQ} = 1$$

Equivalently, we can also aggregate Q into a scalar and call backward
implicitly, like `Q.sum().backward()`.


In [22]:
# Q.backward() # will throw "RuntimeError: grad can be implicitly created only for scalar outputs"

# So
external_grad = torch.tensor([1., 1.])
Q.backward(gradient=external_grad)

Gradients are now deposited in `a.grad` and `b.grad`

In [17]:
# check if collected gradients are correct
print(9*a**2 == a.grad)
print(-2*b == b.grad)

tensor([True, True])
tensor([True, True])


In [18]:
external_grad

tensor([1., 1.])

# Some more notes from a youtube series

