In [1]:
import torch
import numpy as np


# Report the name of the default CUDA device, but only when PyTorch can reach a GPU.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    print(torch.cuda.get_device_name())

NVIDIA GeForce RTX 2060


# 1. Tensors

## 1.1 Create Tensors

Tensors created from an existing tensor should stay on the same GPU as the tensor they are derived from.

In [2]:
# Use the first GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on machines without CUDA instead of raising.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

np_arr = np.array([[1, 2], [3, 4]])
tensor_a = torch.from_numpy(np_arr).to(device)
# The *_like factories take shape and device from the tensor they are given,
# so tensor_b and tensor_c end up on the same device as tensor_a.
tensor_b = torch.ones_like(tensor_a)
# tensor_b is an integer tensor, so a floating dtype is requested explicitly for rand_like.
tensor_c = torch.rand_like(tensor_b, dtype=torch.float)

tensor_a.device, tensor_b.device, tensor_c.device

(device(type='cuda', index=0),
 device(type='cuda', index=0),
 device(type='cuda', index=0))

By default, newly created tensors are placed on the CPU.

In [3]:
# Factory functions called without a `device` argument: check where the results live.
shape = (2, 3)
tensor_a, tensor_b, tensor_c = torch.rand(shape), torch.ones(shape), torch.zeros(shape)

tensor_a.device, tensor_b.device, tensor_c.device

(device(type='cpu'), device(type='cpu'), device(type='cpu'))

In [4]:
# Display the values of the 2x3 tensor produced by torch.rand in the previous cell.
tensor_a

tensor([[0.5070, 0.7800, 0.3710],
        [0.6524, 0.0812, 0.5744]])

In [5]:
# Inspect the tensor's metadata: its shape, its element type and the device it lives on.
tensor_a.shape, tensor_a.dtype, tensor_a.device

(torch.Size([2, 3]), torch.float32, device(type='cpu'))

In [6]:
# Reinterpret the 2x3 tensor as 1x6; -1 asks PyTorch to infer that dimension.
tensor_a.view(-1, 6) # Similar to reshape, but view never copies: it returns a tensor sharing the same data.

tensor([[0.5070, 0.7800, 0.3710, 0.6524, 0.0812, 0.5744]])

In [7]:
tensor_a.permute(1, 0) # Reorder the dimensions; swapping the two axes of a 2-D tensor is a transpose.

tensor([[0.5070, 0.6524],
        [0.7800, 0.0812],
        [0.3710, 0.5744]])

In [8]:
tensor_a.T # Transpose: for this 2-D tensor the result matches permute(1, 0).

tensor([[0.5070, 0.6524],
        [0.7800, 0.0812],
        [0.3710, 0.5744]])

In [9]:
# Start from the 1x6 view and repeat its single row along dim 0 to get a 3x6 tensor.
tensor_a.view(-1, 6).expand(3, 6) # Broadcasting: the size-1 dimension is expanded to 3.

tensor([[0.5070, 0.7800, 0.3710, 0.6524, 0.0812, 0.5744],
        [0.5070, 0.7800, 0.3710, 0.6524, 0.0812, 0.5744],
        [0.5070, 0.7800, 0.3710, 0.6524, 0.0812, 0.5744]])

In [10]:
# Drop the leading size-1 dimension: (1, 6) -> (6,).
tensor_a.view(-1, 6).squeeze(0)

tensor([0.5070, 0.7800, 0.3710, 0.6524, 0.0812, 0.5744])

In [11]:
tensor_a.view(6).unsqueeze(-1) # Flatten to 6 elements, then append a size-1 dimension: (6,) -> (6, 1).

tensor([[0.5070],
        [0.7800],
        [0.3710],
        [0.6524],
        [0.0812],
        [0.5744]])

In [12]:
# Values to add to the first two columns; broadcast across both rows.
tensor_e = torch.tensor([2, 1])
# expand() alone does not allocate storage for the repeated rows, so clone()
# is required before the result can be written to.
tensor_d = torch.tensor([1, 2, 3]).expand(2, 3).clone()
# The slice is a view into tensor_d and add_ mutates in place, so tensor_d changes.
tensor_d[:, :2].add_(tensor_e)
tensor_d

tensor([[3, 3, 3],
        [3, 3, 3]])

## 1.2 Tensor Math

In [13]:
# Two small integer matrices, reused by the arithmetic cells that follow.
tensor_a, tensor_b = torch.tensor([[1, 1], [2, 2]]), torch.tensor([[3, 3], [4, 4]])
# `@` is the matrix product.
tensor_a @ tensor_b

tensor([[ 7,  7],
        [14, 14]])

In [14]:
# Element-wise product (each entry multiplied with its counterpart), not a matrix product.
tensor_a * tensor_b

tensor([[3, 3],
        [8, 8]])

In [15]:
# Element-wise sum.
tensor_a + tensor_b

tensor([[4, 4],
        [6, 6]])

In [16]:
# Element-wise difference.
tensor_a - tensor_b

tensor([[-2, -2],
        [-2, -2]])

## 1.3 Numpy & Tensor Share Memory

In [17]:
# .numpy() on a CPU tensor hands back an array backed by the tensor's own memory.
tensor_a = torch.ones(5)
np_a = tensor_a.numpy()
tensor_a += 1 # In-place update of the shared buffer.
np_a # The NumPy array reflects the change as well.

array([2., 2., 2., 2., 2.], dtype=float32)

In [18]:
# from_numpy() wraps the array's memory instead of copying it.
np_b = np.ones(5)
tensor_b = torch.from_numpy(np_b)
np_b += 1 # In-place update of the array the tensor was built from.
tensor_b # The tensor sees the new values too.

tensor([2., 2., 2., 2., 2.], dtype=torch.float64)

# 2. Autograd

## 2.1 Auto Differentiation

$$
f(x_1, x_2) = \sin(x_1) + \cos(x_2)
$$

The derivative with respect to $x_1$.

$$
\frac{\partial f}{\partial x_1} = \cos(x_1)
$$

The derivative with respect to $x_2$.

$$
\frac{\partial f}{\partial x_2} = -\sin(x_2)
$$

In [19]:
# Sample x2 on [pi, 2*pi) and x1 on [0, pi) in steps of 0.01; both are leaf
# tensors that record gradients.
x2 = torch.arange(torch.pi, 2 * torch.pi, 0.01, requires_grad=True)
x1 = torch.arange(0, torch.pi, 0.01, requires_grad=True)
# f(x1, x2) = sin(x1) + cos(x2), evaluated element-wise.
f = torch.cos(x2) + torch.sin(x1)

In [20]:
# f is a vector, so backward() is given an upstream gradient with f's shape;
# all ones weights every element of f equally.
external_grad = torch.ones_like(f)
f.backward(external_grad)

In [21]:
# The accumulated gradient on x1 should equal the analytic derivative cos(x1).
torch.all(torch.cos(x1) == x1.grad)

tensor(True)

In [22]:
# The accumulated gradient on x2 should equal the analytic derivative -sin(x2).
torch.all(-torch.sin(x2) == x2.grad)

tensor(True)

## 2.2 Jacobian

If I have a vector valued function,

$$
\vec{y} = f(\vec{x})
$$

then the gradient of $\vec{y}$ with respect to $\vec{x}$ is a Jacobian matrix.

$$
\def\d{\partial}
\frac{\d \vec{y}}{\d \vec{x}} =
\begin{bmatrix}
\frac{\d y_1}{\d x_1} & \dots & \frac{\d y_1}{\d x_n} \\
\vdots & \ddots & \vdots \\
\frac{\d y_m}{\d x_1} & \dots & \frac{\d y_m}{\d x_n}
\end{bmatrix}
$$

Here's an example

$$
A \vec{x} = \vec{y}
$$

In [23]:
# y = A x with a 3x4 matrix and a 4x1 column vector, both tracked by autograd.
A = torch.rand(3, 4, requires_grad=True)
x = torch.rand(4, 1, requires_grad=True)
y = A @ x
# y is an intermediate (non-leaf) result, so its gradient is only kept on request.
y.retain_grad()
# Reduce y to a scalar by summing its elements, then backpropagate from it.
y.sum().backward()

`backward()` called without a `gradient` argument only works on a scalar. The scalar I use is the sum of all elements of $\vec{y}$.

$$
L = \sum_{i=1}^{m} y_i
$$

Then 

$$
\frac{\partial L}{\partial \vec{y}} = \begin{bmatrix} 1 \\ 1 \\ 1 \end{bmatrix}
$$

In [24]:
# Gradient of y.sum() with respect to y; it is available here because retain_grad() was called on y.
y.grad

tensor([[1.],
        [1.],
        [1.]])

Also,

$$
\def\d{\partial}
\frac{\d L}{\d \vec{x}} = \left(\frac{\d \vec{y}}{\d \vec{x}}\right)^\intercal \frac{\d L}{\d \vec{y}} = 
J^\intercal \frac{\d L}{\d \vec{y}} =
A^\intercal \begin{bmatrix} 1 \\ 1 \\ 1 \end{bmatrix}
$$

In [25]:
# x.grad should match the transposed matrix applied to the upstream gradient on y.
torch.all(x.grad == A.T @ y.grad)

tensor(True)

In this sense, my Jacobian is actually just the $A$ matrix.

$$
\def\d{\partial}
A = J =
\begin{bmatrix}
\frac{\d y_1}{\d x_1} & \dots  & \dots & \frac{\d y_1}{\d x_4}  \\
\vdots & \ddots & \ddots & \vdots \\
\frac{\d y_3}{\d x_1} & \dots & \dots & \frac{\d y_3}{\d x_4}
\end{bmatrix}
$$

## 2.3 Computational Graph

The DAG is recorded during the forward pass, as operations are applied to tensors that require gradients; calling `backward()` traverses it. The node on which I call `backward()` is the root node. I may call `backward()` on multiple nodes; they are all considered roots. The input tensors and weight tensors are the leaf nodes. In the example above, `A` is a leaf node, and `x` is also a leaf node. However, `y` is not a leaf node. It does not retain its own gradient unless I ask it to with `retain_grad()`.

## 2.4 Exclusion from DAG

We can manually stop gradients from accumulating on certain leaf nodes. This is PyTorch's way to freeze parts of a model.

In [26]:
from torch import nn, optim
from torchvision.models import resnet18, ResNet18_Weights

# Load a ResNet-18 with its default pretrained weights and freeze every parameter in one call.
model = resnet18(weights=ResNet18_Weights.DEFAULT)
model.requires_grad_(False)
# Replace the classifier after freezing, so this new 10-way head is the only part left trainable.
model.fc = nn.Linear(512, 10) # Only train the head.
# All parameters are registered with the optimizer, frozen ones included.
optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

# 3. Module

## 3.1 Define Network

I will build a ResNet-9 here.

Note: Batch Normalization layers normalize the activations of the network so that they maintain a mean close to 0 and a standard deviation close to 1. Because this normalization subtracts the mean, any bias added by a convolutional layer directly before a BatchNorm layer is cancelled out, making that bias unnecessary (BatchNorm provides its own learnable shift instead).

https://stackoverflow.com/questions/46256747/can-not-use-both-bias-and-batch-normalization-in-convolution-layers

In [102]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class ResBlock(nn.Module):
    """Basic residual block: two 3x3 conv-BN-ReLU stages plus a skip connection.

    Args:
        in_chans: Number of channels of the input feature map.
        out_chans: Number of channels produced by the block.
        stride: Stride of the first convolution; a value > 1 downsamples spatially.
        bn_momentum: Weight given to the current batch when BatchNorm updates its
            running statistics. PyTorch's convention is the opposite of the
            Keras/TF "decay" convention, so the usual value is 0.1, not 0.9.

    The skip path is the identity when the input already has the output's shape.
    Otherwise (stride > 1 or a channel change) a 1x1 conv + BatchNorm projects
    the input so that it can be added to the main path.
    """

    def __init__(self, in_chans, out_chans, stride=1, bn_momentum=0.1):
        super().__init__()
        # The first conv performs the spatial downsampling when stride > 1.
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_chans, out_chans, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_chans, momentum=bn_momentum),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(out_chans, out_chans, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(out_chans, momentum=bn_momentum),
            nn.ReLU(),
        )
        # The residual must match the main path in both spatial size and channel
        # count, so project it whenever either of them changes.
        if stride != 1 or in_chans != out_chans:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_chans, out_chans, kernel_size=1, stride=stride, padding=0, bias=False),
                nn.BatchNorm2d(out_chans, momentum=bn_momentum),
            )
        else:
            self.downsample = None

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return relu(conv2(conv1(x)) + shortcut(x))."""
        # Explicit None check: truthiness of an nn.Sequential depends on its length.
        residual = x if self.downsample is None else self.downsample(x)
        x = self.conv2(self.conv1(x))
        return self.relu(x + residual)


# Smoke test: push one 3x48x48 input through a stride-2 block and look at the output shape.
res_block = ResBlock(in_chans=3, out_chans=128, stride=2)
res_block(torch.rand(1, 3, 48, 48)).shape

torch.Size([1, 128, 24, 24])

In [100]:
class BottleneckResBlock(nn.Module):
    """Bottleneck residual block: 1x1 reduce -> 3x3 -> 1x1 expand, plus a skip connection.

    Args:
        in_chans: Number of channels of the input feature map.
        mid_chans: Number of channels inside the bottleneck (used by the 3x3 conv).
        out_chans: Number of channels produced by the block.
        stride: Stride of the first 1x1 convolution; a value > 1 downsamples spatially.
        bn_momentum: Weight given to the current batch when BatchNorm updates its
            running statistics. PyTorch's convention is the opposite of the
            Keras/TF "decay" convention, so the usual value is 0.1, not 0.9.

    The skip path is the identity when the input already has the output's shape.
    Otherwise (stride > 1 or a channel change) a 1x1 conv + BatchNorm projects
    the input so that it can be added to the main path.
    """

    def __init__(self, in_chans, mid_chans, out_chans, stride=1, bn_momentum=0.1):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_chans, mid_chans, kernel_size=1, stride=stride, padding=0, bias=False),
            nn.BatchNorm2d(mid_chans, momentum=bn_momentum),
            nn.ReLU(),
        )
        # Choosing mid_chans < in_chans squeezes the input and creates the bottleneck.
        self.conv2 = nn.Sequential(
            nn.Conv2d(mid_chans, mid_chans, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(mid_chans, momentum=bn_momentum),
            nn.ReLU(),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(mid_chans, out_chans, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(out_chans, momentum=bn_momentum),
            nn.ReLU(),
        )
        # The residual must match the main path in both spatial size and channel
        # count, so project it whenever either of them changes.
        if stride != 1 or in_chans != out_chans:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_chans, out_chans, kernel_size=1, stride=stride, padding=0, bias=False),
                nn.BatchNorm2d(out_chans, momentum=bn_momentum),
            )
        else:
            self.downsample = None

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return relu(conv3(conv2(conv1(x))) + shortcut(x))."""
        # Explicit None check: truthiness of an nn.Sequential depends on its length.
        residual = x if self.downsample is None else self.downsample(x)
        x = self.conv3(self.conv2(self.conv1(x)))
        return self.relu(x + residual)
        
    
# Smoke test: squeeze 256 channels to 64, expand to 128, and halve the 48x48 input with stride 2.
res_block = BottleneckResBlock(in_chans=256, mid_chans=64, out_chans=128, stride=2)
res_block(torch.rand(1, 256, 48, 48)).shape

torch.Size([1, 128, 24, 24])

In [110]:
class ResNet9(nn.Module):
    """ResNet-9 style classifier: conv stacks interleaved with two residual blocks.

    The final linear layer expects 1024 features per sample. A 3x32x32 input
    yields exactly that: four 2x2 max-pools reduce 32x32 to 2x2, with 256 channels.

    Args:
        num_classes: Number of output logits (default 10).
        bn_momentum: Weight given to the current batch when the BatchNorm layers
            created directly by this class update their running statistics.
            PyTorch's convention is the opposite of the Keras/TF "decay"
            convention, so the usual value is 0.1, not 0.9.
    """

    def __init__(self, num_classes=10, bn_momentum=0.1):
        super().__init__()
        self.conv_stack_1 = nn.Sequential(
            *self._conv_bn_relu(3, 64, bn_momentum),
            *self._conv_bn_relu(64, 128, bn_momentum),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.res_block_2 = ResBlock(128, 128)
        self.conv_stack_3 = nn.Sequential(
            *self._conv_bn_relu(128, 256, bn_momentum),
            nn.MaxPool2d(kernel_size=2, stride=2),
            *self._conv_bn_relu(256, 256, bn_momentum),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.res_block_4 = ResBlock(256, 256)
        self.max_pool_5 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc = nn.Linear(1024, num_classes, bias=True)

    @staticmethod
    def _conv_bn_relu(in_channels, out_channels, bn_momentum):
        """Return [3x3 conv (no bias), BatchNorm, in-place ReLU] as a flat list of layers."""
        return [
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(num_features=out_channels, momentum=bn_momentum),
            # In-place ReLU saves memory; these activations are not reused as residuals.
            nn.ReLU(inplace=True),
        ]

    def forward(self, x):
        """Map a batch of images of shape (N, 3, H, W) to logits of shape (N, num_classes)."""
        x = self.conv_stack_1(x)
        x = self.res_block_2(x)
        x = self.conv_stack_3(x)
        x = self.res_block_4(x)
        x = self.max_pool_5(x)
        # Flatten per sample, keeping the batch dimension explicit. With an unexpected
        # input size this fails loudly in `fc`, instead of silently folding the extra
        # features into additional batch rows.
        out = self.fc(x.view(x.size(0), -1))
        return out


# Forward a dummy batch of two 3x32x32 inputs through a freshly built network.
net = ResNet9()
out = net(torch.rand(2, 3, 32, 32))
net.zero_grad() # Reset gradients

# Print every parameter's gradient before any backward pass has run.
print(*(p.grad for p in net.parameters()), sep="\n")

# `out` is not a scalar, so backward() is handed an upstream gradient of the same shape.
out.backward(torch.rand_like(out))
# After the backward pass each parameter carries a gradient; print its shape.
print(*(p.grad.shape for p in net.parameters()), sep="\n")

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
torch.Size([64, 3, 3, 3])
torch.Size([64])
torch.Size([64])
torch.Size([128, 64, 3, 3])
torch.Size([128])
torch.Size([128])
torch.Size([128, 128, 3, 3])
torch.Size([128])
torch.Size([128])
torch.Size([128, 128, 3, 3])
torch.Size([128])
torch.Size([128])
torch.Size([256, 128, 3, 3])
torch.Size([256])
torch.Size([256])
torch.Size([256, 256, 3, 3])
torch.Size([256])
torch.Size([256])
torch.Size([256, 256, 3, 3])
torch.Size([256])
torch.Size([256])
torch.Size([256, 256, 3, 3])
torch.Size([256])
torch.Size([256])
torch.Size([10, 1024])
torch.Size([10])
