In [226]:
import torch
import torch.nn as nn
import numpy as np
import math

# torch.nn

https://pytorch.org/docs/stable/nn.html

https://pytorch.org/tutorials/beginner/nn_tutorial.html

https://blog.paperspace.com/pytorch-101-advanced/

# Example: Low-level vs. High-level Linear Regression Model

## Create Sample Data

In [227]:
# Feature matrix: three identical columns 1.0, 1.2, ..., 10.4
# (the columns are perfectly collinear, so only the sum of the true coefficients is identifiable)
feature_values = np.arange(1, 10.6, 0.2)
X = torch.tensor(np.array([feature_values] * 3), dtype=torch.float32).t()
X.shape

torch.Size([48, 3])

In [228]:
torch.manual_seed(1)

# Targets from a known linear model (coefficients below, intercept 19.2) plus Gaussian noise scaled by 0.5
true_coefs = torch.tensor([30.7, -17.2, 23.3])
y = X.matmul(true_coefs) + 19.2 + 0.5 * torch.randn(len(X))
y.shape

torch.Size([48])

In [229]:
X[:10], y[:10]    # first ten samples and their targets

(tensor([[1.0000, 1.0000, 1.0000],
         [1.2000, 1.2000, 1.2000],
         [1.4000, 1.4000, 1.4000],
         [1.6000, 1.6000, 1.6000],
         [1.8000, 1.8000, 1.8000],
         [2.0000, 2.0000, 2.0000],
         [2.2000, 2.2000, 2.2000],
         [2.4000, 2.4000, 2.4000],
         [2.6000, 2.6000, 2.6000],
         [2.8000, 2.8000, 2.8000]]),
 tensor([ 55.2372,  62.9849,  70.3930,  77.2753,  85.3899,  92.4954,  99.6701,
         106.7155, 114.5239, 122.3919]))

# 1. Low-Level

## Weights and Bias

In [230]:
# low-level

torch.manual_seed(1)

# Draw and scale the initial values first, then switch autograd on, so the
# scaling division is not recorded and `weights` remains a leaf tensor.
# NOTE(review): the scale uses the number of samples (rows of X); Xavier-style
# initialization usually divides by sqrt(number of input features) - confirm intent.
weights = torch.randn(3) / math.sqrt(X.shape[0])
weights.requires_grad_()

bias = torch.randn(1).requires_grad_()

weights, bias

(tensor([0.0955, 0.0385, 0.0089], requires_grad=True),
 tensor([0.6213], requires_grad=True))

## Model and Loss function

In [231]:
def linear_reg(X, weights=weights, bias=bias):
    """Return the affine prediction ``X @ weights + bias``.

    The defaults are the notebook-level ``weights`` and ``bias`` objects that
    exist when this cell is executed (defaults are bound at definition time).
    """
    return torch.matmul(X, weights) + bias

def mse_loss(pred, y):
    """Return the mean squared error: sum of squared residuals divided by ``len(y)``."""
    squared_residuals = (pred - y).pow(2)
    return squared_residuals.sum() / len(y)

## Batch

In [232]:
batch_size = 8

# Tensor.split cuts along dim 0 into consecutive chunks of `batch_size` rows
X_batches = X.split(batch_size)
y_batches = y.split(batch_size)

len(X_batches), len(y_batches)

(6, 6)

## Train

In [233]:
epochs = 10


def train(X_batches, y_batches, model, loss_fn, weights, bias, lr=0.001, lr_decay=0.99):
    """Run one pass of hand-written mini-batch SGD over the given batches.

    ``model`` is called as ``model(X, weights, bias)``; ``weights`` and ``bias``
    are updated in place. After every batch the learning rate is multiplied by
    ``lr_decay``; the decayed value is local to this call, so each call starts
    again from ``lr``. The loss of every second batch is printed.
    """
    for batch_num, (X_batch, y_batch) in enumerate(zip(X_batches, y_batches)):
        loss = loss_fn(model(X_batch, weights, bias), y_batch)
        loss.backward()

        # the updates themselves must not be tracked by autograd
        with torch.no_grad():
            for param in (weights, bias):
                param -= lr * param.grad    # plain SGD step
                param.grad.zero_()          # clear the accumulated gradient before the next batch

        lr *= lr_decay

        if batch_num % 2 == 1:
            print(f'batch [{batch_num + 1} / {len(X_batches)}] loss: {loss.item()}')


for epoch in range(epochs):
    # epoch banner: rule, title, rule
    print('-' * 50, f'epoch {epoch + 1} / {epochs}', '-' * 50, sep='\n')

    train(X_batches, y_batches, linear_reg, mse_loss, weights, bias)

--------------------------------------------------
epoch 1 / 10
--------------------------------------------------
batch [2 / 6] loss: 18928.08984375
batch [4 / 6] loss: 39545.0234375
batch [6 / 6] loss: 17308.240234375
--------------------------------------------------
epoch 2 / 10
--------------------------------------------------
batch [2 / 6] loss: 953.0055541992188
batch [4 / 6] loss: 1121.216064453125
batch [6 / 6] loss: 300.2220458984375
--------------------------------------------------
epoch 3 / 10
--------------------------------------------------
batch [2 / 6] loss: 171.156494140625
batch [4 / 6] loss: 35.11738586425781
batch [6 / 6] loss: 4.091669082641602
--------------------------------------------------
epoch 4 / 10
--------------------------------------------------
batch [2 / 6] loss: 102.42078399658203
batch [4 / 6] loss: 2.9902000427246094
batch [6 / 6] loss: 25.7353572845459
--------------------------------------------------
epoch 5 / 10
-----------------------------

# 2. High-level

## Using torch.nn.functional

https://pytorch.org/docs/stable/nn.functional.html

In [234]:
import torch.nn.functional as F

loss_fn = F.mse_loss

# the built-in and the hand-written MSE give the same value on the full data set
full_pred = linear_reg(X, weights, bias)
loss_fn(full_pred, y), mse_loss(full_pred, y)     # same

(tensor(52.5023, grad_fn=<MseLossBackward0>),
 tensor(52.5023, grad_fn=<DivBackward0>))

## Using nn.Module & nn.Parameter

https://pytorch.org/docs/stable/generated/torch.nn.Module.html

https://pytorch.org/docs/stable/generated/torch.nn.parameter.Parameter.html

In [235]:
torch.manual_seed(1)

class LinearReg(nn.Module):
    """Linear regression with explicitly declared ``nn.Parameter`` tensors."""

    def __init__(self):
        super().__init__()
        # wrapping a tensor in nn.Parameter registers it with the module and
        # sets requires_grad=True by default
        init_scale = math.sqrt(len(X))      # same scaling as the low-level version above
        self.weights = nn.Parameter(torch.randn(3) / init_scale)
        self.bias = nn.Parameter(torch.randn(1))

    def forward(self, X):
        return torch.matmul(X, self.weights) + self.bias


model = LinearReg()

# the nn.Parameter attributes show up in model.parameters()
print(model, '-' * 50, list(model.parameters()), sep='\n')

LinearReg()
--------------------------------------------------
[Parameter containing:
tensor([0.0955, 0.0385, 0.0089], requires_grad=True), Parameter containing:
tensor([0.6213], requires_grad=True)]


In [236]:
epochs = 10


def train(X_batches, y_batches, model, loss_fn, lr=0.001, lr_decay=0.99):
    """Run one pass of manual mini-batch SGD, updating ``model.parameters()`` in place.

    The learning rate is multiplied by ``lr_decay`` after every batch; the
    decayed value is local to this call. The loss of every second batch is printed.
    """
    for batch_num, (X_batch, y_batch) in enumerate(zip(X_batches, y_batches)):
        loss = loss_fn(model(X_batch), y_batch)
        loss.backward()

        # keep the update itself out of the autograd graph
        with torch.no_grad():
            # changed part: iterate over whatever parameters the module registered
            for param in model.parameters():
                param.sub_(lr * param.grad)
            model.zero_grad()

        lr *= lr_decay

        if batch_num % 2 == 1:
            print(f'batch [{batch_num + 1} / {len(X_batches)}] loss: {loss.item()}')


for epoch in range(epochs):
    # epoch banner: rule, title, rule
    print('-' * 50, f'epoch {epoch + 1} / {epochs}', '-' * 50, sep='\n')

    train(X_batches, y_batches, model, loss_fn)

--------------------------------------------------
epoch 1 / 10
--------------------------------------------------
batch [2 / 6] loss: 18928.08984375
batch [4 / 6] loss: 39545.0234375
batch [6 / 6] loss: 17308.240234375
--------------------------------------------------
epoch 2 / 10
--------------------------------------------------
batch [2 / 6] loss: 953.0055541992188
batch [4 / 6] loss: 1121.216064453125
batch [6 / 6] loss: 300.2220458984375
--------------------------------------------------
epoch 3 / 10
--------------------------------------------------
batch [2 / 6] loss: 171.156494140625
batch [4 / 6] loss: 35.11738586425781
batch [6 / 6] loss: 4.091669082641602
--------------------------------------------------
epoch 4 / 10
--------------------------------------------------
batch [2 / 6] loss: 102.42078399658203
batch [4 / 6] loss: 2.9902000427246094
batch [6 / 6] loss: 25.7353572845459
--------------------------------------------------
epoch 5 / 10
-----------------------------

## Using nn.Linear

In [237]:
torch.manual_seed(1)

class LinearReg(nn.Module):
    """Linear regression built on ``nn.Linear(3, 1)``.

    ``forward`` returns one prediction per input row, with the trailing
    size-1 feature dimension removed.
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(3, 1)

    def forward(self, X):
        # Squeeze only the last dimension: a bare .squeeze() would also drop the
        # batch dimension for a batch of one sample and return a 0-d tensor
        # that no longer lines up with a target of shape (1,).
        return self.linear(X).squeeze(-1)


model = LinearReg()

print(model)
print('-' * 50)
print(list(model.parameters()))     # nn.Linear registers its own weight and bias as parameters

LinearReg(
  (linear): Linear(in_features=3, out_features=1, bias=True)
)
--------------------------------------------------
[Parameter containing:
tensor([[ 0.2975, -0.2548, -0.1119]], requires_grad=True), Parameter containing:
tensor([0.2710], requires_grad=True)]


In [238]:
epochs = 10


def train(X_batches, y_batches, model, loss_fn, lr=0.001, lr_decay=0.99):   # same train function as before
    """One pass of manual SGD over the batches; prints the loss of every second batch."""
    total_batches = len(X_batches)
    for batch_num, batch in enumerate(zip(X_batches, y_batches)):
        X_batch, y_batch = batch
        pred = model(X_batch)
        loss = loss_fn(pred, y_batch)
        loss.backward()

        with torch.no_grad():
            for param in model.parameters():
                step = lr * param.grad
                param -= step
            model.zero_grad()

        lr = lr * lr_decay

        is_report_batch = (batch_num + 1) % 2 == 0
        if is_report_batch:
            print(f'batch [{batch_num + 1} / {len(X_batches)}] loss: {loss.item()}')


for epoch in range(epochs):
    # epoch banner: rule, title, rule
    print('-' * 50, f'epoch {epoch + 1} / {epochs}', '-' * 50, sep='\n')

    # lr_decay=1 keeps the learning rate constant
    train(X_batches, y_batches, model, loss_fn, lr_decay=1)

--------------------------------------------------
epoch 1 / 10
--------------------------------------------------
batch [2 / 6] loss: 19212.021484375
batch [4 / 6] loss: 39723.33203125
batch [6 / 6] loss: 16121.7109375
--------------------------------------------------
epoch 2 / 10
--------------------------------------------------
batch [2 / 6] loss: 850.8599853515625
batch [4 / 6] loss: 924.115478515625
batch [6 / 6] loss: 207.44674682617188
--------------------------------------------------
epoch 3 / 10
--------------------------------------------------
batch [2 / 6] loss: 161.54225158691406
batch [4 / 6] loss: 25.274185180664062
batch [6 / 6] loss: 7.740100860595703
--------------------------------------------------
epoch 4 / 10
--------------------------------------------------
batch [2 / 6] loss: 104.6955337524414
batch [4 / 6] loss: 2.715043783187866
batch [6 / 6] loss: 27.675636291503906
--------------------------------------------------
epoch 5 / 10
--------------------------

## Using torch.optim

In [248]:
from torch.optim import SGD

torch.manual_seed(1)

epochs = 10
model = LinearReg()     # same model as before
optimizer = SGD(params=model.parameters(), lr=0.001)


def train(X_batches, y_batches, model, loss_fn, optimizer):
    """Run one pass over the batches, letting ``optimizer`` perform the parameter updates.

    The loss of every second batch is printed.
    """
    for batch_num, (X_batch, y_batch) in enumerate(zip(X_batches, y_batches)):
        loss = loss_fn(model(X_batch), y_batch)
        loss.backward()

        # changed part: the optimizer replaces the hand-written update and gradient reset
        optimizer.step()
        optimizer.zero_grad()

        if batch_num % 2 == 1:
            print(f'batch [{batch_num + 1} / {len(X_batches)}] loss: {loss.item()}')


for epoch in range(epochs):
    # epoch banner: rule, title, rule
    print('-' * 50, f'epoch {epoch + 1} / {epochs}', '-' * 50, sep='\n')

    train(X_batches, y_batches, model, loss_fn, optimizer)

--------------------------------------------------
epoch 1 / 10
--------------------------------------------------
batch [2 / 6] loss: 19212.021484375
batch [4 / 6] loss: 39723.33203125
batch [6 / 6] loss: 16121.7109375
--------------------------------------------------
epoch 2 / 10
--------------------------------------------------
batch [2 / 6] loss: 850.8602294921875
batch [4 / 6] loss: 924.115478515625
batch [6 / 6] loss: 207.44674682617188
--------------------------------------------------
epoch 3 / 10
--------------------------------------------------
batch [2 / 6] loss: 161.54229736328125
batch [4 / 6] loss: 25.274185180664062
batch [6 / 6] loss: 7.740100860595703
--------------------------------------------------
epoch 4 / 10
--------------------------------------------------
batch [2 / 6] loss: 104.6955337524414
batch [4 / 6] loss: 2.715043783187866
batch [6 / 6] loss: 27.675636291503906
--------------------------------------------------
epoch 5 / 10
--------------------------

## Using Dataset & DataLoader

https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset

In [241]:
from torch.utils.data import Dataset, TensorDataset

# choice 1
class CustomDataset(Dataset):
    """Map-style dataset that pairs each row of ``X`` with the matching entry of ``y``.

    Args:
        X: indexable collection of inputs (first dimension indexes samples).
        y: indexable collection of targets, one per sample.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        # Read from the data stored on the instance. Indexing the notebook-level
        # X / y here would ignore whatever was passed to the constructor.
        return self.X[index], self.y[index]


# dataset = CustomDataset(X, y)

# choice 2
dataset = TensorDataset(X, y)

# a TensorDataset supports both integer indexing and slicing
(dataset[0], dataset[-1:])

((tensor([1., 1., 1.]), tensor(55.2372)),
 (tensor([[10.4000, 10.4000, 10.4000]]), tensor([401.6051])))

In [242]:
from torch.utils.data import DataLoader

# shuffle=True reshuffles the sample order; shuffling is not needed for validation data
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

len(dataloader), len(dataloader.dataset)

(6, 48)

In [250]:
torch.manual_seed(1)

epochs = 10
model = LinearReg()     # same model as before
optimizer = SGD(params=model.parameters(), lr=0.001)


def train(dataloader, model, loss_fn, optimizer):
    """Run one pass over ``dataloader``, updating ``model`` through ``optimizer``.

    Args:
        dataloader: iterable of ``(X, y)`` batches that supports ``len()``.
        model: callable mapping a batch of inputs to predictions.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: optimizer holding the model's parameters.

    The loss of every second batch is printed together with the number of
    batches in ``dataloader``.
    """
    # Take the batch count from the dataloader that is actually iterated,
    # not from the unrelated notebook-level X_batches.
    num_batches = len(dataloader)

    for batch_num, (X, y) in enumerate(dataloader):     # changed part
        pred = model(X)
        loss = loss_fn(pred, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if (batch_num + 1) % 2 == 0:
            print(f'batch [{batch_num + 1} / {num_batches}] loss: {loss.item()}')


for epoch in range(epochs):
    # epoch banner: rule, title, rule
    print('-' * 50, f'epoch {epoch + 1} / {epochs}', '-' * 50, sep='\n')

    train(dataloader, model, loss_fn, optimizer)

--------------------------------------------------
epoch 1 / 10
--------------------------------------------------
batch [2 / 6] loss: 43860.66015625
batch [4 / 6] loss: 6618.982421875
batch [6 / 6] loss: 1878.8720703125
--------------------------------------------------
epoch 2 / 10
--------------------------------------------------
batch [2 / 6] loss: 701.1918334960938
batch [4 / 6] loss: 560.6073608398438
batch [6 / 6] loss: 179.83984375
--------------------------------------------------
epoch 3 / 10
--------------------------------------------------
batch [2 / 6] loss: 68.46882629394531
batch [4 / 6] loss: 42.12972640991211
batch [6 / 6] loss: 62.00067138671875
--------------------------------------------------
epoch 4 / 10
--------------------------------------------------
batch [2 / 6] loss: 69.39556121826172
batch [4 / 6] loss: 26.955913543701172
batch [6 / 6] loss: 51.885169982910156
--------------------------------------------------
epoch 5 / 10
-------------------------------

## Using nn.Sequential

https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html

nn.Sequential only accepts nn.Module instances as its layers, so an arbitrary function (e.g. a reshape) has to be wrapped in a custom nn.Module before it can be used inside nn.Sequential

nn.Sequential treats the whole container as a single module

In [339]:
# create custom layer
class Lambda(nn.Module):
    """Wrap an arbitrary callable so it can be used as a layer inside nn.Sequential."""

    def __init__(self, func) -> None:
        super().__init__()
        self.func = func

    def forward(self, x):
        result = self.func(x)
        return result


model = nn.Sequential(
    Lambda(lambda x: x.view(-1, 1, 28, 28)),    # a custom view layer can be added to nn.Sequential
    nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1),
    Lambda(lambda x: x.view(len(x), -1)),       # flatten everything except the batch dimension
)

model

Sequential(
  (0): Lambda()
  (1): Conv2d(1, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (2): Lambda()
)

In [245]:
next(model.parameters()).shape   # kernels of CNN layer (out channels, in channel, kernel size, kernel size)

torch.Size([16, 1, 3, 3])

# nn.ModuleList

In [274]:
torch.manual_seed(1)

layer_list = [nn.Linear(3,3), nn.Linear(3,1)]

class Net(nn.Module):
    """Counter-example: sub-modules kept in a plain Python list.

    The list is stored on purpose to show that its layers are NOT registered,
    so ``model.parameters()`` stays empty.
    """

    def __init__(self, layer_list) -> None:
        super().__init__()
        self.layers = layer_list    # parameters of Modules in layer_list are not registered to model.parameters()

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        # squeeze only the trailing feature dimension so a batch of one sample keeps its batch axis
        return x.squeeze(-1)


model = Net(layer_list)
print(model, list(model.parameters()))

try:
    optimizer = SGD(model.parameters(), lr=0.001)   # training won't work bc. model.parameters() is empty
except ValueError:
    # the optimizer rejects an empty parameter list with ValueError; any other error should surface
    print('Error: model.parameters() is empty, training is not possible')


Net() []
Error: model.parameters() is empty, training is not possible


In [324]:
torch.manual_seed(1)

layer_list = [nn.Linear(3,3), nn.Linear(3,1)]

class Net(nn.Module):
    """Stack of layers held in an ``nn.ModuleList`` and applied in order."""

    def __init__(self, layer_list) -> None:
        super().__init__()
        self.layers = nn.ModuleList(layer_list)    # parameters are registered to model.parameters() by using nn.ModuleList

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        # squeeze only the trailing feature dimension so a batch of one sample keeps its batch axis
        return x.squeeze(-1)


model = Net(layer_list)
print(model, list(model.parameters()))

Net(
  (layers): ModuleList(
    (0): Linear(in_features=3, out_features=3, bias=True)
    (1): Linear(in_features=3, out_features=1, bias=True)
  )
) [Parameter containing:
tensor([[ 0.2975, -0.2548, -0.1119],
        [ 0.2710, -0.5435,  0.3462],
        [-0.1188,  0.2937,  0.0803]], requires_grad=True), Parameter containing:
tensor([-0.0707,  0.1601,  0.0285], requires_grad=True), Parameter containing:
tensor([[ 0.2109, -0.2250, -0.0421]], requires_grad=True), Parameter containing:
tensor([-0.0520], requires_grad=True)]


# Printing Information about Network

In [328]:
[*model.named_parameters()]     # (name, parameter) pairs

[('layers.0.weight',
  Parameter containing:
  tensor([[ 0.2975, -0.2548, -0.1119],
          [ 0.2710, -0.5435,  0.3462],
          [-0.1188,  0.2937,  0.0803]], requires_grad=True)),
 ('layers.0.bias',
  Parameter containing:
  tensor([-0.0707,  0.1601,  0.0285], requires_grad=True)),
 ('layers.1.weight',
  Parameter containing:
  tensor([[ 0.2109, -0.2250, -0.0421]], requires_grad=True)),
 ('layers.1.bias',
  Parameter containing:
  tensor([-0.0520], requires_grad=True))]

In [332]:
[*model.named_modules()]     # the model itself (empty name) followed by every nested module

[('',
  Net(
    (layers): ModuleList(
      (0): Linear(in_features=3, out_features=3, bias=True)
      (1): Linear(in_features=3, out_features=1, bias=True)
    )
  )),
 ('layers',
  ModuleList(
    (0): Linear(in_features=3, out_features=3, bias=True)
    (1): Linear(in_features=3, out_features=1, bias=True)
  )),
 ('layers.0', Linear(in_features=3, out_features=3, bias=True)),
 ('layers.1', Linear(in_features=3, out_features=1, bias=True))]

In [333]:
[*model.named_children()]     # direct sub-modules only

[('layers',
  ModuleList(
    (0): Linear(in_features=3, out_features=3, bias=True)
    (1): Linear(in_features=3, out_features=1, bias=True)
  ))]

In [334]:
[*model.named_buffers()]     # buffer tensors, e.g. the running mean of a Batch Norm layer (none in this model)

[]

# Different Learning Rates for Different layers

In [386]:
class Net(nn.Module):
    """Small all-linear network: ``linear1`` -> ``linear2`` -> ``layer`` (a two-layer Sequential)."""

    def __init__(self) -> None:
        super().__init__()
        self.linear1 = nn.Linear(1,3)
        self.linear2 = nn.Linear(3,2)
        self.layer = nn.Sequential(nn.Linear(2,3),
                                   nn.Linear(3,1))

    def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        # Apply the Sequential head as well; without this call its parameters
        # never receive gradients even though they are handed to an optimizer.
        x = self.layer(x)
        return x

model = Net()
model

Net(
  (linear1): Linear(in_features=1, out_features=3, bias=True)
  (linear2): Linear(in_features=3, out_features=2, bias=True)
  (layer): Sequential(
    (0): Linear(in_features=2, out_features=3, bias=True)
    (1): Linear(in_features=3, out_features=1, bias=True)
  )
)

In [388]:
[*model.parameters()]     # every registered parameter, in registration order

[Parameter containing:
 tensor([[-0.1983],
         [-0.3121],
         [ 0.1697]], requires_grad=True),
 Parameter containing:
 tensor([ 0.7738,  0.1432, -0.4728], requires_grad=True),
 Parameter containing:
 tensor([[0.5735, 0.0123, 0.4564],
         [0.1407, 0.5175, 0.3331]], requires_grad=True),
 Parameter containing:
 tensor([-0.2943,  0.4056], requires_grad=True),
 Parameter containing:
 tensor([[ 0.0045, -0.5676],
         [-0.1482, -0.6382],
         [-0.3664, -0.5009]], requires_grad=True),
 Parameter containing:
 tensor([ 0.4836, -0.1486, -0.1222], requires_grad=True),
 Parameter containing:
 tensor([[ 0.2767,  0.2350, -0.3810]], requires_grad=True),
 Parameter containing:
 tensor([0.2128], requires_grad=True)]

In [389]:
[*model.named_parameters()]     # names are prefixed with the attribute path, e.g. 'layer.0.weight'

[('linear1.weight',
  Parameter containing:
  tensor([[-0.1983],
          [-0.3121],
          [ 0.1697]], requires_grad=True)),
 ('linear1.bias',
  Parameter containing:
  tensor([ 0.7738,  0.1432, -0.4728], requires_grad=True)),
 ('linear2.weight',
  Parameter containing:
  tensor([[0.5735, 0.0123, 0.4564],
          [0.1407, 0.5175, 0.3331]], requires_grad=True)),
 ('linear2.bias',
  Parameter containing:
  tensor([-0.2943,  0.4056], requires_grad=True)),
 ('layer.0.weight',
  Parameter containing:
  tensor([[ 0.0045, -0.5676],
          [-0.1482, -0.6382],
          [-0.3664, -0.5009]], requires_grad=True)),
 ('layer.0.bias',
  Parameter containing:
  tensor([ 0.4836, -0.1486, -0.1222], requires_grad=True)),
 ('layer.1.weight',
  Parameter containing:
  tensor([[ 0.2767,  0.2350, -0.3810]], requires_grad=True)),
 ('layer.1.bias',
  Parameter containing:
  tensor([0.2128], requires_grad=True))]

In [390]:
# ordered mapping from each parameter name to its tensor
model.state_dict()

OrderedDict([('linear1.weight',
              tensor([[-0.1983],
                      [-0.3121],
                      [ 0.1697]])),
             ('linear1.bias', tensor([ 0.7738,  0.1432, -0.4728])),
             ('linear2.weight',
              tensor([[0.5735, 0.0123, 0.4564],
                      [0.1407, 0.5175, 0.3331]])),
             ('linear2.bias', tensor([-0.2943,  0.4056])),
             ('layer.0.weight',
              tensor([[ 0.0045, -0.5676],
                      [-0.1482, -0.6382],
                      [-0.3664, -0.5009]])),
             ('layer.0.bias', tensor([ 0.4836, -0.1486, -0.1222])),
             ('layer.1.weight', tensor([[ 0.2767,  0.2350, -0.3810]])),
             ('layer.1.bias', tensor([0.2128]))])

In [377]:
# different learning rate for different layers
# (only linear1 and layer are handed to the optimizer; linear2 is in no group)

optim1 = SGD(
    params=[
        {'params': model.linear1.parameters(), 'lr': 0.001},
        {'params': model.layer.parameters()},   # no 'lr' given for this group -> falls back to the lr passed to the optimizer
    ],
    lr=0.0001,
)

optim1

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 0.001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0

Parameter Group 1
    dampening: 0
    differentiable: False
    foreach: None
    lr: 0.0001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [382]:
# different learning rate for weights and bias
# (groups are selected by substring match on the parameter name)

optim2 = SGD(
    params=[
        {'params': [p for n, p in model.named_parameters() if 'weight' in n], 'lr': 0.001},
        {'params': [p for n, p in model.named_parameters() if 'bias' in n]},
    ],
    lr=0.0001,
)

optim2

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 0.001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0

Parameter Group 1
    dampening: 0
    differentiable: False
    foreach: None
    lr: 0.0001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [391]:
optim2.state_dict()     # optimizer state plus the hyperparameters of each parameter group (parameters are referenced by index)

{'state': {},
 'param_groups': [{'lr': 0.001,
   'momentum': 0,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'maximize': False,
   'foreach': None,
   'differentiable': False,
   'params': [0, 1, 2, 3]},
  {'lr': 0.0001,
   'momentum': 0,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'maximize': False,
   'foreach': None,
   'differentiable': False,
   'params': [4, 5, 6, 7]}]}