<a href="https://colab.research.google.com/github/andrews/pytorch_tutorials/blob/main/d2i_modules_builders_guide.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dive into Deep Learning
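Worked examples for the *Builders' Guide* chapter (layers and modules) of [Dive into Deep Learning](https://d2l.ai).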

## Installation

In [1]:
import torch
# Display the installed PyTorch version string for this runtime.
torch.__version__
# Optional GPU sanity checks; uncomment when running on a CUDA runtime:
# torch.cuda.current_device()
# torch.cuda.get_device_name(0)

'2.3.0+cu121'

In [3]:
import torch
from torch import nn
from torch.nn import functional as F

In [4]:
# Two-layer MLP assembled from stock layers. The lazy layers defer choosing
# their input width until the first forward pass.
net = nn.Sequential(
    nn.LazyLinear(256),
    nn.ReLU(),
    nn.LazyLinear(10),
)

# Toy batch: 2 samples with 20 features each.
X = torch.rand((2, 20))



In [6]:
# Forward pass of `net` on the batch X; the result is kept in `n` for inspection.
n = net(X)

In [7]:
# Shape of the forward-pass result stored in `n`.
n.shape

torch.Size([2, 10])

In [9]:
class MLP(nn.Module):
  """Two-layer perceptron: a 256-unit hidden layer with ReLU, then a 10-unit output layer.

  Both layers are lazy, so their input widths are inferred from the first
  batch that is passed through the network.
  """

  def __init__(self):
    super().__init__()
    # The attribute names double as the parameter prefixes in state_dict().
    self.hidden = nn.LazyLinear(256)
    self.out = nn.LazyLinear(10)

  def forward(self, X):
    """Return the output-layer values for the batch ``X``."""
    hidden_activations = F.relu(self.hidden(X))
    outputs = self.out(hidden_activations)
    return outputs

In [11]:
# MLP is a regular nn.Module, so it can be used as a layer or component inside
# other networks/modules, and can itself be subclassed.
m = MLP()
# Display the output shape for the batch X.
m(X).shape



torch.Size([2, 10])

In [13]:
# under the hood of how Sequential class actually works. Daisy-chains the modules together
class MySequential(nn.Module):
  """Minimal re-implementation of ``nn.Sequential``.

  The modules given to the constructor are registered under the names
  "0", "1", ... and applied one after another in ``forward``.

  Args:
    *args: Modules to chain, in call order. The same module object may be
      passed more than once (weight sharing); it is then applied once per
      position.
  """

  def __init__(self, *args):
    super().__init__()
    for idx, module in enumerate(args):
      # Registering under the position index keeps the call order and makes
      # the layer's parameters visible to parameters()/state_dict().
      self.add_module(str(idx), module)

  def forward(self, X):
    """Feed ``X`` through every registered module in order and return the result."""
    # Walk the registry itself rather than self.children(): children() yields
    # each distinct module object only once, so a layer passed in twice would
    # silently be applied a single time.
    for module in self._modules.values():
      if module is None:
        # add_module() accepts None placeholders; skip them, as children() did.
        continue
      X = module(X)
    return X

In [14]:
# Build a three-layer stack with the hand-written container instead of nn.Sequential.
ms = MySequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
# Display the output shape for the batch X.
ms(X).shape



torch.Size([2, 10])

In [16]:
# random constant added
class FixedMLP(nn.Module):
  """MLP with a constant random matrix, a reused layer, and Python control flow.

  ``forward`` applies ``self.linear``, multiplies by the fixed (untrained)
  ``rand_weight`` matrix, applies ``self.linear`` a second time, halves the
  result until its absolute sum is at most 1, and returns the sum of all
  entries as a 0-dim tensor.
  """

  def __init__(self):
    super().__init__()
    # Constant random matrix: it is never trained, so it is not an nn.Parameter.
    # Registering it as a buffer makes it follow .to()/.cuda()/.double() together
    # with the rest of the module; a plain tensor attribute would stay behind
    # and cause a device/dtype mismatch in forward(). persistent=False keeps
    # the state_dict() keys exactly as they were with a plain attribute.
    self.register_buffer("rand_weight", torch.rand((20, 20)), persistent=False)
    self.linear = nn.LazyLinear(20)

  def forward(self, X):
    """Return a scalar (0-dim) tensor computed from the batch ``X``."""
    X = self.linear(X)
    X = F.relu(X @ self.rand_weight + 1)
    # Reuse: pass the activations through the same linear layer again, so its
    # weights are used twice in one forward pass.
    X = self.linear(X)
    # Control flow: keep halving until the absolute sum is at most 1.
    while X.abs().sum() > 1:
      X /= 2
    # Reduces the whole batch to a single value, hence the empty output shape.
    return X.sum()

In [17]:
# Instantiate FixedMLP and run it on X; forward() ends with X.sum(), so a single value is displayed.
fmlp = FixedMLP()
fmlp(X)



tensor(0.0759, grad_fn=<SumBackward0>)

In [19]:
# The output is a 0-dim (scalar) tensor, so its shape is empty.
# Had the shape been [x, 1], x would be the number of samples and
# 1 the output size per sample, as in the earlier examples.
fmlp(X).shape

torch.Size([])

In [22]:
# Modules nest freely: this block wraps an nn.Sequential plus one more layer.
class NestMLP(nn.Module):
    """Nested block: a Sequential of two lazy linear + ReLU stages (64 and 32 units), then a 16-unit lazy linear layer."""

    def __init__(self):
        super().__init__()
        hidden_stack = [
            nn.LazyLinear(64), nn.ReLU(),
            nn.LazyLinear(32), nn.ReLU(),
        ]
        self.net = nn.Sequential(*hidden_stack)
        self.linear = nn.LazyLinear(16)

    def forward(self, X):
        """Run ``X`` through the inner stack, then through the final linear layer."""
        features = self.net(X)
        return self.linear(features)

# Mix hand-written blocks with a built-in layer in a single container:
# NestMLP -> 20-unit lazy linear layer -> FixedMLP.
chimera = nn.Sequential(
    NestMLP(),
    nn.LazyLinear(20),
    FixedMLP(),
)
chimera(X)



tensor(0.0502, grad_fn=<SumBackward0>)

In [25]:
# chapter question
class ParallelMLP(nn.Module):
  """Run two sub-networks side by side and merge their outputs with a linear layer.

  Args:
    net1: Module applied to the first input.
    net2: Module applied to the second input.
    out_features: Width of the merging linear layer's output. Defaults to 20,
      the width this block has always produced.
  """

  def __init__(self, net1, net2, out_features=20):
    super().__init__()
    self.hnet1 = net1
    self.hnet2 = net2
    # LazyLinear takes only out_features (the input width is inferred on the
    # first call); its second positional argument is `bias`. The previous
    # `nn.LazyLinear(20, 10)` therefore meant "20 outputs, bias enabled", not
    # "20 inputs, 10 outputs". Spell that out without the misleading argument.
    self.linear = nn.LazyLinear(out_features)

  def forward(self, X1, X2):
    """Return ``linear(concat(net1(X1), net2(X2)))``, concatenating along dim 1."""
    h1o = self.hnet1(X1)
    h2o = self.hnet2(X2)
    # Concatenate the two branch outputs along dim=1 before merging them.
    return self.linear(torch.cat((h1o, h2o), dim=1))

In [26]:
# Both branches are fresh MLP instances, and both receive the same batch X.
pm = ParallelMLP(MLP(), MLP())
pm(X, X)

tensor([[ 0.0847, -0.0235,  0.2173, -0.1349, -0.0194,  0.1340,  0.0129, -0.1180,
          0.1877, -0.2754, -0.2155, -0.0283,  0.0038, -0.1234,  0.3625, -0.2176,
          0.1989,  0.1126, -0.2849,  0.0579],
        [ 0.0672, -0.0375,  0.2277, -0.0960,  0.0018,  0.1173, -0.0297, -0.1408,
          0.1618, -0.3578, -0.1740, -0.0145, -0.0349, -0.0846,  0.3938, -0.2156,
          0.1601,  0.1826, -0.3229, -0.0162]], grad_fn=<AddmmBackward0>)