In [1]:
import torch
import torch.nn as nn

In [2]:
class ManualModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(10, 20)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(20, 5)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

In [3]:
seq_model = nn.Sequential(
    nn.Linear(10, 20),
    nn.ReLU(),
    nn.Linear(20, 5)
)

In [4]:
manual_model = ManualModel()

In [5]:
x = torch.randn(3, 10)
manual_out = manual_model(x)
seq_out = seq_model(x)

print(f"Manual output shape: {manual_out.shape}")
print(f"Sequential output shape: {seq_out.shape}")

Manual output shape: torch.Size([3, 5])
Sequential output shape: torch.Size([3, 5])


In [6]:
print("Does seq_model have a forward method?", hasattr(seq_model, 'forward'))
print("\nWhat is seq_model?", type(seq_model))

Does seq_model have a forward method? True

What is seq_model? <class 'torch.nn.modules.container.Sequential'>


In [7]:
# When you call seq_model(x), Python calls seq_model.forward(x) automatically
# Let's verify they're the same:
x = torch.randn(2, 10)

output1 = seq_model(x)  # This calls __call__ which calls forward
output2 = seq_model.forward(x)  # Direct call to forward

print("\nAre they the same?", torch.allclose(output1, output2))


Are they the same? True


In [8]:
class ModelWithSkipConnection(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(10, 10)
        self.layer2 = nn.Linear(10, 10)
    
    def forward(self, x):
        # Can't do this with Sequential!
        identity = x  # Save input
        x = self.layer1(x)
        x = self.layer2(x)
        x = x + identity  # Add skip connection
        return x

# Try it
model = ModelWithSkipConnection()
x = torch.randn(2, 10)
output = model(x)
print(f"Output shape: {output.shape}")


Output shape: torch.Size([2, 10])


In [10]:
seq_model_2 = nn.Sequential(
    nn.Linear(10, 10),
    nn.Linear(10, 10)
)

seq_out_2 = seq_model_2(x) + x

print(seq_out_2.shape)

torch.Size([2, 10])


In [11]:
print(model)

ModelWithSkipConnection(
  (layer1): Linear(in_features=10, out_features=10, bias=True)
  (layer2): Linear(in_features=10, out_features=10, bias=True)
)


In [12]:
print(seq_model_2)

Sequential(
  (0): Linear(in_features=10, out_features=10, bias=True)
  (1): Linear(in_features=10, out_features=10, bias=True)
)


In [13]:
# Test: Does autograd work with outside skip connection?
seq_model_2 = nn.Sequential(
    nn.Linear(10, 10),
    nn.Linear(10, 10)
)

x = torch.randn(2, 10, requires_grad=True)

# Both work with autograd!
out1 = seq_model_2(x) + x  # Skip outside
out2 = ModelWithSkipConnection()(x)  # Skip inside

loss1 = out1.sum()
loss2 = out2.sum()

loss1.backward(retain_graph=True)
print("Gradients work with outside skip:", x.grad is not None)

x.grad = None  # Reset
loss2.backward()
print("Gradients work with inside skip:", x.grad is not None)

# The REAL problem: Imagine using your model in a larger system
class BiggerModel(nn.Module):
    def __init__(self):
        super().__init__()
        # Can't use seq_model_2 cleanly here!
        # We'd have to remember to add the skip in forward
        self.feature_extractor = seq_model_2
        self.classifier = nn.Linear(10, 2)
    
    def forward(self, x):
        # Easy to forget the + x !
        features = self.feature_extractor(x)  # Oops, missing + x
        return self.classifier(features)

print("\nSee the problem? The skip connection is easy to forget!")

Gradients work with outside skip: True
Gradients work with inside skip: True

See the problem? The skip connection is easy to forget!


In [14]:
seq_model_2

Sequential(
  (0): Linear(in_features=10, out_features=10, bias=True)
  (1): Linear(in_features=10, out_features=10, bias=True)
)

In [21]:

# Test: Does autograd work with outside skip connection?
seq_model_3 = nn.Sequential(
    nn.Linear(10, 10),
    nn.Linear(10, 10)
)

print("Before freezing:")
print(f"Layer 0 requires_grad: {seq_model_3[0].weight.requires_grad}")
print(f"Layer 1 requires_grad: {seq_model_3[1].weight.requires_grad}")

print("Layer 0 parameters:")
for name, param in seq_model_3.named_parameters():
    print(f"  {name}: shape {param.shape}, requires_grad={param.requires_grad}")

print("\nFreezing layer 0...")
for param in seq_model_3.parameters():
    param.requires_grad = False

print("\nAfter freezing:")
for name, param in seq_model_3.named_parameters():
    print(f"  {name}: shape {param.shape}, requires_grad={param.requires_grad}")

print("\nAfter freezing:")
print(f"Layer 0 requires_grad: {seq_model_3[0].weight.requires_grad}")
print(f"Layer 1 requires_grad: {seq_model_3[1].weight.requires_grad}")

Before freezing:
Layer 0 requires_grad: True
Layer 1 requires_grad: True
Layer 0 parameters:
  0.weight: shape torch.Size([10, 10]), requires_grad=True
  0.bias: shape torch.Size([10]), requires_grad=True
  1.weight: shape torch.Size([10, 10]), requires_grad=True
  1.bias: shape torch.Size([10]), requires_grad=True

Freezing layer 0...

After freezing:
  0.weight: shape torch.Size([10, 10]), requires_grad=False
  0.bias: shape torch.Size([10]), requires_grad=False
  1.weight: shape torch.Size([10, 10]), requires_grad=False
  1.bias: shape torch.Size([10]), requires_grad=False

After freezing:
Layer 0 requires_grad: False
Layer 1 requires_grad: False
