In [21]:
import torch
import torch.nn as nn

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))

In [22]:
from typing import OrderedDict


class ExampleDeepNeuralNetwork(nn.Module):
    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        # self.layers = nn.ModuleList([
        #     nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]), GELU()),
        #     nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
        #     nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
        #     nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
        #     nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU())
        # ])

        self.layers = nn.ModuleList([
            nn.Sequential(OrderedDict([
                ("linear", nn.Linear(layer_sizes[0], layer_sizes[1])), 
                ("activation", GELU()),
            ]))
            # nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
            # nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
            # nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
            # nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU())
        ])

    def forward(self, x):
        for layer in self.layers:
            # Compute the output of the current layer
            layer_output = layer(x)
            # Check if shortcut can be applied
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x

In [23]:
def print_gradients(model, x):
    # Forward pass
    output = model(x)
    target = torch.tensor([[0.]])

    # Calculate loss based on how close the target
    # and output are
    loss = nn.MSELoss()
    loss = loss(output, target)
    print(f"output: {output}. target : {target} loss:{loss}")
    
    # Backward pass to calculate the gradients
    loss.backward()

    for name, param in model.named_parameters():
        print(f"name:{name}")
        
        if 'weight' in name:
            # Print the mean absolute gradient of the weights
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [24]:
layer_sizes = [3, 3, 3, 3, 3, 1]  

sample_input = torch.tensor([[1., 0., -1.]])

torch.manual_seed(123)
model_without_shortcut = ExampleDeepNeuralNetwork(
    layer_sizes, use_shortcut=False
)
print_gradients(model_without_shortcut, sample_input)

output: tensor([[ 0.1948, -0.1414,  0.0909]], grad_fn=<MulBackward0>). target : tensor([[0.]]) loss:0.02207360230386257
name:layers.0.linear.weight
layers.0.linear.weight has gradient mean of 0.03372581675648689
name:layers.0.linear.bias
