**Adding shortcut connections**

In [19]:
import torch
import torch.nn as nn

class GELU(nn.Module):
    """Gaussian Error Linear Unit activation, computed with the tanh approximation.

    GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))

    The module has no parameters or state; it is applied element-wise, so the
    output has the same shape as the input.
    """

    def forward(self, x):
        """Apply the activation element-wise to tensor ``x`` and return the result."""
        # Constant factor sqrt(2 / pi) of the approximation, built as a 0-dim tensor.
        scale = torch.sqrt(torch.tensor(2.0 / torch.pi))
        # Cubic polynomial that is fed into tanh.
        inner = x + 0.044715 * torch.pow(x, 3)
        return 0.5 * x * (1 + torch.tanh(scale * inner))
    

**A neural network to illustrate shortcut connections**

In [6]:

class ExampleDeepNeuralNetwork(nn.Module):
    """Stack of ``Linear -> GELU`` layers with optional shortcut (residual) connections.

    One layer is created for every consecutive pair in ``layer_sizes``, so a
    list of ``n`` sizes yields ``n - 1`` layers (e.g. ``[3, 3, 3, 3, 3, 1]``
    gives five layers).

    Args:
        layer_sizes: sequence of feature sizes; ``layer_sizes[i]`` is the input
            size and ``layer_sizes[i + 1]`` the output size of layer ``i``.
        use_shortcut: if True, a layer's input is added to its output whenever
            both have the same shape.

    Raises:
        ValueError: if ``layer_sizes`` has fewer than two entries, since no
            layer can be built from it.
    """

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes must contain at least two entries (input and output size)"
            )
        self.use_shortcut = use_shortcut
        # Pair each size with its successor: (s0, s1), (s1, s2), ...
        # Layers are created in order, so seeded initialisation is reproducible.
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(in_features, out_features), GELU())
            for in_features, out_features in zip(layer_sizes, layer_sizes[1:])
        ])

    def forward(self, x):
        """Pass ``x`` through every layer in order and return the final output."""
        for layer in self.layers:
            layer_output = layer(x)  # compute the output of the current layer
            # A shortcut needs use_shortcut=True and identical input/output
            # shapes, otherwise the element-wise addition is not defined.
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x
    



In [18]:
# Network that accepts 3 input values and returns 1 output value.
# Five layers: the first four map 3 -> 3 features, the last maps 3 -> 1.
# With shortcuts enabled only the four shape-preserving layers can add their
# input to their output; the final 3 -> 1 layer changes the shape and cannot.
layer_sizes = [3] * 5 + [1]
sample_input = torch.tensor([[1.0, 0.0, -1.0]])

# print(sample_input.shape)

In [None]:
# Fix the RNG seed so the randomly initialised weights are reproducible.
torch.manual_seed(123)
# Baseline network with shortcut connections disabled.
model_without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=False)


*Implementing a function that computes the gradients in the model's backward pass*

In [9]:
def print_gradients(model,x):
    """Run one forward/backward pass and print the mean absolute gradient of each weight.

    The loss is the mean squared error between the model output and an
    all-zero target of the same shape.

    Args:
        model: ``nn.Module`` whose weight gradients should be reported.
        x: input tensor that is passed to ``model``.

    Side effects:
        Clears and then repopulates ``.grad`` on the model's parameters, and
        prints one line per parameter whose name contains ``'weight'``.
    """
    # backward() accumulates into .grad, so clear stale gradients first;
    # otherwise calling this twice on the same model reports inflated values.
    model.zero_grad()

    # forward pass
    output = model(x)
    # All-zero target matching the output's shape, dtype and device.
    target = torch.zeros_like(output)

    # calculating loss (separate names for the loss module and the loss value)
    loss_fn = nn.MSELoss()
    loss = loss_fn(output, target)

    # backpropagation
    loss.backward()

    for name, param in model.named_parameters():
        # Parameters that did not take part in the forward pass have no gradient.
        if 'weight' in name and param.grad is not None:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")



In [11]:
# Without shortcuts, the printed gradient means shrink towards the early layers.
print_gradients(model=model_without_shortcut, x=sample_input)


layers.0.0.weight has gradient mean of 0.00020173587836325169
layers.1.0.weight has gradient mean of 0.0001201116101583466
layers.2.0.weight has gradient mean of 0.0007152041653171182
layers.3.0.weight has gradient mean of 0.001398873864673078
layers.4.0.weight has gradient mean of 0.005049646366387606


In [16]:
# Re-seed with the same value as for the baseline model before constructing.
torch.manual_seed(123)
# Same architecture, this time with shortcut connections enabled.
model_with_shortcut = ExampleDeepNeuralNetwork(
    layer_sizes=layer_sizes,
    use_shortcut=True,
)


In [17]:
# With shortcuts, the printed gradient means stay at a similar magnitude in every layer.
print_gradients(model=model_with_shortcut, x=sample_input)

layers.0.0.weight has gradient mean of 0.2216978669166565
layers.1.0.weight has gradient mean of 0.20694100856781006
layers.2.0.weight has gradient mean of 0.3289698660373688
layers.3.0.weight has gradient mean of 0.2665731906890869
layers.4.0.weight has gradient mean of 1.3258538246154785
