In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleConvNet(nn.Module):
    def __init__(self):
        super(SimpleConvNet, self).__init__()
        self.conv = nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=0, bias=False)
        self.fc = nn.Linear(4, 1, bias=False)  # Assuming the output of conv is flattened to 4 elements

    def forward(self, x):
        x = F.sigmoid(self.conv(x))
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

# Initialize the model
model = SimpleConvNet()

# Define inputs
input_feature_map = torch.randn(1, 1, 4, 4, requires_grad=True)  # Batch size of 1, 1 channel, 4x4 feature map
true_label = torch.tensor([[17.]], requires_grad=False)  # Placeholder for 'yy'

# Forward pass
output = model(input_feature_map)

# Calculate the loss
loss_fn = nn.MSELoss()
loss = loss_fn(output, true_label)

# Backward pass to compute gradients
loss.backward()

# Directly access the gradient of the first weight of the fc layer
gradient_wrt_fc1 = model.fc.weight.grad[0][0]
print(f"Gradient with respect to the first weight of fc: {gradient_wrt_fc1}")


Gradient with respect to the first weight of fc: -16.440515518188477


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

def sigmoid(x):
    return 1 / (1 + torch.exp(-x))

# Assuming model is your SimpleConvNet and has been defined and initialized
model.eval()  # Set the model to evaluation mode

# Forward pass to get the activations right before the fc layer
conv_output = F.sigmoid(model.conv(input_feature_map))
conv_output_flattened = torch.flatten(conv_output, 1)

# Manual calculation of the gradient of the first fc layer weight
# Calculate dL/do = 2 * (o - y) since it's a single output node for MSE loss
dL_do = 2 * (output - true_label)

# The input to the fc layer's first weight is the corresponding element of the flattened conv output
do_dw = conv_output_flattened[0][0]  # derivative of the output w.r.t. the weight is the input itself

# Gradient w.r.t the first weight is the product of these derivatives
gradient_wrt_fc1_manual = dL_do * do_dw

print(f"Manual gradient calculation of fc1's first weight: {gradient_wrt_fc1_manual.item()}")

# Compare with PyTorch's computed gradient
print(f"PyTorch gradient of fc1's first weight: {model.fc.weight.grad[0][0].item()}")


Manual gradient calculation of fc1's first weight: -25.68474769592285
PyTorch gradient of fc1's first weight: -25.68474769592285


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

def sigmoid(x):
    return 1 / (1 + torch.exp(-x))

def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))

# Assuming the model and inputs are already defined
model.eval()  # Evaluation mode

# Forward pass through the convolutional layer
conv_output = model.conv(input_feature_map)
sigmoid_output = F.sigmoid(conv_output)

# Calculate the derivative of the loss w.r.t. the output
dL_do = 2 * (output - true_label)

# Forward pass through the rest of the network to get to the output
conv_output_flattened = torch.flatten(sigmoid_output, 1)
fc_output = model.fc(conv_output_flattened)

# Now, calculate the gradient of `weight[1][1]` in the conv layer
# Identify the contributions of weight[1][1] to the convolution output
# Note: This requires understanding which input pixels affect weight[1][1]

grad_w11 = 0
for i in range(2):  # Assuming a 4x4 input, the kernel affects positions [1, 2] for both i and j
    for j in range(2):
        # Get the derivative of the sigmoid activation w.r.t. the conv output
        dAct_dConv = sigmoid_prime(conv_output[0][0][i][j])
        
        # The input that corresponds to weight[1][1] for this position
        input_contrib = input_feature_map[0][0][i+1][j+1]
        
        # Gradient contribution for this position
        # Multiply by the derivative of the loss w.r.t. fc output (chain through fc weights)
        for ii in range(4):
            grad_w11 += (dL_do * model.fc.weight[0][ii] * dAct_dConv * input_contrib).item()

# This is a simplified and not fully accurate calculation,
# it illustrates the approach but skips over batch handling and complete chaining through the network
print(f"Manual gradient calculation of conv's weight[1][1]: {grad_w11}")

# Compare with PyTorch's computed gradient
# model.zero_grad()
loss.backward()
print(f"PyTorch gradient of conv's weight[1][1]: {model.conv.weight.grad[0][0][1][1].item()}")


Manual gradient calculation of conv's weight[1][1]: 2.956057980656624
PyTorch gradient of conv's weight[1][1]: 0.40257540345191956
