In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
R_matrix = 0
O_matrix = 0

# Define a simple neural network structure
class SimpleConvNet(nn.Module):
    def __init__(self):
        super(SimpleConvNet, self).__init__()
        self.conv = nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=0, bias=False)
        self.fc = nn.Linear(4, 1, bias=False)  # Assuming the output of conv is flattened to 4 elements

    def forward(self, x):
        global R_matrix, O_matrix
        x = self.conv(x)
        R_matrix = x
        print("R matrix:", x)
        x = F.sigmoid(x)
        O_matrix = x
        print("O matrix:", x)
        x = torch.flatten(x, 1)
        print("Flatten:", x)
        x = self.fc(x)
        return x

# Initialize the model
model = SimpleConvNet()

# Define inputs
input_feature_map = torch.randn(1, 1, 4, 4, requires_grad=True)  # Batch size of 1, 1 channel, 4x4 feature map
true_label = torch.tensor([[[17.]]])  # Placeholder for 'yy'

# Forward pass
output = model(input_feature_map)


R matrix: tensor([[[[-0.0900, -0.4393],
          [ 0.2129,  0.3955]]]], grad_fn=<ConvolutionBackward0>)
O matrix: tensor([[[[0.4775, 0.3919],
          [0.5530, 0.5976]]]], grad_fn=<SigmoidBackward0>)
Flatten: tensor([[0.4775, 0.3919, 0.5530, 0.5976]], grad_fn=<ViewBackward0>)


In [2]:
print(output)
loss_fn = nn.MSELoss()
loss = loss_fn(output, true_label)
print(loss)
print(model.conv.weight.grad)

# Backward pass
model.zero_grad()
loss.backward()

# Gradients with respect to the weight and input feature
gradient_wrt_w11 = model.conv.weight.grad[0, 0, 1, 1]

print(f"Gradient with respect to w11: {gradient_wrt_w11}")
print(f"Gradient of the input feature map at position [0][0]: {input_feature_map.grad[0, 0, 0, 0]}")

tensor([[-0.1215]], grad_fn=<MmBackward0>)
tensor(293.1454, grad_fn=<MseLossBackward0>)
None
Gradient with respect to w11: 0.6260359287261963
Gradient of the input feature map at position [0][0]: -0.03959649056196213


  return F.mse_loss(input, target, reduction=self.reduction)


In [3]:
def sigmoid(x):
    return 1 / (1 + torch.exp(-x))

def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))

In [4]:

# w11
value = 2*(output - true_label)
input_aa = [
    input_feature_map[0][0][1][1], input_feature_map[0][0][1][2],
    input_feature_map[0][0][2][1], input_feature_map[0][0][2][2],
    ]
temp = 0
for ii in range(4):
    ww = model.fc.weight[0][ii]
    rr = model.conv(input_feature_map)[0][0][ii//2][ii%2]
    aa = input_aa[ii]
    temp += ww*sigmoid_prime(rr)*aa
gradient = value*temp
print(f"Manual calculated gradient of w11: {gradient.item()}")
print(f"Gradient with respect to conv w11: {gradient_wrt_w11}")


Manual calculated gradient of w11: 0.6260358691215515
Gradient with respect to conv w11: 0.6260359287261963


In [5]:
# wa

value = 2*(output - true_label)
o00 = F.sigmoid(model.conv(input_feature_map)[0][0][0][0])
gradient_wa = value * o00
print(f"Manual calculated gradient of wa: {gradient_wa.item()}")
print(f"Gradient with respect to fc wa:   {model.fc.weight.grad[0][0]}")


Manual calculated gradient of wa: -16.351242065429688
Gradient with respect to fc wa:   -16.351242065429688


In [6]:
# a00

value = 2*(output - true_label)
wa = model.fc.weight[0][0]
rr = model.conv(input_feature_map)[0][0][0][0]
w00 = model.conv.weight[0][0][0][0]
# print(w00)
gradient_a00 = value * wa * sigmoid_prime(rr) * w00
print(f"Manual calculated gradient of a00: {gradient_a00.item()}")
print(f"Gradient of the input feature a00: {input_feature_map.grad[0, 0, 0, 0]}")


Manual calculated gradient of a00: -0.039596494287252426
Gradient of the input feature a00: -0.03959649056196213


In [6]:
from sympy import symbols, diff, exp, Function

# Define symbols
w11, a00, yy = symbols('w11 a00 yy')
x = symbols('x')
sigma = Function('sigma')(x)
y_pred = symbols('y_pred')  # Placeholder for the output of the network

# Define the sigmoid function and its derivative
sigma = 1 / (1 + exp(-x))
d_sigma_dx = diff(sigma, x)

# Assuming the output y_pred is a function of w11 and a00 through some operation, we need to define y_pred
# For simplicity, let's assume y_pred = sigma(w11 * a00), which is not the actual case but simplifies the gradient computation
# In reality, y_pred would depend on the entire convolution operation and all weights

# MSE Loss
mse = (y_pred - yy)**2 / 2
# Gradient of MSE loss w.r.t. y_pred
d_mse_dy_pred = diff(mse, y_pred)

# Let's simplify the gradients with the assumptions
# Assuming a direct relation for simplification: y_pred = sigma(w11 * a00)
y_pred_simple = sigma.subs(x, w11 * a00)

# Gradients of interest
d_loss_dw11 = diff(mse.subs(y_pred, y_pred_simple), w11)
d_loss_da00 = diff(mse.subs(y_pred, y_pred_simple), a00)

d_loss_dw11, d_loss_da00


(a00*(-yy + 1/(1 + exp(-a00*w11)))*exp(-a00*w11)/(1 + exp(-a00*w11))**2,
 w11*(-yy + 1/(1 + exp(-a00*w11)))*exp(-a00*w11)/(1 + exp(-a00*w11))**2)