In [112]:
import torch
import torch.nn as nn
import torch.nn.functional as F
R_matrix = 0
O_matrix = 0

# Define a simple neural network structure
class SimpleConvNet(nn.Module):
    def __init__(self):
        super(SimpleConvNet, self).__init__()
        self.conv = nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=0, bias=False)
        self.fc = nn.Linear(4, 1, bias=False)  # Assuming the output of conv is flattened to 4 elements

    def forward(self, x):
        global R_matrix, O_matrix
        x = self.conv(x)
        R_matrix = x
        print("R matrix:", x)
        x = F.sigmoid(x)
        O_matrix = x
        print("O matrix:", x)
        x = torch.flatten(x, 1)
        print("Flatten:", x)
        x = self.fc(x)
        return x

# Initialize the model
model = SimpleConvNet()

# Define inputs
input_feature_map = torch.randn(1, 1, 4, 4)  # Batch size of 1, 1 channel, 4x4 feature map
true_label = torch.tensor([[[17.]]])  # Placeholder for 'yy'

# Forward pass
output = model(input_feature_map)


R matrix: tensor([[[[-0.0665, -0.1705],
          [ 0.1903, -0.5043]]]], grad_fn=<ConvolutionBackward0>)
O matrix: tensor([[[[0.4834, 0.4575],
          [0.5474, 0.3765]]]], grad_fn=<SigmoidBackward0>)
Flatten: tensor([[0.4834, 0.4575, 0.5474, 0.3765]], grad_fn=<ViewBackward0>)


In [113]:
print(output)
loss_fn = nn.MSELoss()
loss = loss_fn(output, true_label)
print(loss)
print(model.conv.weight.grad)

# Backward pass
model.zero_grad()
loss.backward()
print(model.conv.weight.grad)

# Gradients with respect to the weight and input feature
gradient_wrt_w11 = model.conv.weight.grad[0, 0, 1, 1]
# gradient_wrt_a00 = input_feature_map.grad[0, 0, 0, 0]

print(f"Gradient with respect to w11: {gradient_wrt_w11}")
# print(f"Gradient with respect to input feature a00: {gradient_wrt_a00}")


tensor([[-0.3745]], grad_fn=<MmBackward0>)
tensor(301.8732, grad_fn=<MseLossBackward0>)
None
tensor([[[[-0.5801, -0.8634,  2.1364],
          [ 4.7123, -5.6404, -0.9008],
          [ 1.1170,  0.3789, -0.7894]]]])
Gradient with respect to w11: -5.6403937339782715


In [114]:
print(model.conv.weight.grad)
# print(model.fc.weight.grad)
# print(model.conv.weight)
# print(input_feature_map.grad)

tensor([[[[-0.5801, -0.8634,  2.1364],
          [ 4.7123, -5.6404, -0.9008],
          [ 1.1170,  0.3789, -0.7894]]]])


In [115]:
# print(model.conv.weight)
# print(model.fc.weight)
# print(model.fc.weight[0][0])
# print("==output",model.conv(input_feature_map))
# print("==sigmoid",F.sigmoid(model.conv(input_feature_map)))

# print(model.conv(input_feature_map)[0][0][1][1])
# print(input_feature_map[0][0][1][1])

In [116]:
def sigmoid(x):
    return 1 / (1 + torch.exp(-x))

def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))

In [117]:
print(R_matrix, O_matrix)

tensor([[[[-0.0665, -0.1705],
          [ 0.1903, -0.5043]]]], grad_fn=<ConvolutionBackward0>) tensor([[[[0.4834, 0.4575],
          [0.5474, 0.3765]]]], grad_fn=<SigmoidBackward0>)


In [123]:

# w11
value = -2*(true_label - output)
input_aa = [
    input_feature_map[0][0][1][1], input_feature_map[0][0][1][2],
    input_feature_map[0][0][2][1], input_feature_map[0][0][2][2],
    ]
temp = 0
for ii in range(4):
    ww = model.fc.weight[0][ii]
    rr = model.conv(input_feature_map)[0][0][ii//2][ii%2]
    aa = input_aa[ii]
    temp += ww*sigmoid_prime(rr)*aa
value *= temp
print(value)
print(f"Gradient with respect to w11: {gradient_wrt_w11}")


tensor([[[-5.6404]]], grad_fn=<MulBackward0>)
Gradient with respect to w11: -5.6403937339782715


In [109]:


# Assuming the model and inputs are already defined
model.eval()  # Evaluation mode

# Forward pass through the convolutional layer
conv_output = model.conv(input_feature_map)
sigmoid_output = F.sigmoid(conv_output)

# Calculate the derivative of the loss w.r.t. the output
dL_do = 2 * (output - true_label)
dL_do = 2*(output - true_label)
print(dL_do)
# Forward pass through the rest of the network to get to the output
conv_output_flattened = torch.flatten(sigmoid_output, 1)
fc_output = model.fc(conv_output_flattened)

# Now, calculate the gradient of `weight[1][1]` in the conv layer
# Identify the contributions of weight[1][1] to the convolution output
# Note: This requires understanding which input pixels affect weight[1][1]

grad_w11 = 0
temp = 0
for i in range(2):  # Assuming a 4x4 input, the kernel affects positions [1, 2] for both i and j
    for j in range(2):
        # Get the derivative of the sigmoid activation w.r.t. the conv output
        dAct_dConv = sigmoid_prime(conv_output[0][0][i][j])
        # The input that corresponds to weight[1][1] for this position
        input_contrib = input_feature_map[0][0][i+1][j+1]
        
        # Gradient contribution for this position
        # Multiply by the derivative of the loss w.r.t. fc output (chain through fc weights)
        for ii in range(4):
            grad_w11 += (dL_do * model.fc.weight[0][ii] * dAct_dConv * input_contrib).item()
            temp += (model.fc.weight[0][ii] * dAct_dConv * input_contrib).item()

            # print(model.fc.weight[0][ii])

# This is a simplified and not fully accurate calculation,
# it illustrates the approach but skips over batch handling and complete chaining through the network
print(temp)
print(f"Manual gradient calculation of conv's weight[1][1]: {grad_w11}")


tensor([[[-33.5925]]], grad_fn=<MulBackward0>)
0.09723788453266025
Manual gradient calculation of conv's weight[1][1]: -3.266460955142975


In [None]:
value = 2*(true_label - output)
o00 = F.sigmoid(model.conv(input_feature_map)[0][0][0][0])
print(value * o00)

In [4]:
from sympy import symbols, diff, exp, Function

# Define symbols
w11, a00, yy = symbols('w11 a00 yy')
x = symbols('x')
sigma = Function('sigma')(x)
y_pred = symbols('y_pred')  # Placeholder for the output of the network

# Define the sigmoid function and its derivative
sigma = 1 / (1 + exp(-x))
d_sigma_dx = diff(sigma, x)

# Assuming the output y_pred is a function of w11 and a00 through some operation, we need to define y_pred
# For simplicity, let's assume y_pred = sigma(w11 * a00), which is not the actual case but simplifies the gradient computation
# In reality, y_pred would depend on the entire convolution operation and all weights

# MSE Loss
mse = (y_pred - yy)**2 / 2
# Gradient of MSE loss w.r.t. y_pred
d_mse_dy_pred = diff(mse, y_pred)

# Let's simplify the gradients with the assumptions
# Assuming a direct relation for simplification: y_pred = sigma(w11 * a00)
y_pred_simple = sigma.subs(x, w11 * a00)

# Gradients of interest
d_loss_dw11 = diff(mse.subs(y_pred, y_pred_simple), w11)
d_loss_da00 = diff(mse.subs(y_pred, y_pred_simple), a00)

d_loss_dw11, d_loss_da00


(a00*(-yy + 1/(1 + exp(-a00*w11)))*exp(-a00*w11)/(1 + exp(-a00*w11))**2,
 w11*(-yy + 1/(1 + exp(-a00*w11)))*exp(-a00*w11)/(1 + exp(-a00*w11))**2)

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

yy=10.

# Assume a00 and w11 are the parameters of interest
# Initialize input feature map and convolution kernel
input_feature = torch.randn(1, 1, 4, 4, requires_grad=True)  # 1 input channel, 4x4 size, requires_grad enabled for input
conv_kernel = torch.randn(1, 1, 3, 3, requires_grad=True)  # 1 output channel, 1 input channel, 3x3 kernel size
target = torch.tensor([yy])  # Assuming yy is a known scalar value

# Convolution operation
conv = F.conv2d(input_feature, conv_kernel, bias=None)
# Sigmoid activation
activated_output = torch.sigmoid(conv)
# Flatten
flattened_output = activated_output.view(-1)
# Assume a simple direct regression value from the flattened output for simplicity
# In practice, this would involve a fully connected layer with weights
regression_value = flattened_output.sum()

# MSE Loss
loss = F.mse_loss(regression_value.unsqueeze(0), target)  # Unsqueezing to match dimensions
# Backpropagate
loss.backward()

# Gradients
grad_w11 = conv_kernel.grad[0, 0, 1, 1]  # Gradient with respect to w11, the middle value of the kernel
grad_a00 = input_feature.grad[0, 0, 0, 0]  # Gradient with respect to a00, the top-left value of the input feature

grad_w11, grad_a00


(tensor(-1.0904), tensor(-0.0037))