In [3]:
import torch

def test_matmul():
    """Check autograd gradients of ``y = x @ w.T + b`` against closed-form values.

    Builds fixed tensors ``x`` (T, C), ``w`` (OC, C) and ``b`` (OC,), runs the
    forward pass, back-propagates a hand-picked upstream gradient ``dy`` and
    prints every tensor involved.  The gradients produced by autograd are then
    compared with the analytic expressions::

        dx = dy @ w        dw = dy.T @ x        db = dy.sum(dim=0)

    Returns:
        None.  Results are printed to stdout.

    Raises:
        AssertionError: if a tensor shape or a gradient differs from the
            expected value.
    """
    # Dimensions for matrix multiplication (tensors below are 2-D: no batch axis)
    T, C, OC = 3, 4, 2

    # Define inputs, weights, and biases with predefined values and set requires_grad
    x = torch.tensor([[0.1, -0.2, 0.3, 0.4],
                      [0.5, -0.6, 0.7, 0.8],
                      [0.9, 1.0, -1.1, 1.2]], dtype=torch.float32, requires_grad=True)
    w = torch.tensor([[0.1, 0.2, 0.3, 0.4],
                      [0.5, 0.6, 0.7, 0.8]], dtype=torch.float32, requires_grad=True)
    b = torch.tensor([0.1, -0.1], dtype=torch.float32, requires_grad=True)

    # Forward pass: Compute output
    y = torch.matmul(x, w.t()) + b

    # Upstream gradient for y; non-uniform on purpose so each row/column of the
    # resulting dx/dw/db is distinguishable.
    dy = torch.tensor([[0.1, -0.2],
                       [-1.0, -0.8],
                       [1.0, 1.0]], dtype=torch.float32)

    # Backward pass: Apply the specified gradient dy
    y.backward(gradient=dy)

    # Output results (printed before the checks so they are visible on failure)
    print("Input X:\n", x)
    print("Weight W:\n", w)
    print("Bias B:\n", b)
    print("Output Y:\n", y)
    print("Specified gradient for Y (dy):\n", dy)
    print("Gradient w.r.t. X (dx):\n", x.grad)
    print("Gradient w.r.t. Weights (dw):\n", w.grad)
    print("Gradient w.r.t. Biases (db):\n", b.grad)

    # Shape sanity checks tying the literals above to the declared dimensions.
    assert x.shape == (T, C)
    assert w.shape == (OC, C)
    assert b.shape == (OC,)
    assert y.shape == (T, OC) and dy.shape == y.shape

    # Compare autograd with the closed-form gradients of a linear layer.
    with torch.no_grad():
        expected_dx = dy @ w
        expected_dw = dy.t() @ x
        expected_db = dy.sum(dim=0)
    assert torch.allclose(x.grad, expected_dx, atol=1e-6), "dx mismatch"
    assert torch.allclose(w.grad, expected_dw, atol=1e-6), "dw mismatch"
    assert torch.allclose(b.grad, expected_db, atol=1e-6), "db mismatch"

# Run the matmul gradient demo when this cell/script is executed directly.
if __name__ == "__main__":
    test_matmul()


Input X:
 tensor([[ 0.1000, -0.2000,  0.3000,  0.4000],
        [ 0.5000, -0.6000,  0.7000,  0.8000],
        [ 0.9000,  1.0000, -1.1000,  1.2000]], requires_grad=True)
Weight W:
 tensor([[0.1000, 0.2000, 0.3000, 0.4000],
        [0.5000, 0.6000, 0.7000, 0.8000]], requires_grad=True)
Bias B:
 tensor([ 0.1000, -0.1000], requires_grad=True)
Output Y:
 tensor([[0.3200, 0.3600],
        [0.5600, 0.9200],
        [0.5400, 1.1400]], grad_fn=<AddBackward0>)
Specified gradient for Y (dy):
 tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])
Gradient w.r.t. X (dx):
 tensor([[0.6000, 0.8000, 1.0000, 1.2000],
        [0.6000, 0.8000, 1.0000, 1.2000],
        [0.6000, 0.8000, 1.0000, 1.2000]])
Gradient w.r.t. Weights (dw):
 tensor([[ 1.5000,  0.2000, -0.1000,  2.4000],
        [ 1.5000,  0.2000, -0.1000,  2.4000]])
Gradient w.r.t. Biases (db):
 tensor([3., 3.])


In [9]:
import torch

def test_layernorm():
    """Check the autograd input gradient of ``torch.nn.LayerNorm`` on a fixed input.

    Normalizes a fixed (T, C) tensor over its last dimension with a freshly
    constructed ``LayerNorm(C)``, back-propagates a hand-picked upstream
    gradient ``dy``, prints the tensors involved and compares ``x.grad`` with
    the closed-form LayerNorm backward::

        x_hat  = (x - mean) * rstd
        dx_hat = dy * weight
        dx     = rstd * (dx_hat - mean(dx_hat) - x_hat * mean(dx_hat * x_hat))

    where the means are taken over the last dimension.

    Returns:
        None.  Results are printed to stdout.

    Raises:
        AssertionError: if the autograd gradient differs from the analytic one.
    """
    # Dimensions for LayerNorm (input is 2-D: no batch axis)
    T, C = 3, 4

    # Define the input with predefined values and set requires_grad
    x = torch.tensor([[0.1, -0.2, 0.3, 0.4],
                      [0.5, -0.6, 0.7, 0.8],
                      [0.9, 1.0, -1.1, 1.2]], dtype=torch.float32, requires_grad=True)
    assert x.shape == (T, C)

    # Forward pass: normalized_shape must equal the size of x's last dimension
    # (C); any other value makes the forward call raise a RuntimeError.
    layer_norm = torch.nn.LayerNorm(C)
    y = layer_norm(x)

    # Create a specific gradient for the output y
    dy = torch.tensor([[0.1, -0.2, 0.3, 0.4],
                       [0.5, -0.6, 0.7, 0.8],
                       [0.9, 1.0, -1.1, 1.2]], dtype=torch.float32)

    # Backward pass: Apply the specified gradient dy
    y.backward(gradient=dy)

    # Output results (printed before the check so they are visible on failure)
    print("Input X:\n", x)
    print("Output Y:\n", y)
    print("Specified gradient for Y (dy):\n", dy)
    print("Gradient w.r.t. X (dx):\n", x.grad)

    # Closed-form LayerNorm backward, using the module's own eps and weight.
    with torch.no_grad():
        mean = x.mean(dim=-1, keepdim=True)
        centered = x - mean
        var = (centered ** 2).mean(dim=-1, keepdim=True)  # biased variance
        rstd = torch.rsqrt(var + layer_norm.eps)
        x_hat = centered * rstd
        dx_hat = dy * layer_norm.weight
        expected_dx = rstd * (
            dx_hat
            - dx_hat.mean(dim=-1, keepdim=True)
            - x_hat * (dx_hat * x_hat).mean(dim=-1, keepdim=True)
        )
    assert torch.allclose(x.grad, expected_dx, atol=1e-5), "dx mismatch"

In [13]:
# Image Example: LayerNorm over the channel and spatial dimensions of an
# (N, C, H, W) batch.
# NOTE(review): the random input is not seeded, so values differ between runs;
# only the printed shapes are deterministic.
N, C, H, W = 20, 5, 10, 10
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the kernel session.
input = torch.randn(N, C, H, W)
# Normalize over the last three dimensions (i.e. the channel and spatial dimensions).
# NOTE(review): the original comment referred to an "image below"; no image is
# visible in this part of the notebook.
layer_norm = torch.nn.LayerNorm([C, H, W])
output = layer_norm(input)

print(output.shape)  # torch.Size([20, 5, 10, 10])
# The affine weight has the same shape as normalized_shape, i.e. [C, H, W].
print(layer_norm.weight.shape)  # torch.Size([5, 10, 10])

torch.Size([20, 5, 10, 10])
torch.Size([5, 10, 10])
