In [5]:
import torch
a = torch.tensor([1,2,3])
b = torch.tensor([2,3,4])

torch.sum((a-b)**2)/3

tensor(1.)

In [8]:
import torch

# --- PyTorch Muscle Memory Challenge ---

def linear_forward_pass_and_backward(X: torch.Tensor, Y: torch.Tensor, W: torch.Tensor, B: torch.Tensor) -> tuple[float, torch.Tensor]:
    """
    Performs a single layer linear forward pass (Y_pred = X @ W + B), 
    calculates the Mean Squared Error (MSE) loss, and computes gradients.

    Args:
        X (torch.Tensor): Input features, shape (N, D_in).
        Y (torch.Tensor): Actual target values, shape (N, D_out).
        W (torch.Tensor): Weight matrix, shape (D_in, D_out).
        B (torch.Tensor): Bias vector, shape (D_out,).

    Returns:
        tuple[float, torch.Tensor]: The scalar MSE loss, and the calculated 
                                    gradient of the loss with respect to W (dL/dW).
    """
    # 1. Linear Forward Pass
    # Compute the matrix multiplication X @ W and add the bias B.
    # Hint: B will be automatically broadcast across the N dimension.
    Y_pred = X @ W + B

    # 2. Loss Calculation (Mean Squared Error, MSE)
    # Calculate the element-wise squared difference, sum it, and divide by the number of elements.
    # We want a scalar loss value.
    N = X.size(0) # Number of samples
    loss = torch.sum((Y_pred - Y)**2)/N

    # 3. Automatic Differentiation (Backward Pass)
    # Compute the gradients of the loss with respect to all tensors that require_grad (W and B).
    # You must first zero out any existing gradients (optional but good practice).
    # Then, call the .backward() method on the scalar loss.
    # loss.requires_grad()
    loss.backward()
    loss.zero_()

    # 4. Return results
    # Convert the loss tensor to a Python float using .item()
    return loss.item(), W.grad


# --- Setup and Execution ---
if __name__ == "__main__":
    # Define hyper-parameters and dimensions
    N, D_in, D_out = 10, 5, 2  # N=Samples, D_in=Input features, D_out=Output features

    # Create dummy input data and target data
    X_data = torch.randn(N, D_in)
    Y_target = torch.randn(N, D_out)

    # Initialize W and B, requiring gradients for optimization
    W_param = torch.randn(D_in, D_out, requires_grad=True)
    B_param = torch.randn(D_out, requires_grad=True)

    print("--- PyTorch Challenge Execution ---")
    print(f"X shape: {X_data.shape}, Y shape: {Y_target.shape}")
    print(f"W shape: {W_param.shape}, B shape: {B_param.shape}\n")

    # Run the function
    final_loss, grad_W = linear_forward_pass_and_backward(X_data, Y_target, W_param, B_param)

    # --- Verification and Output ---
    print(f"1. Final Scalar MSE Loss: {final_loss:.4f}")
    
    # Check if gradients were computed
    if W_param.grad is not None and B_param.grad is not None:
        print(f"2. Gradient dL/dW computed successfully. Shape: {W_param.grad.shape}")
        print(f"3. Gradient dL/dB computed successfully. Shape: {B_param.grad.shape}")
        
        # Display small sample of the computed gradient
        print(f"   Sample dL/dW (top-left):\n{W_param.grad[:2, :2]}")
    else:
        print("Gradients were not successfully computed.")

--- PyTorch Challenge Execution ---
X shape: torch.Size([10, 5]), Y shape: torch.Size([10, 2])
W shape: torch.Size([5, 2]), B shape: torch.Size([2])

1. Final Scalar MSE Loss: 0.0000
2. Gradient dL/dW computed successfully. Shape: torch.Size([5, 2])
3. Gradient dL/dB computed successfully. Shape: torch.Size([2])
   Sample dL/dW (top-left):
tensor([[-2.6822, -0.4118],
        [ 3.9925, -4.0314]])
