In [1]:
from lib.Tensor import Tensor
import numpy as np
from typing import List

In [69]:
class Layer:
    def __init__(self):
        self.input = None
        self.output = None

    def forward(self, input_data):
        """
        Compute the output of this layer using `input_data`.
        """

        raise NotImplementedError
    
    def backward(self, output_gradient):
        """
        Compute the input gradient using `output_gradient` and
        chain it with the local gradient.
        """
        raise NotImplementedError
    
    def __call__(self, input_data):
        """
        A convenient way to chain operations.
        """
        return self.forward(input_data)

In [117]:
class Dense(Layer):
    def __init__(self, input_dim: int, output_dim: int, lr: float = 0.01):
        self.weights = self.init_weights(input_dim, output_dim)
        self.bias = self.init_bias(output_dim)
        self.lr = lr
        self.input = None
    
    # initialize weights and bias
    def init_weights(self, input_dim: int, output_dim: int) -> Tensor:
        arr = np.random.randn(input_dim, output_dim) * 0.01
        return Tensor(arr, requires_grad=True)
    
    def init_bias(self, output_dim: int) -> Tensor:
        arr = np.zeros((1, output_dim))
        return Tensor(arr, requires_grad=True)
    
    # forward pass
    def forward(self, input_data: Tensor) -> Tensor:
        """
        Take an input tensor, multiply it with the weights and add the bias.
        
        X is a matrix of shape (batch_size, input_dim)
        W is a matrix of shape (input_dim, output_dim)
        b is a matrix of shape (1, output_dim)

        output = X @ W + b, matrix of shape (batch_size, output_dim)
        """
        self.input = input_data # save input for backward pass
        # ?not sure if i need to handle the case where batch_size = 1, and input_data is a vector
        self.output = input_data @ self.weights.T + self.bias # matrix multiplication
        return self.output
    
    # backward pass
        # backward pass
    def backward(self) -> None:
        """
        Compute and store gradients for weights and biases based on the output_gradient
        coming from the next layer. Then, compute the gradient for the inputs to be
        sent to the previous layer.

        output_gradient is a tensor of shape (batch_size, output_dim)
        """

        if self.output.grad is None:
            raise RuntimeError("No gradient found. You might need to call backward on the loss Tensor first.")
        
        # compute gradient for weights

In [118]:
class SGD:
    """
    Stochastic gradient descent optimizer.
    """
    def __init__(self, params: List[Tensor], lr: float = 0.01):
        self.params = params # a list of Tensors
        self.lr = lr

    def step(self):
        for param in self.params:
            if param.requires_grad:
                param.data -= self.lr * param.grad
                param.zero_grad()

    def zero_grad(self):
        for param in self.parameters:
            if param.requires_grad:
                param.zero_grad()

In [119]:
layer = Dense(1, 1)

In [130]:
# input_data = Tensor(np.random.randn(1), requires_grad=True) # (batch_size, input_dim)
input_data = np.array(1)
target = Tensor(np.random.randn(1), requires_grad=True) # (batch_size, output_dim)


In [131]:
print(f"weights: {layer.weights}, shape {layer.weights.shape}")
print(f"bias: {layer.bias}, shape {layer.bias.shape}")
print(f"Input data: {input_data}, shape {input_data.shape}")
print(f"Target: {target}, shape {target.shape}")

weights: Tensor([[-0.00544514]], requires_grad=True), shape (1, 1)
bias: Tensor([[0.]], requires_grad=True), shape (1, 1)
Input data: 1, shape ()
Target: Tensor([0.70778891], requires_grad=True), shape (1,)


In [132]:
layer.weights @ input_data + layer.bias

ValueError: matmul: Input operand 1 does not have enough dimensions (has 0, gufunc core with signature (n?,k),(k,m?)->(n?,m?) requires 1)

In [78]:
# Initialize the optimizer
optimizer = SGD([layer.weights, layer.bias], lr=0.01)

In [89]:
print("Weights shape:", layer.weights.shape)
print("Bias shape:", layer.bias.shape)
print("Input shape:", input_data.shape)

Weights shape: (2, 1)
Bias shape: (1, 1)
Input shape: (2, 1)


In [91]:
layer.weights @ input_data.T + layer.bias

Tensor([[-0.00108276 -0.00110982]
 [-0.00262273 -0.00268827]], requires_grad=True)

In [80]:
output = layer(input_data) # forward pass
print(f"Output {output}, shape {output.shape}")
# loss = ((output - target) ** 2).sum() # compute loss
# print(f"Loss {loss}, shape {loss.shape}")

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 1)

In [62]:
loss.backward() # backward pass

Grad: 1.0
Self.grad: 0.0
Im in the backward_sum function now
self.grad: 1.0
self.data: 0.7029554252960425
self.grad * np.ones_like(self.data): 1.0
Grad: 1.0
Self.grad: [[0.]]
Grad: [[-1.67684874]]
Self.grad: [[0.]]
Grad: [[-1.67684874]]
Self.grad: [[0.]]
Grad: [[-1.67684874]]
Self.grad: [0.]


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 1 is different from 2)

In [49]:
optimizer.step()

In [50]:
optimizer.params

[Tensor([[2920.47471859]
  [1752.29304344]], requires_grad=True),
 Tensor([[1130.50583928]], requires_grad=True)]