In [57]:
import numpy as np

class Tensor:
  def __init__(self, data, _children=(), _op=''):
    # Allow Parameter objects to pass through (since Parameter inherits from Tensor)
    if hasattr(data, 'data') and hasattr(data, 'grad'):  # It's a Tensor-like object
      self.data = data.data if hasattr(data, 'data') else data
    elif isinstance(data, (int, float, np.ndarray)):
      if isinstance(data, (int, float)):
        self.data = np.array(data)
      else:
        self.data = data
    else:
      raise TypeError(f"Data must be a number or numpy array, got {type(data)}")
    self.grad = np.zeros_like(self.data, dtype=float)  #initialize gradiant
    self._backward = lambda:None
    self._op = _op
    self._prev = set(_children) # Set of input Tensors that created this Tensor
    self.is_parameter = False # Flag for identifying parameters

  def __add__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad = self.grad + out.grad
            other.grad = other.grad + out.grad
        out._backward = _backward
        return out

  def __mul__(self, other):
    other = other if isinstance(other, Tensor) else Tensor(other)
    out = Tensor(self.data * other.data, (self, other), '*')

    def _backward():
      self.grad = self.grad + other.data * out.grad
      other.grad = other.grad + self.data * out.grad
    out._backward = _backward
    return out

  def __pow__(self, other):
    assert isinstance(other, (int, float)), "only supporting int/float powers for now"
    out = Tensor(self.data**other, (self,), f'**{other}')

    def _backward():
      self.grad = self.grad + (other * self.data**(other-1)) * out.grad
    out._backward = _backward
    return out

  def relu(self):
    out = Tensor(np.maximum(0, self.data), (self,), 'relu')

    def _backward():
      self.grad = self.grad + (out.data > 0) * out.grad
    out._backward = _backward
    return out

  # Basic matrix multiplication for neural networks
  def matmul(self, other):
    other = other if isinstance(other, Tensor) else Tensor(other)
    assert self.data.ndim == 2 and other.data.ndim == 2 and self.data.shape[1] == other.data.shape[0], f"Shape mismatch for matmul: {self.data.shape} @ {other.data.shape}"

    out = Tensor(self.data @ other.data, (self, other), '@')

    def _backward():
      self.grad = self.grad + out.grad @ other.data.T
      other.grad = other.grad + self.data.T @ out.grad
    out._backward = _backward
    return out

  def sum(self, axis=None, keepdims=False):
    out = Tensor(np.sum(self.data, axis=axis, keepdims=keepdims), (self,), 'sum')

    def _backward():
      if axis is not None and not keepdims:
        # Need to expand grad if sum reduced dimensions
        shape_tuple = tuple(1 if i == axis else self.data.shape[i] for i in range(self.data.ndim))
        self.grad = self.grad + np.reshape(out.grad, shape_tuple)
      else:
        self.grad = self.grad + out.grad
    out._backward = _backward
    return out

  def backward(self):
    # Topological sort for correct backpropagation order
    topo = []
    visited = set()
    def build_topo(v):
      if v not in visited:
        visited.add(v)
        for child in v._prev:
          build_topo(child)
        topo.append(v)
    build_topo(self)

    self.grad = np.ones_like(self.data, dtype=float) # Initialize gradient for the output
    for node in reversed(topo):
      node._backward()

  # Enable operations like -x, x-y, /x, x/y
  def __neg__(self): # -self
    return self * -1

  def __sub__(self, other): # self - other
    return self + (-other)

  def __rsub__(self, other): # other - self
    return other + (-self)

  def __truediv__(self, other): # self / other
    return self * (other**-1)

  def __rtruediv__(self, other): # other / self
    return other * (self**-1)

  def __repr__(self):
    return f"Tensor(data={self.data}, grad={self.grad})"

In [58]:
class Parameter(Tensor):
  def __init__(self, data):
    super().__init__(data)
    self.is_parameter = True

In [59]:
class Module(object):
  def __init__(self):
    self._parameters = {}  # Dictionary to hold parameters
    self._modules = {}     # Dictionary to hold sub-modules

  def __setattr__(self, name, value):
    if isinstance(value, Parameter):
      # print(f"  -> Identified as Parameter! Adding to _parameters['{name}']") # Debug print
      self._parameters[name] = value
      super().__setattr__(name, value)
    elif isinstance(value, Module):
      # print(f"  -> Identified as Module! Adding to _modules['{name}']") # Debug print
      self._modules[name] = value
      super().__setattr__(name, value)
    else:
      super().__setattr__(name, value)

  def __call__(self, *args, **kwargs):
    return self.forward(*args, **kwargs)

  def forward(self, *args, **kwargs):
    raise NotImplementedError          # must be implemented by subclasses

  def parameters(self):
    # yields all the parameters of this module and all the sub-modules recursively
    for name, param in self._parameters.items():
      yield param
    for name, module in self._modules.items():
      yield from module.parameters()

  def zero_grad(self):
    # Iterate over the parameters by calling the parameters() method
    for p in self.parameters():
      p.grad = np.zeros_like(p.data, dtype=float)

In [60]:
class Linear(Module):
  def __init__(self, in_features, out_features):
    super().__init__()
    limit = np.sqrt(1/in_features)
    # limit = np.sqrt(6.0 / (in_features + out_features)) # Xavier/Glorot initialization
    self.weight = Parameter(np.random.uniform(-limit, limit, (in_features, out_features)))
    self.bias = Parameter(np.random.uniform(0.0, 0.1, out_features))

  def forward(self, x):
    return x.matmul(self.weight) + self.bias

In [61]:
class ReLU(Module):
  def __init__(self):
    super().__init__()

  def forward(self, x):
    return x.relu()

In [62]:
class Sequential(Module):
  def __init__(self, *modules):
    super().__init__() # Initialize parent Module class
    for i, module in enumerate(modules):
      self._modules[str(i)] = module      # register sub-modules by index

  def forward(self, x):
    for module in self._modules.values():
      x = module(x)
    return x

In [63]:
class SGD():
  def __init__(self, parameters, lr):
    # Collect parameters from the generator into a list
    self.parameters = list(parameters)
    self.lr = lr

  def step(self):
    for p in self.parameters:
      # Ensure parameter has a gradient before updating
      if p.grad is not None:
        p.data = p.data - self.lr * p.grad

  def zero_grad(self):
    for p in self.parameters:
      p.grad = np.zeros_like(p.data, dtype=float)

In [64]:
def mse_loss(predictions, target):
  return (predictions - target)**2.0

Training Example

In [65]:
# Training data
X_train = Tensor(np.array([[1.0], [2.0], [3.0], [4.0]]))
y_train = Tensor(np.array([[2.0], [4.0], [6.0], [8.0]]))

# Model definition
model = Sequential(Linear(in_features=1, out_features=10), ReLU(), Linear(in_features=10, out_features=1))

# Optimizer
optimizer = SGD(model.parameters(), lr=0.01)

# Training loop
epochs=100
print("Starting training..")
for epoch in range(epochs):
  # forward pass
  predictions = model(X_train)

  # calculate loss
  loss = mse_loss(predictions, y_train).sum()

  # zero grad
  optimizer.zero_grad()

  # backward pass
  loss.backward()

  # update parameters
  optimizer.step()

  if (epoch + 1) % 10 == 0:
    print(f"Epoch {epoch+1}/{epochs}, Loss {loss.data.item():.4f}")

print("\nTraining complete!")
print("Learned parameters:")
for name, param in model._parameters.items():
  print(f"  {name}: {param.data}")
for name, sub_module in model._modules.items():
  if isinstance(sub_module, Linear):
    print(f"  Linear Layer {name} weights: {sub_module.weight.data}")
    print(f"  Linear Layer {name} biases: {sub_module.bias.data}")

Starting training..
Epoch 10/100, Loss 23.6878
Epoch 20/100, Loss 21.7195
Epoch 30/100, Loss 19.2039
Epoch 40/100, Loss 17.0257
Epoch 50/100, Loss 15.1081
Epoch 60/100, Loss 13.4183
Epoch 70/100, Loss 11.9274
Epoch 80/100, Loss 10.6107
Epoch 90/100, Loss 9.4465
Epoch 100/100, Loss 8.4176

Training complete!
Learned parameters:
  Linear Layer 0 weights: [[-0.2403281  -0.0414584   0.42834648  1.25953893 -0.81124179 -0.96802148
   0.56237794 -0.91466226 -0.09683355 -0.52302415]]
  Linear Layer 0 biases: [[ 0.00301466  0.10723951  0.0929473  -0.0046927   0.01843069  0.08433935
   0.01297196  0.09408899  0.05350213  0.02333097]
 [ 0.00301466  0.07586421  0.10598381  0.02293886  0.01843069  0.08433935
   0.02686618  0.09408899  0.05350213  0.02333097]
 [ 0.00301466  0.0780951   0.11781379  0.04728558  0.01843069  0.08433935
   0.03927733  0.09408899  0.05350213  0.02333097]
 [ 0.00301466  0.0780951   0.12964327  0.07163091  0.01843069  0.08433935
   0.05168785  0.09408899  0.05350213  0.0233

In [56]:
# Test 1: Simple multiplication
print("=== Test 1: Simple multiplication ===")
x = Tensor(2.0)
y = Tensor(3.0)
z = x * y
print(f"x: {x}")
print(f"y: {y}")
print(f"z: {z}")
print("\nCalling z.backward():")
z.backward()
print(f"\nAfter backward:")
print(f"x: {x}")
print(f"y: {y}")
print("Expected: x.grad=3.0, y.grad=2.0")

print("\n" + "="*50)

# Test 2: Your exact setup
print("=== Test 2: Your training setup ===")
# Single training example
X = Tensor(np.array([[1.0]]))
y_true = Tensor(np.array([[2.0]]))

# Single weight and bias (like a minimal linear layer)
W = Tensor(np.array([[0.5]]))  # Random weight
b = Tensor(np.array([0.1]))    # Small bias

print(f"X: {X}")
print(f"y_true: {y_true}")
print(f"W: {W}")
print(f"b: {b}")

# Forward pass: y_pred = X @ W + b
y_pred = X.matmul(W) + b
print(f"y_pred: {y_pred}")

# Loss: (y_pred - y_true)^2
diff = y_pred + (y_true * -1)  # y_pred - y_true
loss = diff ** 2
print(f"diff: {diff}")
print(f"loss: {loss}")

print("\nCalling loss.backward():")
loss.backward()

print(f"\nAfter backward:")
print(f"W.grad: {W.grad}")
print(f"b.grad: {b.grad}")

# Manual gradient check
print(f"\nManual verification:")
print(f"y_pred.data = {y_pred.data[0,0]}")
print(f"y_true.data = {y_true.data[0,0]}")
print(f"diff = {y_pred.data[0,0] - y_true.data[0,0]}")
print(f"Expected W.grad = 2 * diff * X = 2 * {y_pred.data[0,0] - y_true.data[0,0]} * {X.data[0,0]} = {2 * (y_pred.data[0,0] - y_true.data[0,0]) * X.data[0,0]}")
print(f"Expected b.grad = 2 * diff = 2 * {y_pred.data[0,0] - y_true.data[0,0]} = {2 * (y_pred.data[0,0] - y_true.data[0,0])}")

=== Test 1: Simple multiplication ===
x: Tensor(data=2.0, grad=0.0)
y: Tensor(data=3.0, grad=0.0)
z: Tensor(data=6.0, grad=0.0)

Calling z.backward():

After backward:
x: Tensor(data=2.0, grad=3.0)
y: Tensor(data=3.0, grad=2.0)
Expected: x.grad=3.0, y.grad=2.0

=== Test 2: Your training setup ===
X: Tensor(data=[[1.]], grad=[[0.]])
y_true: Tensor(data=[[2.]], grad=[[0.]])
W: Tensor(data=[[0.5]], grad=[[0.]])
b: Tensor(data=[0.1], grad=[0.])
y_pred: Tensor(data=[[0.6]], grad=[[0.]])
diff: Tensor(data=[[-1.4]], grad=[[0.]])
loss: Tensor(data=[[1.96]], grad=[[0.]])

Calling loss.backward():

After backward:
W.grad: [[-2.8]]
b.grad: [[-2.8]]

Manual verification:
y_pred.data = 0.6
y_true.data = 2.0
diff = -1.4
Expected W.grad = 2 * diff * X = 2 * -1.4 * 1.0 = -2.8
Expected b.grad = 2 * diff = 2 * -1.4 = -2.8
