In [31]:
import torch

In [32]:
# Leaf tensors for a single two-input neuron. requires_grad_(True) asks autograd to
# record every operation on them, which is the role Value objects play in micrograd.
# Caveat: torch.Tensor(...) builds a tensor of the default dtype (float32 unless it
# has been changed), so each literal is rounded to single precision *before*
# .double() widens it to float64. The long constant used for b is affected by this.
x1 = torch.Tensor([2.0]).double().requires_grad_(True)
x2 = torch.Tensor([0.0]).double().requires_grad_(True)
w1 = torch.Tensor([-3.0]).double().requires_grad_(True)
w2 = torch.Tensor([1.0]).double().requires_grad_(True)
b = torch.Tensor([6.8813735870195432]).double().requires_grad_(True)

# Forward pass: weighted sum of the inputs plus bias, squashed by tanh.
# Evaluating the expression is what builds the computational graph.
n = x1*w1 + x2*w2 + b
o = n.tanh()

# .item() extracts the single element as a plain Python float
print(o.item())

# Backward pass: fills in .grad on every leaf tensor that requires grad
o.backward()

print('---')
# Gradient of o with respect to each input and weight, in the same order as before
for label, leaf in (('x2', x2), ('w2', w2), ('x1', x1), ('w1', w1)):
    print(label, leaf.grad.item())

0.7071066904050358
---
x2 0.5000001283844369
w2 0.0
x1 -1.5000003851533106
w1 1.0000002567688737


In [33]:
class Neuron:
    """A single unit computing tanh(w . x + b) over `nin` inputs.

    Weights and bias are stored as `Value` objects so that the expression
    built in `__call__` can be differentiated.
    """

    def __init__(self, nin):
        # Draw the nin weights first, then the bias, each uniformly from [-1, 1]
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        # Start the accumulation from the bias and add one w_i * x_i term per
        # input; pairs beyond the shorter of (self.w, x) are ignored by zip
        total = self.b
        for weight, value in zip(self.w, x):
            total = total + weight * value
        return total.tanh()

    def parameters(self):
        # New list each call: all weights followed by the bias
        return [*self.w, self.b]

class Layer:
    """A group of `nout` independent neurons that all read the same `nin` inputs."""

    def __init__(self, nin, nout):
        units = []
        for _ in range(nout):
            units.append(Neuron(nin))
        self.neurons = units

    def __call__(self, x):
        # Feed the same input to every neuron, keeping the neurons' order
        activations = []
        for unit in self.neurons:
            activations.append(unit(x))
        # A one-neuron layer returns its output bare rather than as a
        # single-element list, which is convenient for a final output layer
        if len(activations) == 1:
            return activations[0]
        return activations

    def parameters(self):
        # Flat list: each neuron's parameters, concatenated in neuron order
        flat = []
        for unit in self.neurons:
            flat.extend(unit.parameters())
        return flat

class MLP:
    """Multi-layer perceptron: a sequence of `Layer` objects applied in order.

    Args:
        nin: number of inputs expected by the first layer.
        nouts: output size of each successive layer. Any iterable of sizes is
            accepted (list, tuple, range, ...); the last entry is the size of
            the network's output.
    """

    def __init__(self, nin, nouts):
        # Unpack into a fresh list so non-list iterables work too;
        # `[nin] + nouts` would raise TypeError for a tuple
        sizes = [nin, *nouts]
        # Each consecutive pair (inputs, outputs) describes one layer
        self.layers = [
            Layer(fan_in, fan_out)
            for fan_in, fan_out in zip(sizes, sizes[1:])
        ]

    def __call__(self, x):
        # Thread the input through the layers; each layer's output is the
        # next layer's input
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        # Flat list of every layer's parameters, in layer order
        return [p for layer in self.layers for p in layer.parameters()]


In [34]:
# Smoke test: build an MLP that takes 3 inputs and has layer sizes 4, 4 and 1
# (two hidden layers of 4 neurons, then a single-neuron output layer), and
# evaluate it on one sample. Parameters are drawn at random when the network is
# constructed, so the displayed value differs between runs.
x = [2.0, 3.0, -1.0]
n = MLP(nin=3, nouts=[4, 4, 1])
n(x)

Value(data=0.9454904521169566)

In [35]:
# Training data: four samples with three features each, used as a tiny
# binary-classification problem.
# ys[i] is the desired network output for xs[i]; every target is +1.0 or -1.0.
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0]  # desired targets, one per row of xs

In [36]:
# Training loop: plain gradient descent on the MLP `n` over the dataset (xs, ys).
# Every iteration runs forward pass -> loss -> gradient reset -> backward pass ->
# parameter update, then prints the step index and the loss.
num_steps = 20
learning_rate = 0.1

for k in range(num_steps):

    # Forward pass: call the network once per training input. Each call sends
    # the input through every layer (weights, biases, activations) and yields
    # one prediction, so ypred ends up with one entry per row of xs.
    # A tensor library would batch this into one operation; a list
    # comprehension matches the scalar-based MLP used here.
    ypred = [n(sample) for sample in xs]

    # Loss: sum of squared errors between predictions and targets. It is not
    # divided by the number of samples, so it is a sum rather than a mean.
    loss = sum((pred - target)**2 for target, pred in zip(ys, ypred))

    # Backward pass: reset every gradient first so that values from the
    # previous iteration do not carry over into this one (assumes
    # Value.backward() accumulates into .grad), then backpropagate.
    for p in n.parameters():
        p.grad = 0.0
    loss.backward()

    # Update: move each parameter a small step against its gradient
    for p in n.parameters():
        p.data += -learning_rate * p.grad

    # The loss shown was computed before this iteration's update
    print(k, loss.data)

0 3.7614611225411547
1 3.586164055770185
2 3.2288574674146737
3 2.391882072245139
4 1.4688342308802875
5 0.7685236492845273
6 0.24036820203407314
7 0.07391691362663215
8 0.05769338818159821
9 0.047098169959730846
10 0.03965150176504377
11 0.03414733995351973
12 0.0299235894113843
13 0.026586570098997824
14 0.023887862214180766
15 0.021663210896747536
16 0.0197998162878694
17 0.01821774061954757
18 0.016858811260829384
19 0.015679722233846147


In [37]:
# Predictions left over from the last training iteration, one per row of xs.
# They were computed before that iteration's parameter update, so they
# correspond to the final printed loss rather than to the fully updated network.
ypred

[Value(data=0.970912240517384),
 Value(data=-0.984132482631298),
 Value(data=-0.9102155416160583),
 Value(data=0.9192497838554978)]