In [2]:
import mlx.core as mx
import mlx.nn as nn

In [3]:
import mlx.core as mx
import mlx.nn as nn
# Creating array objects
# MLX uses mx.array() instead of torch.Tensor()
# MLX doesn't require explicit grad tracking (no requires_grad=True)
# By default, MLX uses float32, unlike PyTorch which defaults to float32 but we used .double() for float64 MLX does not support float64
x1 = mx.array([2.0])
x2 = mx.array([0.0])
w1 = mx.array([-3.0])
w2 = mx.array([1.0])
b = mx.array([6.8813735870195432])

# Forward pass: Constructing the computational graph
# This is similar to PyTorch, but MLX handles grad tracking implicitly
# MLX uses a functional approach, so operations create new arrays instead of modifying in-place
n = x1*w1 + x2*w2 + b
o = mx.tanh(n)

# Printing the output value
# MLX arrays can be directly converted to Python scalars using .item()
# Unlike PyTorch, there's no need for .data.item() as MLX doesn't have a separate .data attribute
print(o.item())

# Backward pass and gradient computation
# MLX uses a functional approach with value_and_grad for combined forward and backward passes
# This is different from PyTorch's o.backward() method
def forward(x1, x2, w1, w2, b):
    n = x1*w1 + x2*w2 + b
    return mx.tanh(n).sum()  # Sum to ensure a scalar output
    # MLX's value_and_grad requires a scalar output, hence the .sum()
    # This differs from PyTorch where .backward() can be called on any tensor

# Creating a function that computes both value and gradients
# argnums specifies which inputs we want gradients for (all of them in this case)
# This replaces PyTorch's .backward() and automatic grad accumulation
grad_func = mx.value_and_grad(forward, argnums=[0, 1, 2, 3, 4])

# Compute value and gradients
# This single line replaces separate forward and backward passes in PyTorch
# value is the output of the forward function, grads is a tuple of gradients
value, grads = grad_func(x1, x2, w1, w2, b)

print('---')
# Printing gradients
# In MLX, gradients are returned as a tuple from value_and_grad
# This differs from PyTorch where gradients are stored in .grad attributes
# We use .item() to convert single-element arrays to Python scalars
print('x2', grads[1].item())
print('w2', grads[3].item())
print('x1', grads[0].item())
print('w1', grads[2].item())

0.7071067094802856
---
x2 0.5000001192092896
w2 0.0
x1 -1.5000003576278687
w1 1.000000238418579


In [4]:
# Block 1: Network Setup
import mlx.core as mx
import mlx.nn as nn

class Neuron(nn.Module):
    def __init__(self, nin):
        super().__init__()
        # MLX: We use mx.random.uniform for initialization, similar to PyTorch's nn.Parameter(torch.randn())
        # However, MLX doesn't require explicit Parameter wrapping; it treats these as learnable parameters automatically
        self.w = mx.random.uniform(low=-1, high=1, shape=(nin,))
        self.b = mx.random.uniform(low=-1, high=1, shape=(1,))
    
    def __call__(self, x):
        # MLX: Forward pass is similar to PyTorch, but we use mx.sum instead of torch.sum
        # MLX doesn't have in-place operations, so we don't use += for accumulation
        act = mx.sum(self.w * x) + self.b
        # MLX: We use mx.tanh instead of torch.tanh, but the functionality is the same
        return mx.tanh(act)

    def parameters(self):
        # MLX: This method is similar to PyTorch, but MLX doesn't have a concept of nn.Parameter
        # We simply return the weight and bias arrays
        return [self.w, self.b]

class Layer(nn.Module):
    def __init__(self, nin, nout):
        super().__init__()
        # MLX: This is identical to the PyTorch version in structure
        self.neurons = [Neuron(nin) for _ in range(nout)]
    
    def __call__(self, x):
        # MLX: Forward pass is similar to PyTorch, but we use mx.stack instead of torch.stack for multiple outputs
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else mx.stack(outs)
    
    def parameters(self):
        # MLX: This is functionally identical to the PyTorch version
        return [p for neuron in self.neurons for p in neuron.parameters()]

class MLP(nn.Module):
    def __init__(self, nin, nouts):
        super().__init__()
        # MLX: This initialization is identical to the PyTorch version
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]
    
    def __call__(self, x):
        # MLX: Forward pass through layers is identical to PyTorch in structure
        for layer in self.layers:
            x = layer(x)
        return x
    
    def parameters(self):
        # MLX: Parameter collection is the same as in PyTorch
        return [p for layer in self.layers for p in layer.parameters()]

In [5]:
# Block 2: Example Usage
# MLX: We use mx.array instead of torch.tensor, but the concept is the same
x = mx.array([2.0, 3.0, -1.0])
n = MLP(3, [4, 4, 1])
# MLX: Forward pass is called the same way as in PyTorch
output = n(x)
print("Example output:", output.item())

Example output: -0.7436423897743225


In [6]:
# Block 3: Training Data Setup
# MLX: We use mx.array instead of torch.tensor for data creation
xs = mx.array([
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
])
ys = mx.array([1.0, -1.0, -1.0, 1.0])

In [8]:
# Block 4: Training Loop (Fixed Version)

def mse_loss(pred, target):
    return mx.mean((pred - target) ** 2)

def forward(params, mlp, x):
    # Reconstruct MLP with given parameters
    param_idx = 0
    for layer in mlp.layers:
        for neuron in layer.neurons:
            neuron.w, neuron.b = params[param_idx:param_idx+2]
            param_idx += 2
    
    # Forward pass
    return mx.stack([mlp(xi) for xi in x])

def loss_fn(params, mlp, x, y):
    ypred = forward(params, mlp, x)
    return mse_loss(ypred, y)

# MLX: We use mx.value_and_grad for automatic differentiation
# This is different from PyTorch's autograd system
loss_and_grad_fn = mx.value_and_grad(loss_fn)

for k in range(20):
    # MLX: We need to explicitly collect parameters before each iteration
    # In PyTorch, parameters are continuously tracked
    params = n.parameters()
    
    # MLX: Compute loss and gradients
    loss, grads = loss_and_grad_fn(params, n, xs, ys)
    
    # Compute predictions (for printing purposes)
    ypred = forward(params, n, xs)
    
    # Update step: Simple gradient descent
    # MLX: We manually update parameters, similar to PyTorch without optimizers
    learning_rate = 0.1
    param_idx = 0
    for layer in n.layers:
        for neuron in layer.neurons:
            # MLX: Parameter updates are done by direct assignment
            # PyTorch would typically use in-place operations (+=)
            neuron.w = neuron.w - learning_rate * grads[param_idx]
            neuron.b = neuron.b - learning_rate * grads[param_idx + 1]
            param_idx += 2
    
    print(f"Iteration {k}, Loss: {loss.item():.6f}")
    if k % 5 == 0:  # Print predictions every 5 iterations
        print(f"Predictions: {ypred.squeeze()}")


Iteration 0, Loss: 1.272661
Predictions: array([-0.743642, 0.0116596, -0.263198, -0.684275], dtype=float32)
Iteration 1, Loss: 1.073713
Iteration 2, Loss: 1.004134
Iteration 3, Loss: 1.002370
Iteration 4, Loss: 1.002011
Iteration 5, Loss: 1.001711
Predictions: array([-0.0149352, 0.0652816, 0.0484395, -0.00359025], dtype=float32)
Iteration 6, Loss: 1.001456
Iteration 7, Loss: 1.001240
Iteration 8, Loss: 1.001056
Iteration 9, Loss: 1.000900
Iteration 10, Loss: 1.000767
Predictions: array([-0.0100767, 0.043739, 0.0323335, -0.00246122], dtype=float32)
Iteration 11, Loss: 1.000653
Iteration 12, Loss: 1.000557
Iteration 13, Loss: 1.000474
Iteration 14, Loss: 1.000405
Iteration 15, Loss: 1.000345
Predictions: array([-0.00678606, 0.0293517, 0.0216606, -0.00167083], dtype=float32)
Iteration 16, Loss: 1.000294
Iteration 17, Loss: 1.000251
Iteration 18, Loss: 1.000214
Iteration 19, Loss: 1.000182


In [9]:
# Block 5: Final Predictions
# MLX: Compute final predictions after training
final_params = n.parameters()
_, final_ypred = forward_and_loss(final_params, n, xs, ys)
print("\nFinal Predictions (ypred):")
print(final_ypred)

# Key differences summary:
# 1. MLX uses functional-style differentiation (mx.value_and_grad) vs PyTorch's autograd
# 2. MLX requires manual parameter management and updates, while PyTorch handles this more automatically
# 3. MLX doesn't have built-in optimizers like PyTorch, requiring manual implementation of optimization algorithms
# 4. MLX uses explicit array operations (mx.array, mx.stack) vs PyTorch's tensor operations
# 5. MLX doesn't require explicit gradient zeroing before each iteration
# 6. MLX's parameter updates are done by reassignment, not in-place operations like in PyTorch


Final Predictions (ypred):
array([[-0.00456464],
       [0.019711],
       [0.0145348],
       [-0.00112792]], dtype=float32)
