# Adam Optimizer (Adaptive Moment Estimation) from Scratch

This notebook implements the Adam optimizer from scratch using only NumPy, including bias correction and optional weight decay (L2 regularization).


## Imports


In [None]:
import numpy as np

print("Libraries imported successfully!")


## Adam Optimizer Implementation


In [None]:
class Adam:
    """
    Adam (Adaptive Moment Estimation) optimizer implemented from scratch using NumPy.

    Adam combines the advantages of two other extensions of stochastic gradient descent:
    - Adaptive Gradient Algorithm (AdaGrad) that maintains per-parameter learning rates
    - Root Mean Square Propagation (RMSProp) that maintains a moving average of squared gradients

    Parameters:
    -----------
    lr : float, optional
        Learning rate (step size) for weight updates (default: 0.001)
    beta1 : float, optional
        Exponential decay rate for the first moment estimates, in [0, 1) (default: 0.9)
    beta2 : float, optional
        Exponential decay rate for the second moment estimates, in [0, 1) (default: 0.999)
    eps : float, optional
        Small constant for numerical stability (default: 1e-8)
    weight_decay : float, optional
        L2 regularization coefficient; the term weight_decay * w is added to the
        gradient before the moment updates (default: 0.0)

    Raises:
    -------
    ValueError
        If lr, eps or weight_decay is negative, or if beta1 or beta2 lies outside [0, 1).

    Notes:
    ------
    Optimizer state (m, v, t) is stored in dictionaries keyed by ``id(params)``.
    Pass the *same* array object to every ``step`` call for a given parameter set:
    a copy or a freshly created view has a different identity and is treated as a
    new parameter set with zeroed moments.

    Examples:
    --------
    >>> optimizer = Adam(lr=0.01, beta1=0.9, beta2=0.999, weight_decay=0.01)
    >>> optimizer.step(weights, gradients)
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0.0):
        # Reject values that would silently corrupt the update. In particular a
        # beta of exactly 1 makes the bias-correction denominator (1 - beta**t) zero.
        if lr < 0:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= beta1 < 1.0:
            raise ValueError(f"Invalid beta1 (must be in [0, 1)): {beta1}")
        if not 0.0 <= beta2 < 1.0:
            raise ValueError(f"Invalid beta2 (must be in [0, 1)): {beta2}")
        if eps < 0:
            raise ValueError(f"Invalid eps: {eps}")
        if weight_decay < 0:
            raise ValueError(f"Invalid weight_decay: {weight_decay}")

        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.weight_decay = weight_decay

        # Dictionary to store first moment (momentum) for each parameter set
        self.m = {}
        # Dictionary to store second moment (uncentered variance) for each parameter set
        self.v = {}
        # Dictionary to store step count for bias correction
        self.t = {}

    def step(self, params, grads):
        """
        Perform a single Adam optimization step.

        Parameters:
        -----------
        params : numpy.ndarray
            Model parameters (weights) to be updated (modified in-place)
        grads : array_like
            Gradients with respect to the parameters (not modified)

        Algorithm:
        ----------
        1. Apply weight decay if enabled: g_t = g_t + weight_decay * w_{t-1}
        2. Update biased first moment estimate: m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
        3. Update biased second raw moment estimate: v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2
        4. Compute bias-corrected first moment: m_hat_t = m_t / (1 - beta1^t)
        5. Compute bias-corrected second moment: v_hat_t = v_t / (1 - beta2^t)
        6. Update parameters: w_t = w_{t-1} - lr * m_hat_t / (sqrt(v_hat_t) + eps)
        """
        # Accept plain sequences as gradients; arrays pass through unchanged.
        grads = np.asanyarray(grads)

        # State is looked up by object identity, so the caller must keep
        # passing the same array object for this parameter set.
        param_id = id(params)

        # Initialize moment estimates and step count on first sight of this array
        if param_id not in self.m:
            self.m[param_id] = np.zeros_like(params)
            self.v[param_id] = np.zeros_like(params)
            self.t[param_id] = 0

        # Increment step count (t starts at 1 so the bias correction is well-defined)
        self.t[param_id] += 1
        t = self.t[param_id]

        # Apply L2 regularization (weight decay) to gradients.
        # This builds a new array, so the caller's gradient is left untouched.
        if self.weight_decay > 0:
            grads = grads + self.weight_decay * params

        # Update biased first moment estimate
        # m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
        self.m[param_id] = self.beta1 * self.m[param_id] + (1 - self.beta1) * grads

        # Update biased second raw moment estimate
        # v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2
        self.v[param_id] = self.beta2 * self.v[param_id] + (1 - self.beta2) * (grads ** 2)

        # Compute bias-corrected first moment estimate
        # m_hat_t = m_t / (1 - beta1^t)
        m_hat = self.m[param_id] / (1 - self.beta1 ** t)

        # Compute bias-corrected second raw moment estimate
        # v_hat_t = v_t / (1 - beta2^t)
        v_hat = self.v[param_id] / (1 - self.beta2 ** t)

        # Update parameters; slice assignment writes into the caller's array
        # w_t = w_{t-1} - lr * m_hat_t / (sqrt(v_hat_t) + eps)
        params[:] = params - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)


## Numerical Example: Parameter Updates Over Several Steps


In [None]:
# Example from requirements - exact code provided
import numpy as np

# Fix the seed so the weights and gradients drawn below are reproducible.
np.random.seed(42)
# Weights are drawn first, then gradients (the order matters for reproducibility).
W, grads = np.random.randn(2, 2), np.random.randn(2, 2)

adam = Adam(lr=0.01)

# Apply the same gradient five times; W is updated in place by step().
for step_no in range(1, 6):
    adam.step(W, grads)
    print(f"Step {step_no}, Updated Weights:\n", W)


## Detailed Example: Showing Intermediate Values


In [None]:
# Walk through five Adam steps, printing every intermediate quantity
# (m_t, v_t, m_hat_t, v_hat_t) next to the formula that produced it.
np.random.seed(42)
W = np.random.randn(2, 2)
grads = np.random.randn(2, 2)

rule = "=" * 70
print("Initial weights W:")
print(W)
print("\nInitial gradients:")
print(grads)
print(f"\n{rule}\n")

adam = Adam(lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8)

# The optimizer keys its state dictionaries by the identity of the array.
param_id = id(W)

for prev_idx in range(5):
    cur_idx = prev_idx + 1

    print(f"Step {cur_idx}:")
    print(f"  Learning rate: {adam.lr}")
    print(f"  Beta1: {adam.beta1}, Beta2: {adam.beta2}, Eps: {adam.eps}")

    # The optimizer increments t inside step(), so the value used by this
    # iteration is one more than what is currently stored (0 before step 1).
    upcoming_t = adam.t.get(param_id, 0) + 1
    print(f"  Step count for this iteration (t): {upcoming_t}")

    # Snapshot the state before stepping; before the first step nothing is
    # stored yet, so the moments fall back to zeros.
    zeros = np.zeros_like(W)
    m_before = adam.m.get(param_id, zeros).copy()
    v_before = adam.v.get(param_id, zeros).copy()
    W_before = W.copy()

    print("\n  Before step():")
    print(f"    m_{prev_idx}: {m_before}")
    print(f"    v_{prev_idx}: {v_before}")
    print(f"    W_{prev_idx}: {W_before}")
    print(f"    Current gradients: {grads}")

    # Perform the update (W is modified in place)
    adam.step(W, grads)

    # Read back the state the optimizer just wrote
    t_now = adam.t[param_id]
    m_now = adam.m[param_id]
    v_now = adam.v[param_id]
    grads_sq = grads**2
    one_minus_beta1 = 1 - adam.beta1
    one_minus_beta2 = 1 - adam.beta2

    print("\n  After step() - Updated moments:")
    print(f"    m_{cur_idx} = beta1 * m_{prev_idx} + (1 - beta1) * g")
    print(f"    m_{cur_idx} = {adam.beta1} * {m_before} + {one_minus_beta1} * {grads}")
    print(f"    m_{cur_idx}: {m_now}")
    print(f"    v_{cur_idx} = beta2 * v_{prev_idx} + (1 - beta2) * g^2")
    print(f"    v_{cur_idx} = {adam.beta2} * {v_before} + {one_minus_beta2} * {grads_sq}")
    print(f"    v_{cur_idx}: {v_now}")

    # Recompute the bias-corrected moments exactly as step() does
    m_scale = 1 - adam.beta1 ** t_now
    v_scale = 1 - adam.beta2 ** t_now
    m_corrected = m_now / m_scale
    v_corrected = v_now / v_scale

    print(f"\n  Bias correction (t={t_now}):")
    print(f"    m_hat_{cur_idx} = m_{cur_idx} / (1 - beta1^{t_now})")
    print(f"    m_hat_{cur_idx} = {m_now} / {m_scale}")
    print(f"    m_hat_{cur_idx}: {m_corrected}")
    print(f"    v_hat_{cur_idx} = v_{cur_idx} / (1 - beta2^{t_now})")
    print(f"    v_hat_{cur_idx} = {v_now} / {v_scale}")
    print(f"    v_hat_{cur_idx}: {v_corrected}")

    print("\n  Parameter update:")
    denom = np.sqrt(v_corrected) + adam.eps
    print(f"    sqrt(v_hat_{cur_idx}) + eps: {denom}")
    print(f"    W_{cur_idx} = W_{prev_idx} - lr * m_hat_{cur_idx} / (sqrt(v_hat_{cur_idx}) + eps)")
    print(f"    W_{cur_idx} = {W_before} - {adam.lr} * {m_corrected} / {denom}")
    print(f"    Updated W_{cur_idx}: {W}")
    print("-" * 70)


## Example: Adam with Weight Decay


In [None]:
# Example demonstrating Adam with weight decay.
# Both runs start from a copy of the same weights and use the same gradient,
# so any difference in the output comes from the weight-decay term alone.
np.random.seed(42)

W = np.random.randn(2, 2)
grads = np.random.randn(2, 2)

print("Adam without weight decay:")
adam_no_decay = Adam(lr=0.01, weight_decay=0.0)
W_no_decay = W.copy()

for i in range(3):
    adam_no_decay.step(W_no_decay, grads)
    print(f"Step {i+1}, W:\n{W_no_decay}\n")

print("\n" + "="*70 + "\n")
adam_with_decay = Adam(lr=0.01, weight_decay=0.01)
# Read the coefficient from the optimizer so the label cannot drift from the config.
print(f"Adam with weight decay (weight_decay={adam_with_decay.weight_decay}):")
W_with_decay = W.copy()

for i in range(3):
    # step() adds weight_decay * W to the gradient using the weights as they
    # are *before* the update, so capture that term before stepping. The
    # product is a new array and is unaffected by the in-place update.
    decay_term = adam_with_decay.weight_decay * W_with_decay
    adam_with_decay.step(W_with_decay, grads)
    print(f"Step {i+1}, W:\n{W_with_decay}\n")
    print(f"  Weight decay contribution: {decay_term}")
    print()


## Step-by-Step Explanation

### Adam Algorithm Overview

Adam (Adaptive Moment Estimation) is an optimization algorithm that computes adaptive learning rates for each parameter.

### Key Components:

1. **First Moment (Momentum)**: Tracks an exponential moving average of the gradients
   - $m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t$

2. **Second Moment (Uncentered Variance)**: Tracks an exponential moving average of the squared gradients
   - $v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2$

3. **Bias Correction**: Since $m_t$ and $v_t$ are initialized to zero, they are biased towards zero, especially during early iterations. Bias correction accounts for this:
   - $\hat{m}_t = \frac{m_t}{1 - \beta_1^t}$
   - $\hat{v}_t = \frac{v_t}{1 - \beta_2^t}$

4. **Parameter Update**: 
   - $\theta_t = \theta_{t-1} - \alpha \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}$
   - Where $\alpha$ is the learning rate and $\epsilon$ is a small constant for numerical stability

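5. **Weight Decay (L2 Regularization)**: If enabled, a penalty term is added to the gradient *before* the moment updates above, using the parameters from the previous step:
   - $g_t \leftarrow g_t + \lambda \, \theta_{t-1}$, where $\lambda$ is the `weight_decay` coefficient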

### Advantages:
- Adaptive learning rate per parameter
- Combines benefits of momentum and RMSProp
- Good default hyperparameters work well for many problems
- Bias correction makes early iterations more reliable


In [None]:
# Comparison: Different beta values.
# Both optimizers start from identical weights and see the same sequence of
# gradients, so the only thing that differs between the runs is the betas.
np.random.seed(42)

W_1 = np.random.randn(2, 2)
W_2 = W_1.copy()
# One gradient per step. With a constant gradient g the bias-corrected moments
# are exactly m_hat = g and v_hat = g^2 for any betas, so the two runs would
# be indistinguishable; a varying gradient is needed to see the effect.
grad_steps = np.random.randn(3, 2, 2)

print("Adam with default betas (beta1=0.9, beta2=0.999):")
adam1 = Adam(lr=0.01, beta1=0.9, beta2=0.999)
for i, grads in enumerate(grad_steps):
    adam1.step(W_1, grads)
    print(f"Step {i+1}: {W_1}")

print("\n" + "="*70 + "\n")
print("Adam with lower betas (beta1=0.5, beta2=0.5):")
adam2 = Adam(lr=0.01, beta1=0.5, beta2=0.5)
for i, grads in enumerate(grad_steps):
    adam2.step(W_2, grads)
    print(f"Step {i+1}: {W_2}")
