# Stochastic Gradient Descent (SGD) from Scratch

This notebook implements Stochastic Gradient Descent (SGD) optimizer from scratch using only NumPy, including support for momentum and weight decay (L2 regularization).


## Imports


In [1]:
import weakref

import numpy as np

print("Libraries imported successfully!")


Libraries imported successfully!


## SGD Optimizer Implementation


In [3]:
class SGD:
    """
    Stochastic Gradient Descent (SGD) optimizer implemented from scratch using NumPy.

    Supports:
    - Vanilla SGD (when momentum=0 and weight_decay=0)
    - Momentum SGD
    - L2 regularization (weight decay)
    - Combined momentum and weight decay

    Parameters:
    -----------
    lr : float
        Learning rate for weight updates (default: 0.01)
    momentum : float, optional
        Momentum factor. When > 0, uses momentum SGD (default: 0.0)
    weight_decay : float, optional
        L2 regularization coefficient (default: 0.0)

    Attributes:
    -----------
    velocity : dict
        Maps ``id(params)`` to the velocity array of that parameter array.
        Entries are created lazily by ``step`` (only when momentum > 0) and
        are removed again once the parameter array is garbage-collected.

    Notes:
    ------
    Momentum state is attached to the *array object* passed to ``step``.
    Pass the same persistent array on every call; a temporary view such as
    ``W[0]`` is a new object each time and therefore never accumulates
    momentum.

    Examples:
    ---------
    >>> optimizer = SGD(lr=0.1, momentum=0.9, weight_decay=0.01)
    >>> optimizer.step(weights, gradients)
    """

    def __init__(self, lr=0.01, momentum=0.0, weight_decay=0.0):
        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.velocity = {}  # id(params) -> velocity array
        # id(params) -> zero-argument callable returning the array that owns
        # that id (normally a weakref). Needed because id() values are only
        # unique among live objects and get recycled after collection.
        self._owners = {}

    def _release(self, key, ref):
        """Drop the state stored under ``key`` if ``ref`` is still its owner."""
        if self._owners.get(key) is ref:
            del self._owners[key]
            self.velocity.pop(key, None)

    def _claim(self, params):
        """
        Return the velocity key for ``params``, discarding stale state.

        A velocity is only reused when it belongs to this exact array object.
        A velocity that was placed in ``self.velocity`` by hand (no owner
        recorded yet) is kept and adopted.
        """
        key = id(params)
        owner = self._owners.get(key)
        if owner is not None and owner() is params:
            return key

        if owner is not None:
            # The id was recycled by a different array: its velocity is stale.
            self.velocity.pop(key, None)

        # Weak reference to the optimizer so the cleanup callback does not
        # create a reference cycle that keeps the optimizer alive.
        optimizer_ref = weakref.ref(self)

        def _on_collect(ref, key=key):
            optimizer = optimizer_ref()
            if optimizer is not None:
                optimizer._release(key, ref)

        try:
            self._owners[key] = weakref.ref(params, _on_collect)
        except TypeError:
            # Object cannot be weakly referenced: hold it strongly instead,
            # which also guarantees its id is never recycled.
            self._owners[key] = lambda held=params: held
        return key

    def step(self, params, grads):
        """
        Perform a single SGD optimization step.

        Parameters:
        -----------
        params : numpy.ndarray
            Model parameters (weights) to be updated (modified in-place)
        grads : numpy.ndarray
            Gradients with respect to the parameters (never modified)

        Notes:
        ------
        - If weight_decay > 0, L2 regularization is applied: g_i = g_i + weight_decay * w_i
        - If momentum > 0, momentum update is used:
          v_i = momentum * v_i - lr * g_i
          w_i = w_i + v_i
        - Otherwise, vanilla SGD: w_i = w_i - lr * g_i
        """
        # Apply L2 regularization (weight decay); builds a new array so the
        # caller's gradient buffer is left untouched.
        if self.weight_decay > 0:
            grads = grads + self.weight_decay * params

        if self.momentum > 0:
            key = self._claim(params)

            # Velocity starts at zero the first time an array is seen
            previous = self.velocity.get(key)
            if previous is None:
                previous = np.zeros_like(params)

            # Update velocity: v_i = momentum * v_i - lr * g_i
            updated = self.momentum * previous - self.lr * grads
            self.velocity[key] = updated

            # Update parameters in place: w_i = w_i + v_i
            params[:] = params + updated
        else:
            # Vanilla SGD: w_i = w_i - lr * g_i
            params[:] = params - self.lr * grads


## Numerical Example: Weight Updates Over Iterations


In [4]:
# Reference example from the requirements: five momentum + weight-decay steps
import numpy as np

sgd = SGD(lr=0.1, momentum=0.9, weight_decay=0.01)

# Seed, then draw the weights first and the gradient second so the printed
# numbers are reproducible.
np.random.seed(0)
W, grads = np.random.randn(2, 2), np.random.randn(2, 2)

# The same gradient is reused on every step; W is updated in place, so each
# print shows the array after that step.
for i in range(5):
    sgd.step(W, grads)
    print(f"Iteration {i+1}, Updated W:\n", W)


Iteration 1, Updated W:
 [[1.57553249 0.49748484]
 [0.8827504  2.25378803]]
Iteration 2, Updated W:
 [[1.2175333  0.68231001]
 [0.70046999 2.2782753 ]]
Iteration 3, Updated W:
 [[0.70736069 0.94569814]
 [0.44070831 2.3131713 ]]
Iteration 4, Updated W:
 [[0.06074218 1.27952955]
 [0.11147324 2.35740025]]
Iteration 5, Updated W:
 [[-0.70803102  1.67642608]
 [-0.27995863  2.40998462]]


## Example: Comparing Vanilla SGD vs Momentum SGD vs Weight Decay


In [5]:
# Side-by-side run of the four optimizer configurations
np.random.seed(42)

# Draw order matters for reproducibility: weights first, then the shared gradient
W_vanilla = np.random.randn(2, 2)
grads = np.random.randn(2, 2)

# Independent copies so every variant starts from identical weights
W_momentum, W_decay, W_both = (W_vanilla.copy() for _ in range(3))

print("Initial weights (all methods start with same values):")
print(W_vanilla)
print("\n" + "="*60 + "\n")

# One optimizer per variant, each with its own momentum state
sgd_vanilla = SGD(lr=0.1, momentum=0.0, weight_decay=0.0)
sgd_momentum = SGD(lr=0.1, momentum=0.9, weight_decay=0.0)
sgd_decay = SGD(lr=0.1, momentum=0.0, weight_decay=0.01)
sgd_both = SGD(lr=0.1, momentum=0.9, weight_decay=0.01)

# (heading, optimizer, weights) for each variant, in display order
variants = [
    ("1. Vanilla SGD (lr=0.1, momentum=0, weight_decay=0):",
     sgd_vanilla, W_vanilla),
    ("\n2. Momentum SGD (lr=0.1, momentum=0.9, weight_decay=0):",
     sgd_momentum, W_momentum),
    ("\n3. Weight Decay SGD (lr=0.1, momentum=0, weight_decay=0.01):",
     sgd_decay, W_decay),
    ("\n4. Momentum + Weight Decay (lr=0.1, momentum=0.9, weight_decay=0.01):",
     sgd_both, W_both),
]

# Three steps are taken per variant but only the first is printed. Velocity
# starts at zero, so on that first step momentum changes nothing: variants
# 1/2 and 3/4 print identical weights.
for heading, optimizer, weights in variants:
    print(heading)
    for i in range(3):
        optimizer.step(weights, grads)
        if i == 0:
            print(f"   After iteration {i+1}:")
            print(f"   {weights}")


Initial weights (all methods start with same values):
[[ 0.49671415 -0.1382643 ]
 [ 0.64768854  1.52302986]]


1. Vanilla SGD (lr=0.1, momentum=0, weight_decay=0):
   After iteration 1:
   [[ 0.52012949 -0.11485061]
 [ 0.48976726  1.44628638]]

2. Momentum SGD (lr=0.1, momentum=0.9, weight_decay=0):
   After iteration 1:
   [[ 0.52012949 -0.11485061]
 [ 0.48976726  1.44628638]]

3. Weight Decay SGD (lr=0.1, momentum=0, weight_decay=0.01):
   After iteration 1:
   [[ 0.51963278 -0.11471234]
 [ 0.48911957  1.44476335]]

4. Momentum + Weight Decay (lr=0.1, momentum=0.9, weight_decay=0.01):
   After iteration 1:
   [[ 0.51963278 -0.11471234]
 [ 0.48911957  1.44476335]]


## Step-by-Step Explanation

### 1. L2 Regularization (Weight Decay)
When `weight_decay > 0`, the gradient is modified:
```
g_i = g_i + weight_decay × w_i
```
This adds a penalty term proportional to the weight magnitude, encouraging smaller weights.

### 2. Momentum Update
When `momentum > 0`, the update rule becomes:
```
v_i = momentum × v_i - lr × g_i
w_i = w_i + v_i
```
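The velocity accumulates an exponentially decaying sum of past gradient steps, which helps accelerate convergence and reduce oscillations. Because `v` starts at zero, the very first step is identical to the update without momentum; the effect only shows from the second step onward.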

### 3. Vanilla SGD
When `momentum = 0` and `weight_decay = 0`:
```
w_i = w_i - lr × g_i
```
Simple gradient descent update.


In [6]:
# Walk through one optimizer step by hand, then check it against SGD.step()
np.random.seed(0)

# Initialize (weights are drawn before gradients)
W = np.random.randn(2, 2)
grads = np.random.randn(2, 2)

print("Step-by-step calculation:")
print(f"\nInitial W:\n{W}")
print(f"\nInitial gradients:\n{grads}")

# This optimizer is only read for its hyperparameters; the manual calculation
# below keeps its own velocity and never touches the optimizer's state.
sgd = SGD(lr=0.1, momentum=0.9, weight_decay=0.01)

print("\n--- Before step() ---")
print(f"Weight decay: {sgd.weight_decay}")
print(f"Momentum: {sgd.momentum}")
print(f"Learning rate: {sgd.lr}")

# Stage 1: fold the L2 penalty into the gradient
if sgd.weight_decay > 0:
    grads_with_decay = grads + sgd.weight_decay * W
    print(f"\nGradients after weight decay:\n{grads_with_decay}")
    print(f"  (grads + {sgd.weight_decay} * W)")
else:
    grads_with_decay = grads

# Stage 2: turn the gradient into a weight update
if sgd.momentum > 0:
    # Velocity starts at zero, exactly as step() initializes it
    velocity = np.zeros_like(W)
    print(f"\nInitial velocity:\n{velocity}")

    # Update velocity
    velocity = sgd.momentum * velocity - sgd.lr * grads_with_decay
    print(f"\nUpdated velocity:\n{velocity}")
    print("  (momentum * v - lr * g)")
    print(f"  ({sgd.momentum} * v - {sgd.lr} * g)")

    W_new = W + velocity
    print(f"\nNew weights:\n{W_new}")
    print("  (W + velocity)")
else:
    # Vanilla update, so W_new is defined for every configuration
    W_new = W - sgd.lr * grads_with_decay
    print(f"\nNew weights:\n{W_new}")
    print("  (W - lr * g)")

# Run the real optimizer from the same starting point for verification.
# Re-seeding reproduces the same initial weights; grads is reused as-is.
np.random.seed(0)
W_actual = np.random.randn(2, 2)
sgd_actual = SGD(lr=0.1, momentum=0.9, weight_decay=0.01)
sgd_actual.step(W_actual, grads)
print("\n--- After step() ---")
print(f"Final W:\n{W_actual}")

# Fail loudly if the manual calculation and step() ever disagree
np.testing.assert_allclose(W_actual, W_new)


Step-by-step calculation:

Initial W:
[[1.76405235 0.40015721]
 [0.97873798 2.2408932 ]]

Initial gradients:
[[ 1.86755799 -0.97727788]
 [ 0.95008842 -0.15135721]]

--- Before step() ---
Weight decay: 0.01
Momentum: 0.9
Learning rate: 0.1

Gradients after weight decay:
[[ 1.88519851 -0.97327631]
 [ 0.9598758  -0.12894828]]
  (grads + 0.01 * W)

Initial velocity:
[[0. 0.]
 [0. 0.]]

Updated velocity:
[[-0.18851985  0.09732763]
 [-0.09598758  0.01289483]]
  (momentum * v - lr * g)
  (0.9 * v - 0.1 * g)

New weights:
[[1.57553249 0.49748484]
 [0.8827504  2.25378803]]
  (W + velocity)

--- After step() ---
Final W:
[[1.57553249 0.49748484]
 [0.8827504  2.25378803]]
