# Gradient Descent Variants From Scratch

## First we implement some helper classes
DataLoader class: Handle splitting data into batches. 
It takes X, y, number of batches, shuffle.
It has get_batches() function that split the data, shuffle and generate batches efficiently by using a Generator.

LinearRegression class: Simple linear regression

In [1]:
# Import libraries
import numpy as np
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
class DataLoader:
    def __init__(self, X, y, batch_size=32, shuffle=True):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle

    def get_data(self):
        indices = list(range(self.X.shape[0]))
        if self.shuffle:
            np.random.shuffle(indices)
            
        for i in indices:
            yield self.X[i], self.y[i]

    def get_random_sample(self):
        rand_i = np.random.randint(self.X.shape[0])
        return self.X[rand_i], self.y[rand_i]
    
    def get_batches(self):
        indices = list(range(self.X.shape[0]))
        if self.shuffle:
            np.random.shuffle(indices)
        # Batching Data
        for i in range(0, len(indices), self.batch_size):
            batch_indices = indices[i:i+self.batch_size]
            yield self.X[batch_indices], self.y[batch_indices]
            

In [3]:
class LinearRegression:
    def __init__(self, n_inputs):
        self.n_inputs = n_inputs
        self.w = np.zeros((n_inputs, 1))
        self.b = np.zeros(1)

    def params(self):
        return self.w, self.b

    def predict(self, X):
        return  X @ self.w + self.b

# Batch Gradient Descent #

It's an optimization algorithm that is used to find optimal parameters $\theta$ that minimizes the objective function $L(\theta)$.

Gradient is vector of all partial derivatives of a function where this vector is the direction of the steepest ascent. We take a step in the opposite direction of it (to minimize loss) and scale by a learning rate $\eta$.

In batch gradient descent we use all of the data to compute gradient.

$
\begin{array}{l}
\textbf{Algorithm:} \text{Gradient Descent} \\
\hline
\textbf{Input:} \text{ Learning rate } \eta, \text{ initial parameters } \theta, \text{ objective function } L(\theta), \text{number of epoches} \\
\textbf{Output:} \text{Optimized parameters } \theta^* \\
1. \quad \text{Initialize } \theta \\
2. \quad \text{Repeat for number of epoches or until convergence:} \\
3. \quad \quad \text{Compute gradient } \nabla L(\theta) \\
4. \quad \quad \text{Update parameters: } \theta \leftarrow \theta - \eta L(\theta) \\
5. \quad \text{Return optimized } \theta^* \\
\end{array}
$

The objective function is:
$$
    L(\theta) = \frac{1}{2m} \sum_{i=1}^{m} (\hat{y}_i - y_i)^2
$$
where $ \hat{y} = Xw + b $.

Gradient with respect to parameters $ w $ using the chain rule:
$$
\frac{\partial L}{\partial w} = \frac{1}{m} \sum_{i=1}^{m} (\hat{y}_i - y_i) x_i
$$
In vector notation:
$$
\frac{\partial L}{\partial w} = \frac{1}{m} X^T (Xw + b - y)
$$

Gradient with respect to the bias term $ b $:
$$
\frac{\partial L}{\partial b} = \frac{1}{m} \sum_{i=1}^{m} (\hat{y}_i - y_i)
$$


In [4]:
class BGD:
    def __init__(self, model, X, y, lr, epochs):
        self.model = model
        self.X = X
        self.y = y
        self.lr = lr
        self.epochs = epochs
        
    def loss(self, y, y_hat):
        return np.mean((y - y_hat)**2) / 2
    
    def gradient(self, X, y):
        m = len(y)
        w, b = self.model.params()
        predictions = X @ w + b
        errors = predictions - y
        
        grad_w = (1 / m) * X.T @ errors
        grad_b = (1 / m) * np.sum(errors)
        
        return grad_w, grad_b
    
    def step(self, grad):
        grad_w, grad_b = grad
        w, b = self.model.params()
        w -= self.lr * grad_w
        b -= self.lr * grad_b
        
    def train(self):
        for epoch in range(self.epochs):
            y_pred = self.model.predict(self.X)
            
            # Calculate loss
            loss = self.loss(self.y, y_pred)
            
            # Calculate Gradient
            grad = self.gradient(self.X, self.y)
            
            # Step (Update Params)
            self.step(grad)
            
            if epoch % 10 == 0:
                print(f"{epoch}/{self.epochs} ------ Loss:{loss}")
                
    def evaluate(self, X_test, y_test):
        y_pred = self.model.predict(X_test)
        loss = self.loss(y_test, y_pred)
        return loss

## Now lets test it!

In [5]:
# Generate some data to test it
np.random.seed(42)
X = np.random.randn(1000, 7)
true_w = np.array([[2.0], [1.0], [0.5], [-1.0], [1.5], [-0.8], [0.3]])
true_b = 3.0
y = X @ true_w + true_b

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression(n_inputs=7)
optimizerGD = BGD(
    model=model,
    X=X,
    y=y,
    lr=0.01,
    epochs=500
)

print("Initial weights:", model.w.flatten())
print("Initial bias:", model.b.flatten())

print("Training the model...")
optimizerGD.train()

Initial weights: [0. 0. 0. 0. 0. 0. 0.]
Initial bias: [0.]
Training the model...
0/500 ------ Loss:9.084076333040409
10/500 ------ Loss:7.428025105079007
20/500 ------ Loss:6.075156658817498
30/500 ------ Loss:4.9697315820571
40/500 ------ Loss:4.066300916943799
50/500 ------ Loss:3.327798475172903
60/500 ------ Loss:2.723988263206627
70/500 ------ Loss:2.2302006452103074
80/500 ------ Loss:1.8263033269085505
90/500 ------ Loss:1.4958633523044294
100/500 ------ Loss:1.2254645110955615
110/500 ------ Loss:1.0041512172682547
120/500 ------ Loss:0.8229753301363458
130/500 ------ Loss:0.6746267841645981
140/500 ------ Loss:0.5531324647051017
150/500 ------ Loss:0.45361066848720016
160/500 ------ Loss:0.372070846204997
170/500 ------ Loss:0.3052502419376963
180/500 ------ Loss:0.2504806032481112
190/500 ------ Loss:0.20557940384953474
200/500 ------ Loss:0.16876105226654847
210/500 ------ Loss:0.13856439923078648
220/500 ------ Loss:0.11379353961099747
230/500 ------ Loss:0.0934694606856046

In [6]:
loss = optimizerGD.evaluate(X_test, y_test)
print("\nFinal Loss:", loss)

# Print learned parameters
print("Learned weights:", model.w.flatten())
print("Learned bias:", model.b.flatten())
print("True weights:", true_w.flatten())
print("True bias:", true_b)


Final Loss: 0.0004997497837414081
Learned weights: [ 1.98723973  0.99662395  0.4961774  -0.99778266  1.48517637 -0.79651228
  0.30465994]
Learned bias: [2.97558819]
True weights: [ 2.   1.   0.5 -1.   1.5 -0.8  0.3]
True bias: 3.0


# Stochastic Gradient Descent

Same as gradient descent but instead of computing the gradient using all of the training data in each epoch, we use a random sample in each one.
DataLoader class will handle this.

In [7]:
class SGD:
    def __init__(self, model, X, y, lr, epochs):
        self.model = model
        self.lr = lr
        self.epochs = epochs
        self.dl = DataLoader(X, y, batch_size=0, shuffle=True)
        
    def loss(self, y, y_hat):
        return np.mean((y - y_hat)**2) / 2
    
    def gradient(self, X, y):
        m = len(y)
        w, b = self.model.params()
        predictions = X @ w + b
        errors = predictions - y
        grad_w = (1 / m) * X.reshape(-1, 1) @ errors
        grad_b = (1 / m) * np.sum(errors)
        
        return grad_w, grad_b
    
    def step(self, grad):
        grad_w, grad_b = grad
        w, b = self.model.params()

        w -= self.lr * grad_w.reshape(-1, 1)
        b -= self.lr * grad_b
        
    def train(self):
        for epoch in range(self.epochs):
            y_pred = self.model.predict(self.dl.X)
            
            # Calculate loss
            loss = self.loss(self.dl.y, y_pred)
            
            # Calculate Gradient
            sample_X, sample_y = self.dl.get_random_sample()
            grad = self.gradient(sample_X, sample_y)
            
            # Step (Update Params)
            self.step(grad)
            
            if epoch % 10 == 0:
                print(f"{epoch}/{self.epochs} ------ Loss:{loss}")
                
    def evaluate(self, X_test, y_test):
        y_pred = self.model.predict(X_test)
        loss = self.loss(y_test, y_pred)
        return loss

In [8]:
print("Initial weights:", model.w.flatten()) # Same initial weights
print("Initial bias:", model.b.flatten())

model_2 = LinearRegression(n_inputs=7) # Initialize model again to reset parameters
optimizerSGD = SGD(
    model=model_2,
    X=X,
    y=y,
    lr=0.01,
    epochs=500
)

print("Training the model...")
optimizerSGD.train()

Initial weights: [ 1.98723973  0.99662395  0.4961774  -0.99778266  1.48517637 -0.79651228
  0.30465994]
Initial bias: [2.97558819]
Training the model...
0/500 ------ Loss:9.084076333040409
10/500 ------ Loss:7.546580196433368
20/500 ------ Loss:6.44018419153675
30/500 ------ Loss:5.623505924228909
40/500 ------ Loss:4.6316845295947715
50/500 ------ Loss:4.306092072656013
60/500 ------ Loss:3.750043385481879
70/500 ------ Loss:2.9734068225508143
80/500 ------ Loss:2.213266589066577
90/500 ------ Loss:1.9454591325695223
100/500 ------ Loss:1.7712665574041162
110/500 ------ Loss:1.5572975403421272
120/500 ------ Loss:1.1488232712178688
130/500 ------ Loss:0.9947401428762147
140/500 ------ Loss:0.8237668988130463
150/500 ------ Loss:0.7066872888227677
160/500 ------ Loss:0.5583453512519764
170/500 ------ Loss:0.4791856737800803
180/500 ------ Loss:0.4243203029910982
190/500 ------ Loss:0.33287578841758136
200/500 ------ Loss:0.28025348698771996
210/500 ------ Loss:0.24291569684092423
220/5

In [9]:
loss_2 = optimizerSGD.evaluate(X_test, y_test)
print("\nFinal Loss:", loss_2)

# Print learned parameters
print("Learned weights:", model_2.w.flatten())
print("Learned bias:", model_2.b.flatten())
print("True weights:", true_w.flatten())
print("True bias:", true_b)


Final Loss: 0.000326817971718045
Learned weights: [ 1.98981612  0.9942291   0.50267473 -0.99495884  1.49444201 -0.80401952
  0.29786237]
Learned bias: [2.97957605]
True weights: [ 2.   1.   0.5 -1.   1.5 -0.8  0.3]
True bias: 3.0


## Mini-Batch Gradient Descent
Now instead of using entire training data at once, we split it into small batches. We can introduce randomness by shuffling.

In [10]:
class MBGD:
    def __init__(self, model, X, y, lr, batch_size, shuffle, epochs):
        self.model = model
        self.lr = lr
        self.shuffle = shuffle
        self.epochs = epochs
        self.dl = DataLoader(X, y, batch_size, shuffle)
        
    def loss(self, y, y_hat):
        return np.mean((y - y_hat)**2) / 2
    
    def gradient(self, X, y):
        m = len(y)
        w, b = self.model.params()
        predictions = X @ w + b
        errors = predictions - y
        
        grad_w = (1 / m) * X.T @ errors
        grad_b = (1 / m) * np.sum(errors)
        
        return grad_w, grad_b
    
    def step(self, grad):
        grad_w, grad_b = grad
        w, b = self.model.params()
        w -= self.lr * grad_w
        b -= self.lr * grad_b
        
    def train(self):
        for epoch in range(self.epochs):
            y_pred = self.model.predict(self.dl.X)
            
            # Calculate loss
            loss = self.loss(self.dl.y, y_pred)
            
            for batch_X, batch_y in self.dl.get_batches():
                # Calculate Gradient
                grad = self.gradient(batch_X, batch_y)
            
                # Step (Update Params)
                self.step(grad)
            
            if epoch % 10 == 0:
                print(f"{epoch}/{self.epochs} ------ Loss:{loss}")
                
    def evaluate(self, X_test, y_test):
        y_pred = self.model.predict(X_test)
        loss = self.loss(y_test, y_pred)
        return loss

In [11]:
print("Initial weights:", model.w.flatten()) # Same initial weights
print("Initial bias:", model.b.flatten())

model_3 = LinearRegression(n_inputs=7) # Initialize model again to reset parameters
optimizerMBGD = MBGD(
    model=model_3,
    X=X,
    y=y,
    lr=0.01,
    batch_size=32,
    shuffle=True,
    epochs=500
)

print("Training the model...")
optimizerMBGD.train()

Initial weights: [ 1.98723973  0.99662395  0.4961774  -0.99778266  1.48517637 -0.79651228
  0.30465994]
Initial bias: [2.97558819]
Training the model...
0/500 ------ Loss:9.084076333040409
10/500 ------ Loss:0.015799170452084593
20/500 ------ Loss:3.5388478622233407e-05
30/500 ------ Loss:9.293554512849136e-08
40/500 ------ Loss:2.7355031985888855e-10
50/500 ------ Loss:8.710171132838194e-13
60/500 ------ Loss:2.9389255319659832e-15
70/500 ------ Loss:1.0806239964752252e-17
80/500 ------ Loss:3.969720948568418e-20
90/500 ------ Loss:1.4967839401916643e-22
100/500 ------ Loss:5.5525702411967775e-25
110/500 ------ Loss:2.1685291027788554e-27
120/500 ------ Loss:1.592320421050237e-28
130/500 ------ Loss:1.436362866607076e-28
140/500 ------ Loss:1.3914569595773698e-28
150/500 ------ Loss:1.3837076337787377e-28
160/500 ------ Loss:1.361299793246902e-28
170/500 ------ Loss:1.3470297925095198e-28
180/500 ------ Loss:1.3479961471184155e-28
190/500 ------ Loss:1.345201853880703e-28
200/500 ----

In [12]:
loss_3 = optimizerMBGD.evaluate(X_test, y_test)
print("\nFinal Loss:", loss_3)

# Print learned parameters
print("Learned weights:", model_3.w.flatten())
print("Learned bias:", model_3.b.flatten())
print("True weights:", true_w.flatten())
print("True bias:", true_b)


Final Loss: 1.3644397061687145e-28
Learned weights: [ 2.   1.   0.5 -1.   1.5 -0.8  0.3]
Learned bias: [3.]
True weights: [ 2.   1.   0.5 -1.   1.5 -0.8  0.3]
True bias: 3.0


## Momentum Based Gradient Descent
It helps to speed up training by smoothing out updates and reducing oscillations. It remembers past gradients, giving the optimizer "momentum" to move faster and avoid getting stuck in narrow valleys.
The update rules:
$$
\text{Velocity update:} \quad v_{t} = \beta v_{t-1} + \eta \nabla_{\theta} L(\theta_{t-1})
$$
$$
\text{Parameter update:} \quad \theta_{t} = \theta_{t-1} - v_{t}
$$

Where:
$v_t$ : Velocity at time step $t$.

$\beta$: Momentum hyperparameter.

$\eta$: Learning rate.

$\nabla_{\theta} L(\theta_t)$: Gradient of the cost function $\theta$ with respect to parameters $\theta$ at time step $t$.

$\theta_t$: Model parameters at time step $t$.

In [13]:
class MGD:
    def __init__(self, model, X, y, lr, batch_size, shuffle, epochs, momentum=0.9):
        self.model = model
        self.lr = lr
        self.shuffle = shuffle
        self.epochs = epochs
        self.momentum = momentum
        self.dl = DataLoader(X, y, batch_size, shuffle)
        
        # Initialize velocity terms to zero
        w, b = self.model.params()
        self.v_w = np.zeros_like(w)
        self.v_b = np.zeros_like(b)
        
    def loss(self, y, y_hat):
        return np.mean((y - y_hat)**2) / 2
    
    def gradient(self, X, y):
        m = len(y)
        w, b = self.model.params()
        predictions = X @ w + b
        errors = predictions - y
        
        grad_w = ((1 / m) * X.T @ errors).reshape(-1, 1)
        grad_b = (1 / m) * np.sum(errors)
        
        return grad_w, grad_b
    
    def step(self, grad):
        grad_w, grad_b = grad
        
        # Update velocity
        self.v_w = self.momentum * self.v_w + self.lr * grad_w
        self.v_b = self.momentum * self.v_b + self.lr * grad_b
        
        # Update parameters
        self.model.w -= self.v_w
        self.model.b -= self.v_b
        
    def train(self):
        for epoch in range(self.epochs):
            y_pred = self.model.predict(self.dl.X)
            
            # Calculate loss
            loss = self.loss(self.dl.y, y_pred)
            
            for batch_X, batch_y in self.dl.get_batches():
                # Calculate Gradient
                grad = self.gradient(batch_X, batch_y)
            
                # Step (Update Params)
                self.step(grad)
            
            if epoch % 10 == 0:
                print(f"{epoch}/{self.epochs} ------ Loss:{loss}")
                
    def evaluate(self, X_test, y_test):
        y_pred = self.model.predict(X_test)
        loss = self.loss(y_test, y_pred)
        return loss

In [14]:
print("Initial weights:", model.w.flatten()) # Same initial weights
print("Initial bias:", model.b.flatten())

model_4 = LinearRegression(n_inputs=7) # Initialize model again to reset parameters
optimizerMGD = MGD(
    model=model_4,
    X=X,
    y=y,
    lr=0.01,
    batch_size=32,
    shuffle=True,
    epochs=500
)

print("Training the model...")
optimizerMGD.train()

Initial weights: [ 1.98723973  0.99662395  0.4961774  -0.99778266  1.48517637 -0.79651228
  0.30465994]
Initial bias: [2.97558819]
Training the model...
0/500 ------ Loss:9.084076333040409
10/500 ------ Loss:3.2319091793310056e-14
20/500 ------ Loss:2.0379467138144223e-28
30/500 ------ Loss:8.884792464084526e-31
40/500 ------ Loss:8.884792464084526e-31
50/500 ------ Loss:8.884792464084526e-31
60/500 ------ Loss:8.884792464084526e-31
70/500 ------ Loss:8.884792464084526e-31
80/500 ------ Loss:8.884792464084526e-31
90/500 ------ Loss:8.884792464084526e-31
100/500 ------ Loss:8.884792464084526e-31
110/500 ------ Loss:8.884792464084526e-31
120/500 ------ Loss:8.884792464084526e-31
130/500 ------ Loss:8.884792464084526e-31
140/500 ------ Loss:8.884792464084526e-31
150/500 ------ Loss:8.884792464084526e-31
160/500 ------ Loss:8.884792464084526e-31
170/500 ------ Loss:8.884792464084526e-31
180/500 ------ Loss:8.884792464084526e-31
190/500 ------ Loss:8.884792464084526e-31
200/500 ------ Loss:

In [15]:
loss_4 = optimizerMBGD.evaluate(X_test, y_test)
print("\nFinal Loss:", loss_4)

# Print learned parameters
print("Learned weights:", model_4.w.flatten())
print("Learned bias:", model_4.b.flatten())
print("True weights:", true_w.flatten())
print("True bias:", true_b)


Final Loss: 1.3644397061687145e-28
Learned weights: [ 2.   1.   0.5 -1.   1.5 -0.8  0.3]
Learned bias: [3.]
True weights: [ 2.   1.   0.5 -1.   1.5 -0.8  0.3]
True bias: 3.0


# Performance Comparison

In [16]:
pd.DataFrame(
    {
        'Algorithm':['Batch GD', 'Stochastic GD', 'Mini-Batch GD', 'Momentum GD'],
    'Loss':[loss, loss_2, loss_3, loss_4]
    }
).T

Unnamed: 0,0,1,2,3
Algorithm,Batch GD,Stochastic GD,Mini-Batch GD,Momentum GD
Loss,0.0005,0.000327,0.0,0.0
