# Imports

In [None]:
import numpy as np

np.__version__

'1.25.2'

In [None]:
np.random.seed(42) # reproducibility

# Introduction to Gradient Descent

Gradient descent is a fundamental optimization algorithm used in machine learning and optimization problems to minimize the cost function or loss function. The concept revolves around iteratively adjusting the parameters of a model in the direction of the steepest descent of the gradient of the cost function. The gradient represents the direction of the steepest increase in the function. In each iteration, the algorithm calculates the gradient of the cost function with respect to the parameters, and then updates the parameters in the opposite direction of the gradient by a certain step size known as the learning rate. This process continues until convergence, where the gradient becomes nearly zero, indicating that the algorithm has reached a local minimum.

# Prerequisites

We will take a sigmoid neuron to show the process of gradient descent algorithms and it's different variations (with optimization techniques)

### Function to Replicate

**The Sigmoid Function**

The sigmoid function is a mathematical function that maps any real-valued number to a value between 0 and 1. It introduces non-linearity to the network and enables it to learn complex patterns in the data.

**Properties**

- **Range**: The output of the sigmoid function is always between 0 and 1. As \( x \) approaches negative infinity, the output approaches 0, and as \( x \) approaches positive infinity, the output approaches 1.
- **Smoothness**: The sigmoid function is smooth and differentiable everywhere, which makes it suitable for optimization algorithms such as gradient descent.
- **S-shaped curve**: The graph of the sigmoid function resembles the letter "S", hence the name "sigmoid". This shape introduces non-linearity to the network, allowing it to model complex relationships in the data.


In [None]:
def sigmoid(W: np.array, b: np.array, X: np.array):
    return (1 / (1 + np.exp(-(np.dot(X, W) + b))))

Instead of the formula and array sizes of np.dot.. we can use something like [this](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)



### Loss Function

Mean squared error (MSE) is a fundamental metric used in statistics and machine learning to quantify the average squared difference between the actual values and the predicted values. It is calculated by taking the average of the squared differences between the predicted and true values for each data point.

MSE is favored for its mathematical properties, such as being non-negative and sensitive to the magnitude of errors, making it a widely adopted measure for evaluating model performance and guiding optimization efforts.

In [None]:
def error(W: np.array, b: np.array, X: np.array, y_true: np.array):
    m = X.shape[0]

    y_hat = sigmoid(W, b, X)

    cost = np.sum((y_hat - y_true)**2, axis = 0) / m

    return cost

## Generating the Data

In [None]:
# Taking number of features as 2 and instances as 100
n_features = 2
m = 100

In [None]:
W_true = np.full((n_features, 1), 0.5)
b_true = 1

# We are only focused on the algorithm, so we generate this randomly

X = np.random.rand(m, n_features)

y_true = sigmoid(W_true, b_true, X)
y_true.shape, X.shape

((100, 1), (100, 2))

## Calculating Gradients

In [None]:
def grad(W: np.array, b: np.array, X: np.array, y_true: np.array):
    n_features = X.shape[0]

    # Calculating predicted y
    y_hat = sigmoid(W, b, X)
    grad_W_i = np.zeros((n_features, 1))

    for j in range(n_features):
        grad_W_i[j] = np.dot((y_hat - y_true) * (y_hat) * (1 - y_hat), X[j])
    grad_b_i = (y_hat - y_true) * (y_hat) * (1 - y_hat)

    return grad_W_i, grad_b_i

## Initializing Weights & Biases

In [None]:
def init_weights(n_features):
    W = np.random.rand(n_features, 1)
    b = 1
    return W, b

# Vanilla (Batch) Gradient Descent

In [None]:
def batch_gradient_descent(X: np.array, y_true: np.array, epochs = 100, learning_rate = 0.1):
    m = X.shape[0]
    W, b = init_weights(n_features)

    for epoch in range(1, epochs + 1):
        grad_W = np.zeros_like(W)
        grad_b = 0

        # Compute gradients
        for i in range(m):
            grad_W_i, grad_b_i = grad(W, b, X[i], y_true[i])
            grad_W += grad_W_i
            grad_b += grad_b_i


        # Update weights and bias
        W -= learning_rate * grad_W
        b -= learning_rate * grad_b

        # Printing the Progress
        if epoch % 20 == 0:
            print(f"Epoch: {epoch}")
            print(f"Weights: {W}")
            print(f"Bias: {b}")
            print("-----------------------------------")

    return W, b

In [None]:
batch_gradient_descent(X, y_true, epochs = 100, learning_rate = 0.03)

Epoch: 20
Weights: [[0.65266844]
 [0.16722706]]
Bias: [1.08095925]
-----------------------------------
Epoch: 40
Weights: [[0.63362175]
 [0.20515667]]
Bias: [1.08376961]
-----------------------------------
Epoch: 60
Weights: [[0.61403602]
 [0.23517087]]
Bias: [1.07853052]
-----------------------------------
Epoch: 80
Weights: [[0.59671899]
 [0.26157269]]
Bias: [1.07278195]
-----------------------------------
Epoch: 100
Weights: [[0.58172374]
 [0.28517517]]
Bias: [1.06741332]
-----------------------------------


(array([[0.58172374],
        [0.28517517]]),
 array([1.06741332]))

# Stochastic Gradient Descent

*   Here, the parameters are updates for every data point passed through in the epoch. (Number of updates per epoch are number of instances in the data)
*   It is an approximate (rather stochastic) gradient.
*   No guarantee that the loss decreases every step.



In [None]:
def stochastic_gradient_descent(X: np.array, y_true: np.array, epochs = 100, learning_rate = 0.1):
    m = X.shape[0]
    W, b = init_weights(n_features)

    for epoch in range(1, epochs + 1):
        grad_W = np.zeros_like(W)
        grad_b = 0

        # Compute gradients
        for i in range(m):
            grad_W_i, grad_b_i = grad(W, b, X[i], y_true[i])
            grad_W += grad_W_i
            grad_b += grad_b_i

            # Update weights and bias
            W -= learning_rate * grad_W
            b -= learning_rate * grad_b

        # Printing the Progress
        if epoch % 10 == 0:
            print(f"Epoch: {epoch}")
            print(f"Weights: {W}")
            print(f"Bias: {b}")
            print("-----------------------------------")

    return W, b

In [None]:
stochastic_gradient_descent(X, y_true, epochs = 100, learning_rate = 0.03)

Epoch: 10
Weights: [[0.49435003]
 [0.51465431]]
Bias: [0.99881913]
-----------------------------------
Epoch: 20
Weights: [[0.50099308]
 [0.50182244]]
Bias: [1.00167229]
-----------------------------------
Epoch: 30
Weights: [[0.50083917]
 [0.5011882 ]]
Bias: [1.00209647]
-----------------------------------
Epoch: 40
Weights: [[0.50080871]
 [0.50115223]]
Bias: [1.00219546]
-----------------------------------
Epoch: 50
Weights: [[0.50082   ]
 [0.50117094]]
Bias: [1.00225238]
-----------------------------------
Epoch: 60
Weights: [[0.50083778]
 [0.50119679]]
Bias: [1.00230473]
-----------------------------------
Epoch: 70
Weights: [[0.50085682]
 [0.50122411]]
Bias: [1.00235751]
-----------------------------------
Epoch: 80
Weights: [[0.5008764 ]
 [0.50125217]]
Bias: [1.00241139]
-----------------------------------
Epoch: 90
Weights: [[0.50089645]
 [0.5012809 ]]
Bias: [1.00246648]
-----------------------------------
Epoch: 100
Weights: [[0.50091695]
 [0.50131028]]
Bias: [1.00252284]
-----

(array([[0.50091695],
        [0.50131028]]),
 array([1.00252284]))

# Mini-Batch Gradient Descent

*   Instead of updating parameters for every data points we only pick a small batch of data points to
*   Better estimate of the 'true' gradient



In [None]:
BATCH_SIZE = 10

In [None]:
def minibatch_gradient_descent(X: np.array, y_true: np.array, epochs = 100, learning_rate = 0.1):
    m = X.shape[0]
    W, b = init_weights(n_features)

    for epoch in range(1, epochs + 1):
        grad_W = np.zeros_like(W)
        grad_b = 0

        # Compute gradients
        for i in range(1, m):
            grad_W_i, grad_b_i = grad(W, b, X[i], y_true[i])
            grad_W += grad_W_i
            grad_b += grad_b_i

            if i % BATCH_SIZE == 0:
            # Update weights and bias
                W -= learning_rate * grad_W
                b -= learning_rate * grad_b

        # Printing the Progress
        if epoch % 10 == 0:
            print(f"Epoch: {epoch}")
            print(f"Weights: {W}")
            print(f"Bias: {b}")
            print("-----------------------------------")

    return W, b

In [None]:
minibatch_gradient_descent(X, y_true, epochs = 100, learning_rate = 0.03)

Epoch: 10
Weights: [[0.60500726]
 [0.17472198]]
Bias: [1.11611545]
-----------------------------------
Epoch: 20
Weights: [[0.56581798]
 [0.24459342]]
Bias: [1.09671509]
-----------------------------------
Epoch: 30
Weights: [[0.53876492]
 [0.29855737]]
Bias: [1.0806513]
-----------------------------------
Epoch: 40
Weights: [[0.52042415]
 [0.34050999]]
Bias: [1.06752084]
-----------------------------------
Epoch: 50
Weights: [[0.50820345]
 [0.37323379]]
Bias: [1.05667633]
-----------------------------------
Epoch: 60
Weights: [[0.5002627 ]
 [0.39885386]]
Bias: [1.04765572]
-----------------------------------
Epoch: 70
Weights: [[0.49529699]
 [0.4189903 ]]
Bias: [1.04011641]
-----------------------------------
Epoch: 80
Weights: [[0.49238124]
 [0.43487914]]
Bias: [1.03379519]
-----------------------------------
Epoch: 90
Weights: [[0.49086039]
 [0.44746561]]
Bias: [1.02848414]
-----------------------------------
Epoch: 100
Weights: [[0.49027173]
 [0.45747475]]
Bias: [1.02401564]
------

(array([[0.49027173],
        [0.45747475]]),
 array([1.02401564]))

# Introducing Momentum in Gradient Descent

Imagine you're riding a bike down a bumpy hill.

With regular gradient descent, you'd pedal harder when the hill gets steeper and ease off when it levels out. But this can make your ride jerky and slow.

Now, with momentum, you pedal not only based on what you see right in front of you but also on how fast you were going before. So, even if the hill gets a bit bumpy, you keep moving forward more steadily because you're carrying some of your previous speed with you. It's like having a little push from your past self, making your ride smoother and faster overall.

# Momentum-Based Gradient Descent

This enhances traditional gradient descent by incorporating a momentum term. This momentum term allows the algorithm to build inertia in a specific direction during the search process, aiding in faster convergence and overcoming issues like local minima and oscillations.

By adding a fraction of the previous weight update to the current update, momentum-based gradient descent accelerates optimization by smoothing out weight updates, reducing model complexity, and preventing overfitting.

The momentum term is calculated as a moving average of past gradients, with the weight of these past gradients controlled by a hyperparameter called Momentum Constant.

This technique helps the algorithm escape local minima and saddle points, converge faster, and stabilize the optimization process

In [None]:
def momentum_based_gradient_descent(X: np.array, y_true: np.array, epochs = 100, momentum_constant = 0.9, learning_rate = 0.1):
    m = X.shape[0]
    W, b = init_weights(n_features)
    update_W, update_b = np.zeros_like(W), 0

    for epoch in range(1, epochs + 1):
        grad_W = np.zeros_like(W)
        grad_b = 0

        # Compute gradients
        for i in range(m):
            grad_W_i, grad_b_i = grad(W, b, X[i], y_true[i])
            grad_W += grad_W_i
            grad_b += grad_b_i

        # Dealing with the history
        update_W = momentum_constant * update_W + learning_rate * grad_W
        update_b = momentum_constant * update_b + learning_rate * grad_b

        # Update weights and bias
        W -= update_W
        b -= update_b

        # Printing the Progress
        if epoch % 20 == 0:
            print(f"Epoch: {epoch}")
            print(f"Weights: {W}")
            print(f"Bias: {b}")
            print("-----------------------------------")

    return W, b

In [None]:
momentum_based_gradient_descent(X, y_true, epochs = 100, momentum_constant = 0.9, learning_rate = 0.03)

Epoch: 20
Weights: [[0.33281891]
 [0.55531903]]
Bias: [1.00928586]
-----------------------------------
Epoch: 40
Weights: [[0.49232534]
 [0.47910667]]
Bias: [0.99807405]
-----------------------------------
Epoch: 60
Weights: [[0.51372637]
 [0.48298288]]
Bias: [0.99648827]
-----------------------------------
Epoch: 80
Weights: [[0.50314529]
 [0.49727586]]
Bias: [0.99803748]
-----------------------------------
Epoch: 100
Weights: [[0.49897413]
 [0.50132876]]
Bias: [0.99923048]
-----------------------------------


(array([[0.49897413],
        [0.50132876]]),
 array([0.99923048]))

# Nesterov Accelerated Gradient Descent

While the idea is similar to Momentum-Based Gradient Descent, here we look ahead if we are overshooting the (local) minima and move accordingly.

This technique helps correct the course of the gradient descent quicker and hence the oscillations are less

In [None]:
def nesterov_accelerated_gradient_descent(X: np.array, y_true: np.array, epochs = 100, momentum_constant = 0.9, learning_rate = 0.1):
    m = X.shape[0]
    W, b = init_weights(n_features)
    update_W, update_b = np.zeros_like(W), 0
    lookahead_W, lookahead_b = np.zeros_like(W), 0

    for epoch in range(1, epochs + 1):
        grad_W = np.zeros_like(W)
        grad_b = 0

        # Compute gradients
        for i in range(m):
            grad_W_i, grad_b_i = grad(W, b, X[i], y_true[i])
            grad_W += grad_W_i
            grad_b += grad_b_i

        # Calculating the Lookahead Factor
        lookahead_W = W - momentum_constant * update_W
        lookahead_b = b - momentum_constant * update_b

        # Dealing with the history
        update_W = momentum_constant * update_W + learning_rate * lookahead_W
        update_b = momentum_constant * update_b + learning_rate * lookahead_b

        # Update weights and bias
        W -= update_W
        b -= update_b

        # Printing the Progress
        if epoch % 20 == 0:
            print(f"Epoch: {epoch}")
            print(f"Weights: {W}")
            print(f"Bias: {b}")
            print("-----------------------------------")

    return W, b

In [None]:
nesterov_accelerated_gradient_descent(X, y_true, epochs = 100, momentum_constant = 0.9, learning_rate = 0.03)

Epoch: 20
Weights: [[-0.00135337]
 [-0.04299686]]
Bias: -0.26738001365491054
-----------------------------------
Epoch: 40
Weights: [[0.00035027]
 [0.01112821]]
Bias: 0.0692018284807699
-----------------------------------
Epoch: 60
Weights: [[-8.77888665e-05]
 [-2.78907887e-03]]
Bias: -0.017344149420692834
-----------------------------------
Epoch: 80
Weights: [[2.12704603e-05]
 [6.75768966e-04]]
Bias: 0.004202332904855425
-----------------------------------
Epoch: 100
Weights: [[-4.96050421e-06]
 [-1.57596721e-04]]
Bias: -0.000980030038150811
-----------------------------------


(array([[-4.96050421e-06],
        [-1.57596721e-04]]),
 -0.000980030038150811)

# But how can we find the right **hyperparameters**? Line Search Gradient Descent

There are different explicit ways of adjusting these values of hyperparameters like tuning the initial or annealing the learning rate & momentum constant.

We will focus on Line Search which is an algorithm to find the value of the best hyperparameters.

In [None]:
learning_rates = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]

In [None]:
def line_search_gradient_descent(X: np.array, y_true: np.array, learning_rates, epochs = 100, learning_rate = 0.1):
    m = X.shape[0]
    W, b = init_weights(n_features)

    for epoch in range(1, epochs + 1):
        grad_W = np.zeros_like(W)
        grad_b = 0

        # Compute gradients
        for i in range(m):
            grad_W_i, grad_b_i = grad(W, b, X[i], y_true[i])
            grad_W += grad_W_i
            grad_b += grad_b_i

        min_error = 10000
        best_W, best_b = W, b

        # Trying different learning rates
        for learning_rate in learning_rates:

            # Update weights and bias
            temp_W = W - learning_rate * grad_W
            temp_b = b - learning_rate * grad_b

            # Checking if it is better than minimum error
            temp_error = error(temp_W, temp_b, X, y_true)
            if temp_error < min_error:

                # Updating the 'best' parameters
                best_W = temp_W
                best_b = temp_b
                min_error = temp_error

        W, b = best_W, best_b

        # Printing the Progress
        if epoch % 20 == 0:
            print(f"Epoch: {epoch}")
            print(f"Best Weights: {W}")
            print(f"Best Bias: {b}")
            print("-----------------------------------")

    return W, b

In [None]:
line_search_gradient_descent(X, y_true, learning_rates, epochs = 100, learning_rate = 0.03)

Epoch: 20
Best Weights: [[0.51857639]
 [0.56324472]]
Best Bias: [0.96257252]
-----------------------------------
Epoch: 40
Best Weights: [[0.51214154]
 [0.52646708]]
Best Bias: [0.98227999]
-----------------------------------
Epoch: 60
Best Weights: [[0.50663994]
 [0.51148051]]
Best Bias: [0.99167602]
-----------------------------------
Epoch: 80
Best Weights: [[0.50336833]
 [0.5051078 ]]
Best Bias: [0.99610563]
-----------------------------------
Epoch: 100
Best Weights: [[0.50164564]
 [0.50231116]]
Best Bias: [0.99818209]
-----------------------------------


(array([[0.50164564],
        [0.50231116]]),
 array([0.99818209]))

# Adaptive Learning Rates

Adaptive learning rate algorithms are a class of optimization algorithms commonly used in training machine learning models, particularly in deep learning. These algorithms dynamically adjust the learning rate during the training process based on the gradients observed so far. This adaptiveness allows them to handle varying gradients and converge faster compared to fixed learning rate methods.

Three popular adaptive learning rate algorithms:

1.   Adagrad (Adaptive Gradient Algorithm)
2.   RMSprop (Root Mean Square Propagation)
3.   Adam (Adaptive Moment Estimation)



# Adagrad (Adaptive Gradient Algorithm)

*   Adagrad adapts the learning rates of each parameter individually by scaling them inversely proportional to the square root of the sum of the historical squared gradients.
*   It performs larger updates for infrequent parameters and smaller updates for frequent parameters.
*   The main drawback is that the learning rates can become too small, causing premature convergence and making it less suitable for deep learning models with many parameters.

In [None]:
def adagrad(X: np.array, y_true: np.array, epochs = 100, learning_rate = 0.1, epsilon = 1e-8):
    m = X.shape[0]
    W, b = init_weights(n_features)
    v_W, v_b = np.zeros_like(W), 0

    for epoch in range(1, epochs + 1):
        grad_W = np.zeros_like(W)
        grad_b = 0

        # Compute gradients
        for i in range(m):
            grad_W_i, grad_b_i = grad(W, b, X[i], y_true[i])
            grad_W += grad_W_i
            grad_b += grad_b_i

        # Dealing with the history
        v_W += np.square(grad_W)
        v_b += grad_b ** 2

        # Update weights and bias
        W -= (learning_rate / (np.sqrt(v_W + epsilon))) * grad_W
        b -= (learning_rate / (np.sqrt(v_b + epsilon))) * grad_b

        # Printing the Progress
        if epoch % 20 == 0:
            print(f"Epoch: {epoch}")
            print(f"Weights: {W}")
            print(f"Bias: {b}")
            print("-----------------------------------")

    return W, b

In [None]:
adagrad(X, y_true, epochs = 100, learning_rate = 0.03, epsilon = 1e-8)

Epoch: 20
Weights: [[0.53025211]
 [0.37069794]]
Bias: [1.04589312]
-----------------------------------
Epoch: 40
Weights: [[0.49850183]
 [0.42039108]]
Bias: [1.03732058]
-----------------------------------
Epoch: 60
Weights: [[0.4920963 ]
 [0.44843158]]
Bias: [1.0273313]
-----------------------------------
Epoch: 80
Weights: [[0.49240405]
 [0.46583618]]
Bias: [1.01917682]
-----------------------------------
Epoch: 100
Weights: [[0.49411794]
 [0.47712323]]
Bias: [1.01319734]
-----------------------------------


(array([[0.49411794],
        [0.47712323]]),
 array([1.01319734]))

# RMSprop (Root Mean Square Propagation)

*   RMSprop addresses the diminishing learning rates problem of Adagrad by using an exponentially decaying average of squared gradients.
*   Instead of accumulating all past squared gradients, it only considers a moving average of recent gradients, which prevents the learning rates from becoming overly small.
*   RMSprop is basically less agressive on the decay.



In [None]:
def rmsprop(X: np.array, y_true: np.array, epochs = 100, learning_rate = 0.1, beta = 0.9, epsilon = 1e-8):
    m = X.shape[0]
    W, b = init_weights(n_features)
    v_W, v_b = np.zeros_like(W), 0

    for epoch in range(1, epochs + 1):
        grad_W = np.zeros_like(W)
        grad_b = 0

        # Compute gradients
        for i in range(m):
            grad_W_i, grad_b_i = grad(W, b, X[i], y_true[i])
            grad_W += grad_W_i
            grad_b += grad_b_i

        # Dealing with the history
        v_W = (beta * v_W) + ((1 - beta) * np.square(grad_W))
        v_b = (beta * v_b) + ((1 - beta) * (grad_b ** 2))

        # Update weights and bias
        W -= (learning_rate / (np.sqrt(v_W + epsilon))) * grad_W
        b -= (learning_rate / (np.sqrt(v_b + epsilon))) * grad_b

        # Printing the Progress
        if epoch % 20 == 0:
            print(f"Epoch: {epoch}")
            print(f"Weights: {W}")
            print(f"Bias: {b}")
            print("-----------------------------------")

    return W, b

In [None]:
rmsprop(X, y_true, epochs = 100, learning_rate = 0.1, beta = 0.9, epsilon = 1e-8)

Epoch: 20
Weights: [[0.46584914]
 [0.46732211]]
Bias: [1.0323247]
-----------------------------------
Epoch: 40
Weights: [[0.58225138]
 [0.5817491 ]]
Bias: [1.08146301]
-----------------------------------
Epoch: 60
Weights: [[0.55598221]
 [0.55552687]]
Bias: [1.0445546]
-----------------------------------
Epoch: 80
Weights: [[0.56223659]
 [0.56191189]]
Bias: [1.04990612]
-----------------------------------
Epoch: 100
Weights: [[0.56130444]
 [0.56099221]]
Bias: [1.04869945]
-----------------------------------


(array([[0.56130444],
        [0.56099221]]),
 array([1.04869945]))

# Adam (Adaptive Moment Estimation)



*   Adam combines the ideas of momentum optimization and RMSprop. (Best of both Worlds)
*   It maintains both a decaying average of past gradients (like momentum) and a decaying average of past squared gradients (like RMSprop).
*   Adam also incorporates bias correction to account for the initial bias of estimates during the early training stages.
*   It has been widely adopted due to its good performance across various tasks and its ability to handle noisy or sparse gradients.



In [None]:
def adam(X: np.array, y_true: np.array, epochs = 100, learning_rate = 0.1, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-8):
    m = X.shape[0]
    W, b = init_weights(n_features)
    m_W, m_b = np.zeros_like(W), 0
    v_W, v_b = np.zeros_like(W), 0

    for epoch in range(1, epochs + 1):
        grad_W = np.zeros_like(W)
        grad_b = 0

        # Compute gradients
        for i in range(m):
            grad_W_i, grad_b_i = grad(W, b, X[i], y_true[i])
            grad_W += grad_W_i
            grad_b += grad_b_i

        # Dealing with the first moment history
        m_W = (beta_1 * m_W) + ((1 - beta_1) * grad_W)
        m_b = (beta_1 * m_b) + ((1 - beta_1) * grad_b)

        # Dealing with the second moment history
        v_W = (beta_2 * v_W) + ((1 - beta_2) * np.square(grad_W))
        v_b = (beta_2 * v_b) + ((1 - beta_2) * (grad_b ** 2))

        # Bias Correction
        m_w_hat = m_W / (1 - (beta_1 ** (i)))
        m_b_hat = m_b / (1 - (beta_1 ** (i)))

        v_w_hat = v_W / (1 - (beta_2 ** (i)))
        v_b_hat = v_b / (1 - (beta_2 ** (i)))

        # Update weights and bias
        W -= (learning_rate / (np.sqrt(v_w_hat + epsilon))) * m_w_hat
        b -= (learning_rate / (np.sqrt(v_b_hat + epsilon))) * m_b_hat

        # Printing the Progress
        if epoch % 20 == 0:
            print(f"Epoch: {epoch}")
            print(f"Weights: {W}")
            print(f"Bias: {b}")
            print("-----------------------------------")

    return W, b

In [None]:
adam(X, y_true, epochs = 100, learning_rate = 0.1, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-8)

Epoch: 20
Weights: [[0.53514516]
 [0.49306076]]
Bias: [0.99195921]
-----------------------------------
Epoch: 40
Weights: [[0.51211503]
 [0.4784143 ]]
Bias: [1.00549875]
-----------------------------------
Epoch: 60
Weights: [[0.50256948]
 [0.49368503]]
Bias: [1.0015345]
-----------------------------------
Epoch: 80
Weights: [[0.50015296]
 [0.50063332]]
Bias: [0.99943548]
-----------------------------------
Epoch: 100
Weights: [[0.49960328]
 [0.50088987]]
Bias: [0.99970314]
-----------------------------------


(array([[0.49960328],
        [0.50088987]]),
 array([0.99970314]))