## Gradient Descent

In [34]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap

# Introduction to Gradient Descent in NLP

Gradient Descent is an optimization algorithm used to minimize a cost function in machine learning models, particularly in NLP tasks. It works by iteratively adjusting model parameters in the direction that reduces the cost function the most.

The general update rule for Gradient Descent is:

$$
\theta^{(t+1)} = \theta^{(t)} - \alpha \frac{\partial J(\theta)}{\partial \theta^{(t)}}
$$

Where:
- The current parameter value is $\theta^{(t)}$.
- The learning rate is $\alpha$.
- The gradient of the cost function with respect to the parameters, evaluated at the current parameter value, is $\frac{\partial J(\theta)}{\partial \theta^{(t)}}$.

In [None]:
# Training set: 12 pairs {x_i, y_i} that the straight-line model will be fitted to.
# Row 0 holds the inputs x_i, row 1 holds the matching targets y_i.
data = np.array([
    [0.03, 0.19, 0.34, 0.46, 0.78, 0.81, 1.08, 1.18, 1.39, 1.60, 1.65, 1.90],  # x
    [0.67, 0.85, 1.05, 1.00, 1.40, 1.50, 1.30, 1.54, 1.55, 1.68, 1.73, 1.60],  # y
])

In [None]:
# Let's define our model -- just a straight line with intercept theta[0] and slope theta[1]
def model(theta, x):
    """
    Evaluates the straight-line model at x.

    Parameters:
    theta : array-like
        Model parameters; theta[0] is used as the intercept and theta[1] as the slope.
    x : scalar or array-like
        Input value(s) at which to evaluate the line.

    Returns:
    The prediction theta[0] + theta[1] * x.
    """
    intercept = theta[0]
    slope = theta[1]
    return intercept + slope * x

In [None]:
# Draw model
def draw_model(data, model, theta, title=None):
    """
    Plots the data points together with the curve predicted by the model.

    Parameters:
    data : array
        Row 0 is plotted as the x coordinates and row 1 as the y coordinates.
    model : function
        Called as model(theta, x) to obtain the curve that is drawn.
    theta : array-like
        Model parameters, forwarded unchanged to `model`.
    title : str, optional
        Title of the figure; no title is set when None.
    """
    # Sample the model on a regular grid covering the plotted x-range
    x_grid = np.arange(0, 2, 0.01)
    y_grid = model(theta, x_grid)

    fig, ax = plt.subplots()
    ax.plot(data[0, :], data[1, :], 'bo')  # data points: blue circles
    ax.plot(x_grid, y_grid, 'm-')          # model: magenta line

    # Fixed square viewport so that successive plots are directly comparable
    ax.set(xlim=[0, 2], ylim=[0, 2], xlabel='x', ylabel='y')
    ax.set_aspect('equal')

    if title is not None:
        ax.set_title(title)

    plt.show()

How It Works:
- The x_model values are generated from 0 to 2 with small increments (0.01).
- The y_model values are computed using the model function and the current parameters theta.
- The data points and the predicted line are plotted using Matplotlib.

In [None]:
# Start from an arbitrary guess for the parameters (a 2x1 column) and draw the model
theta = np.array([[0.6],      # Intercept
                  [-0.2]])    # Slope

# Show the line produced by this initial guess on top of the data
draw_model(data, model, theta, title="Initial parameters")

Now let's compute the sum of squares loss for the training data

### Sum of Squares Loss Function

The **sum of squares** loss is a commonly used loss function in regression problems. It adds up the squared differences between the predicted values and the actual values. It differs from the **Mean Squared Error (MSE)** only in that the total is not divided by the number of data points $N$.

#### Steps to Compute the Loss:

1. **Model Predictions**: Use the model function to compute predictions for the input data ($ \ \text{data\_x} \ $) using the current parameters $ \ \theta \ $.
2. **Squared Differences**: Compute the squared difference between the predicted values ($ \ y_{\text{pred}} \ $) and the actual values ($ \ y_{\text{true}} \ $).
3. **Sum of Squared Differences**: Sum all these squared differences to get the total loss.

#### Sum of Squares Formula:

$$
\text{Loss} = \sum_{i=1}^{N} \left( y_{\text{pred}}^{(i)} - y_{\text{true}}^{(i)} \right)^2
$$

Where:
- $ \ y_{\text{pred}} \ $ is the predicted value from the model.
- $ \ y_{\text{true}} \ $ is the actual value from the dataset.
- $ \ N \ $ is the number of data points.

In [None]:
def compute_loss(data_x, data_y, model, theta):
    """
    Computes the sum of squares loss between predicted and true values.

    The squared residuals are summed, not averaged: the result is NOT divided
    by the number of data points.

    Parameters:
    data_x : array-like
        Input feature data (independent variable).
    data_y : array-like
        True output data (dependent variable).
    model : function
        Called as model(theta, data_x) to predict `y` values.
    theta : array-like
        Model parameters, forwarded unchanged to `model`.

    Returns:
    loss : float
        Sum over all data points of (prediction - true value) squared.
    """
    # Signed error of every prediction
    residuals = model(theta, data_x) - data_y

    # Square each error and add them all up
    return np.sum(residuals ** 2)

Let's just test that we got that right



In [None]:
# Sanity check: evaluate the loss at intercept 0.6 / slope -0.2 and compare with the reference value
loss = compute_loss(
    data_x=data[0, :],
    data_y=data[1, :],
    model=model,
    theta=np.array([[0.6], [-0.2]]),
)
print('Your loss = %3.3f, Correct loss = %3.3f' % (loss, 12.367))

Now let's plot the whole loss function

In [None]:
def draw_loss_function(compute_loss, data, model, theta_iters=None):
    """
    Draws the loss as a filled contour map over a grid of (intercept, slope) values.

    Parameters:
    compute_loss : function
        Called as compute_loss(x, y, model, theta) once for every grid point.
    data : array
        Row 0 is passed to `compute_loss` as x and row 1 as y.
    model : function
        Forwarded unchanged to `compute_loss`.
    theta_iters : array, optional
        When given, row 0 (intercepts) is plotted against row 1 (slopes) as a
        connected path of green markers on top of the contours.
    """
    # Colormap entries, each written as an 'rrggbb' hex string
    my_colormap_vals_hex = ('2a0902', '2b0a03', '2c0b04', '2d0c05', '2e0c06', '2f0d07', '300d08', '310e09', 
                            '320f0a', '330f0b', '34100b', '35110c', '36110d', '37120e', '38120f', 
                            # Add more color values as needed...
                           )

    # Unpack every 24-bit colour into its three 8-bit channels, then scale to [0, 1]
    packed = np.array([int(hex_code, base=16) for hex_code in my_colormap_vals_hex])
    red = packed >> 16
    green = (packed >> 8) & 0xFF
    blue = packed & 0xFF
    my_colormap = ListedColormap(np.array([red, green, blue]).T / 255.0)

    # Grid of candidate parameters: intercepts along the columns, slopes along the rows
    intercept_axis = np.arange(0.0, 2.0, 0.02)
    slope_axis = np.arange(-1.0, 1.0, 0.002)
    intercepts_mesh, slopes_mesh = np.meshgrid(intercept_axis, slope_axis)
    loss_mesh = np.zeros_like(slopes_mesh)

    # Evaluate the loss at every (intercept, slope) pair of the grid
    xs, ys = data[0, :], data[1, :]
    for idx in np.ndindex(slopes_mesh.shape):
        theta_here = np.array([[intercepts_mesh[idx]], [slopes_mesh[idx]]])
        loss_mesh[idx] = compute_loss(xs, ys, model, theta_here)

    fig, ax = plt.subplots()
    fig.set_size_inches(8, 8)

    # Filled contours for the loss surface, with thin semi-transparent grey iso-lines on top
    ax.contourf(intercepts_mesh, slopes_mesh, loss_mesh, 256, cmap=my_colormap)
    ax.contour(intercepts_mesh, slopes_mesh, loss_mesh, 40, colors=['#80808080'])

    # Overlay the optimisation trajectory when one is supplied
    if theta_iters is not None:
        ax.plot(theta_iters[0, :], theta_iters[1, :], 'go-')

    # Limits are given high-to-low, so the slope axis is drawn inverted
    ax.set_ylim([1, -1])
    ax.set_xlabel('Intercept')
    ax.set_ylabel('Slope')

    plt.show()

In [None]:
# Plot the loss surface on its own (no optimisation trajectory yet)
draw_loss_function(compute_loss=compute_loss, data=data, model=model)

### Sum of Squares Loss Function:

The sum of squares loss function is given by:

$$
J(\theta) = \sum_{i=1}^{N} \left( y_{\text{pred}}^{(i)} - y_{\text{true}}^{(i)} \right)^2
$$


Where:
- $ \ y_{\text{pred}}^{(i)} = \theta_0 + \theta_1 x^{(i)} \ $ is the predicted value for the $ \ i\ $-th data point.
- $ \ y_{\text{true}}^{(i)} \ $ is the actual value for the $ \ i\ $-th data point.
- $ \ N \ $ is the number of data points.

---

### Gradient Calculation:

We need to compute the partial derivatives of the loss function with respect to $\ \theta_0 \ $ and $ \ \theta_1 \ $ (the intercept and slope, respectively).

#### Derivative with respect to $ \ \theta_0 \ $ (intercept):

$$
\frac{\partial J(\theta)}{\partial \theta_0} = 2 \sum_{i=1}^{N} \left( y_{\text{pred}}^{(i)} - y_{\text{true}}^{(i)} \right)
$$


#### Derivative with respect to $ \ \theta_1 \ $ (slope):

$$
\frac{\partial J(\theta)}{\partial \theta_1} = 2 \sum_{i=1}^{N} \left( y_{\text{pred}}^{(i)} - y_{\text{true}}^{(i)} \right) x^{(i)}
$$


Where:
- $ \ y_{\text{pred}}^{(i)} = \theta_0 + \theta_1 x^{(i)} \ $ is the predicted value for the $ \ i\ $-th data point.
- $ \ x^{(i)} \ $ is the feature value for the $ \ i\ $-th data point.

In [None]:
def compute_gradient(data_x, data_y, theta):
    """
    Computes the gradient of the sum of squares loss function with respect to theta (parameters).

    For the loss  sum_i (theta[0] + theta[1] * x_i - y_i) ** 2  the derivatives are
        d/dtheta_0 = 2 * sum_i (y_pred_i - y_i)
        d/dtheta_1 = 2 * sum_i (y_pred_i - y_i) * x_i
    There is no 1/N factor because the loss is a sum, not a mean.

    Parameters:
    data_x : array-like
        Input feature data (independent variable).
    data_y : array-like
        True output data (dependent variable).
    theta : array-like
        Model parameters (weights: intercept theta[0] and slope theta[1]).

    Returns:
    gradient : ndarray of shape (2, 1)
        The gradient of the loss function with respect to theta_0 and theta_1.
    """
    # Accept plain sequences as well as arrays
    data_x = np.asarray(data_x)
    data_y = np.asarray(data_y)

    # Signed prediction errors of the line y_pred = theta[0] + theta[1] * x
    residuals = theta[0] + theta[1] * data_x - data_y

    # Chain rule on the squared residuals; the inner derivative is 1 for the
    # intercept and x for the slope
    dl_dtheta0 = 2 * np.sum(residuals)
    dl_dtheta1 = 2 * np.sum(residuals * data_x)

    # Return the gradient as a column vector
    return np.array([[dl_dtheta0], [dl_dtheta1]])

In [None]:
# Compute the gradient using your function
gradient = compute_gradient(data[0,:], data[1,:], theta)
# Index down to scalars before formatting: passing a size-1 array to %f relies on an
# implicit array-to-scalar conversion that NumPy has deprecated.
print("Your gradients: (%3.3f, %3.3f)" % (gradient[0, 0], gradient[1, 0]))

# Approximate the gradients with (forward) finite differences
delta = 0.0001

# Loss at the unperturbed parameters; both estimates below share it
loss_at_theta = compute_loss(data[0,:], data[1,:], model, theta)

# Finite difference approximation for theta_0 (intercept): nudge only the intercept
dl_dtheta0_est = (compute_loss(data[0,:], data[1,:], model, theta + np.array([[delta], [0]])) -
                  loss_at_theta) / delta

# Finite difference approximation for theta_1 (slope): nudge only the slope
dl_dtheta1_est = (compute_loss(data[0,:], data[1,:], model, theta + np.array([[0], [delta]])) -
                  loss_at_theta) / delta

print("Approx gradients: (%3.3f, %3.3f)" % (dl_dtheta0_est, dl_dtheta1_est))

# There might be small differences in the last significant figure because
# finite differences are only an approximation of the true gradient

In [None]:
def loss_function_1D(dist_prop, data, model, theta_start, search_direction):
    """
    Evaluates the loss at a point reached by moving from theta_start along search_direction.

    Parameters:
    - dist_prop: Multiplier applied to the search direction (how far to move).
    - data: The dataset; row 0 is used as x and row 1 as y.
    - model: The model used to make predictions.
    - theta_start: The parameters (theta) the move starts from.
    - search_direction: The direction along which to move.

    Returns:
    - The loss computed at theta_start + search_direction * dist_prop.
    """
    theta_moved = theta_start + search_direction * dist_prop
    data_x, data_y = data[0, :], data[1, :]
    return compute_loss(data_x, data_y, model, theta_moved)

def line_search(data, model, theta, gradient, thresh=0.00001, max_dist=0.1, max_iter=15, verbose=False):
    """
    Bracketing line search for the step size that minimises the loss along a direction.

    Four distances a <= b <= c <= d along the search direction are maintained and
    the bracket [a, d] is shrunk every iteration until the two interior points
    are closer than `thresh` or `max_iter` iterations have been performed.

    Parameters:
    - data: The dataset (x and y values).
    - model: The model used to make predictions.
    - theta: Current parameters (intercept and slope); the search starts here.
    - gradient: The direction to search along. The loss is evaluated at
      theta + gradient * distance, so pass the negative gradient to go downhill.
    - thresh: The search stops once |b - c| is no larger than this.
    - max_dist: Largest distance considered along the search direction.
    - max_iter: Maximum number of iterations for line search.
    - verbose: If True, prints the four points and their losses every iteration.

    Returns:
    - The step size found: the midpoint of the two interior points b and c.
    """

    # Four points spread over the range [0, max_dist] that will be searched
    a, b, c, d = 0, 0.33 * max_dist, 0.66 * max_dist, 1.0 * max_dist
    n_iter = 0

    # Keep shrinking until the interior points (almost) coincide;
    # the iteration cap guards against an infinite loop
    while np.abs(b - c) > thresh and n_iter < max_iter:
        n_iter += 1

        # Loss at each of the four points, in the order a, b, c, d
        lossa, lossb, lossc, lossd = [
            loss_function_1D(dist, data, model, theta, gradient) for dist in (a, b, c, d)
        ]

        if verbose:
            print(f'Iter {n_iter}, a={a:.3f}, b={b:.3f}, c={c:.3f}, d={d:.3f}')
            print(f'a {lossa:.6f}, b {lossb:.6f}, c {lossc:.6f}, d {lossd:.6f}')

        if np.argmin((lossa, lossb, lossc, lossd)) == 0:
            # Rule #1: A has the lowest loss of all four, so halve the
            # distance from A to each of B, C and D
            b, c, d = [a + (point - a) / 2 for point in (b, c, d)]
        else:
            if lossb < lossc:
                # Rule #2: B is lower than C -> drop the right end, D becomes C
                d = c
            else:
                # Rule #3: otherwise -> drop the left end, A becomes B
                a = b
            # In both cases B and C move to 1/3 and 2/3 of the new [A, D] bracket
            b = a + (d - a) / 3
            c = a + 2 * (d - a) / 3

    # The midpoint of the two interior points is the chosen step size
    return (b + c) / 2.0

In [None]:
def gradient_descent_step(theta, data, model):
    """
    Takes a single gradient descent step from theta.

    Parameters:
    - theta: Current parameters (intercept and slope).
    - data: The dataset; row 0 is used as x and row 1 as y.
    - model: The model used to make predictions.

    Returns:
    - The parameters after moving from theta along the negative gradient by
      the step size chosen by line_search.
    """

    # Downhill direction: the negative of the gradient at the current parameters
    descent_direction = -compute_gradient(data[0, :], data[1, :], theta)

    # How far to move along that direction, chosen by the line search
    step_size = line_search(data, model, theta, descent_direction)

    # A new array is returned; the caller's theta is not modified
    return theta + step_size * descent_direction

In [None]:
# Run gradient descent from a fixed starting point, drawing the model after every step.
# Column k of theta_all holds the parameters after k steps (column 0 = starting point).
n_steps = 10
theta_all = np.zeros((2, n_steps + 1))
theta_all[0, 0] = 1.6  # Initial intercept
theta_all[1, 0] = -0.5  # Initial slope

# Measure loss and draw initial model
# (slices such as [:, 0:1] keep theta as a 2x1 column instead of a flat vector)
loss = compute_loss(data[0, :], data[1, :], model, theta_all[:, 0:1])
draw_model(data, model, theta_all[:, 0:1], "Initial parameters, Loss = %f" % (loss))

# Repeatedly take gradient descent steps
for c_step in range(n_steps):
    # Do gradient descent step and store the result in the next column
    theta_all[:, c_step+1:c_step+2] = gradient_descent_step(theta_all[:, c_step:c_step+1], data, model)

    # Use the same 2x1 column view for the loss and the plot, so draw_model
    # receives theta in the same shape as in every other call
    theta_new = theta_all[:, c_step+1:c_step+2]

    # Measure loss and draw model
    loss = compute_loss(data[0, :], data[1, :], model, theta_new)
    draw_model(data, model, theta_new, "Iteration %d, loss = %f" % (c_step + 1, loss))

# Draw the trajectory on the loss function
draw_loss_function(compute_loss, data, model, theta_all)