# In [None]:
import math, copy
import numpy as np
import matplotlib.pyplot as plt
import os

# Plot defaults shared by every figure drawn in this file.
_PLOT_STYLE = {
    # Figure and base font
    'figure.figsize': (10, 6),
    'font.size': 12,
    # Axis labels and titles
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'axes.titleweight': 'bold',
    # Background grid
    'axes.grid': True,
    'grid.alpha': 0.3,
    # Line and marker appearance
    'lines.linewidth': 2,
    'lines.markersize': 8,
    # Tick labels
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
}
plt.rcParams.update(_PLOT_STYLE)

# Training set: two examples, one feature value and one target each.
x_train = np.array(
    [1.0, 2.0],
)  # feature values
y_train = np.array(
    [300.0, 500.0],
)  # target values, index-aligned with x_train

#Function to calculate the cost
def compute_cost(x, y, w, b):
    """
    Computes the squared-error cost of the linear model f(x) = w * x + b.

    Args:
        x (ndarray (m,)): Data, m examples
        y (ndarray (m,)): target values
        w,b (scalar): model parameters
    Returns:
        total_cost (scalar): sum of the squared prediction errors over the
        m examples, scaled by 1 / (2 * m)
    """
    n_examples = x.shape[0]

    # Add up the squared residual of every example, in index order.
    squared_error_sum = sum(
        ((w * x[k] + b) - y[k]) ** 2 for k in range(n_examples)
    )

    return 1 / (2 * n_examples) * squared_error_sum
"""
    In class we were taught that gradient descent is:

    repeat until convergence {
        w = w - α · ∂J(w,b)/∂w        (2)
        b = b - α · ∂J(w,b)/∂b        (3)
    }

    where J(w,b) is the cost and b and w are updated simultaneously.
    The gradient is defined as:

        ∂J(w,b)/∂w = (1/m) · Σ_{i=0}^{m-1} (f_{w,b}(x^(i)) - y^(i)) · x^(i)        (4)
        ∂J(w,b)/∂b = (1/m) · Σ_{i=0}^{m-1} (f_{w,b}(x^(i)) - y^(i))                (5)

    "Simultaneously" means that you calculate the partial derivatives for all
    parameters before updating any of the parameters.

    For the implementation I will use 3 functions:
      - compute_gradient: implements equations (4) and (5)
      - compute_cost: which we already implemented
      - gradient_descent: it will utilize both functions above
"""
def compute_gradient(x,y,w,b):
    """
    Computes the gradient of the squared-error cost for the linear model
    f(x) = w * x + b.

    Args:
        x (ndarray (m,)): Data, m examples
        y (ndarray (m,)): target values
        w,b (scalar): model parameters
    Returns:
        dj_dw (scalar): the gradient of the cost w.r.t. the parameter w
        dj_db (scalar): the gradient of the cost w.r.t. the parameter b
    """
    n_examples = x.shape[0]

    # Prediction error (model output minus target) for each example.
    residuals = [(w * x[k] + b) - y[k] for k in range(n_examples)]

    # dJ/dw weights each error by its feature value; dJ/db uses the raw errors.
    weighted_total = sum(err * x[k] for k, err in enumerate(residuals))
    plain_total = sum(residuals)

    # Average both totals over the number of examples.
    return weighted_total / n_examples, plain_total / n_examples

def gradient_descent(x,y,w_in,b_in, alpha, num_iters, cost_function, gradiente_function):
    """
    Performs gradient descent to fit w,b. Updates w,b by taking
    num_iters gradient steps with learning rate alpha.

    Args:
        x (ndarray (m,)): Data, m examples
        y (ndarray (m,)): target values
        w_in, b_in (scalar): initial values of model parameters
        alpha (float): learning rate
        num_iters (int): number of iterations to run gradient descent
        cost_function: called as cost_function(x, y, w, b) to produce the cost
        gradiente_function: called as gradiente_function(x, y, w, b) to
            produce the gradient as a (dj_dw, dj_db) pair
    Returns:
        w (scalar): updated value of parameter after running gradient descent
        b (scalar): updated value of parameter after running gradient descent
        J_history (list): cost after each of the first 100000 iterations
        p_history (list): parameters [w,b] after each of the first 100000
            iterations
    """
    # Cap on the number of recorded iterations so the history lists stay
    # bounded on very long runs.
    max_history = 100000

    # Every update below rebinds w and b to new values, so the caller's
    # w_in / b_in are never modified and no defensive copy is needed.
    w = w_in
    b = b_in

    # Cost J and parameters at each iteration, primarily for graphing later.
    J_history = []
    p_history = []

    # Progress is printed at 10 evenly spaced iterations (or at every
    # iteration when num_iters < 10). Computed once; it is loop-invariant.
    print_every = max(1, math.ceil(num_iters / 10))

    for i in range(num_iters):
        # Both partial derivatives are taken at the current (w, b) before
        # either parameter changes, which makes the update simultaneous.
        dj_dw, dj_db = gradiente_function(x, y, w, b)

        # Gradient step: parameter = parameter - alpha * gradient.
        b = b - alpha * dj_db
        w = w - alpha * dj_dw

        should_record = i < max_history
        should_report = i % print_every == 0

        # Evaluate the cost whenever it is stored or printed, so the progress
        # line shows the current cost even after the history cap is reached.
        if should_record or should_report:
            cost = cost_function(x, y, w, b)

        if should_record:
            J_history.append(cost)
            p_history.append([w, b])

        if should_report:
            print(f"Iteration {i:4}: COst {cost:0.2e} ",
                  f"dj_dw: {dj_dw: 0.3e}, dj_db: {dj_db: 0.3e} ",
                  f"w: {w: 0.3e}, b:{b: 0.5e}")

    return w, b, J_history, p_history  # histories are returned for graphing
    
# Starting point for the model parameters.
w_init = 0
b_init = 0

# Settings for the run: number of gradient steps and learning rate.
iterations = 10_000
tmp_alpha = 0.01

# Fit w and b on the training set; the two histories are kept for graphing.
w_final, b_final, J_hist, p_hist = gradient_descent(
    x_train,
    y_train,
    w_init,
    b_init,
    tmp_alpha,
    iterations,
    compute_cost,
    compute_gradient,
)
print(f"(w,b) found by gradient descent: ({w_final:8.4f}, {b_final:8.4f})")