## Gradient Descent

### Batch Gradient Descent
Compute gradients for the whole batch, adjusting theta by minus it against the partial derivative of it with a leanring rate multiplier. The higher the rate the fast, but may result in overshooting. The lower the rate the slower, but may stuck.


### Stochastic Gradient Descent
Picks a random instance of the training set at every step and compute gradients at the step. This is fast due to size for each step, and makes it possible for training large datasets.  However, less regular due to the random nature and cost may go up and down. 

In [6]:
# MSE cost function (root mean square error)
# typically used as error function for regression models
# the goal is to minimize the error function by finding tbe best
# thetas
def MSE(X, Theta, y):
    y_hat = X.dot(Theta)
    MSE = np.sum(np.square(y_hat - y))
    return MSE

# Vectorised version of partial differential equation 
def d_MSE(X_b, theta, y):    
    return 2 / len(X) * X_b.T.dot(X_b.dot(theta) - y)


# Flexible learning rate, decrease over t
def learning_schedule(t, t0 = 5, t1 = 50):
    return t0 / (t + t1)

# stochastic Gradient Descent
# initial hyper parameter to adjust to ensure no zero division and reduce learning rate when epoch is higher
def SGD(n_epochs, X, theta, y, t0 = 5, t1 = 50):   
    m = len(X)
    for epoch in range(n_epochs):
        for i in range(m):        
            random_index = np.random.randint(m)
            xi = X[random_index:random_index + 1]
            yi = y[random_index:random_index + 1]
            
            gradients = d_MSE(xi, theta, yi)
            
            learning_rate = learning_schedule(epoch + i)
            theta = theta - learning_rate * gradients                        
        if epoch % 100 == 0:
            print(f"epoch {epoch}, {MSE(X, theta, y)}, {learning_rate}")                
    return theta

# mini batch gradient descent
# same as stocastic, but mini random batches of a time
def mini_batch_SGD(n_epochs, X, theta, y, batch_size = 1, t0 = 5, t1 = 50):   
    m = len(X)
    for epoch in range(n_epochs):
        for i in range(m):        
            random_index = np.random.randint(m)
            xi = X[random_index:random_index + batch_size]
            yi = y[random_index:random_index + batch_size]
            
            gradients = d_MSE(xi, theta, yi)
            
            learning_rate = t0 / (epoch * m + i + t1)
            theta = theta - learning_rate * gradients                                          
    return theta


In [7]:
###import librarries
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

### - Generate random dataset with default size = 100 and intercept at around 5
### - Adding some noices around the line. 
### - Note the slope and intercept (As described above)
def generate_random_data(size = 100, intercept = 5, noise = True):
    X = 2 * np.random.rand(size, 1)
    y = 3 * X + intercept
    if noise:
      y = y + np.random.rand(size, 1) * intercept / 5
    return X, y

X, y = generate_random_data(size = 200, intercept = 10, noise = True)

X_with_bias = np.c_[np.ones((len(X), 1)), X]
theta = np.random.rand(2, 1) 

print(SGD(1000, X_with_bias, theta, y))
print(mini_batch_SGD(100, X_with_bias, theta, y, 100))

epoch 0, 23588.665477461363, 0.020080321285140562
epoch 100, 269.65263308778145, 0.014326647564469915
epoch 200, 134.26578365722804, 0.011135857461024499
epoch 300, 96.63412773069106, 0.009107468123861567
epoch 400, 82.7996389582145, 0.007704160246533128
epoch 500, 76.35809917559075, 0.006675567423230975
epoch 600, 72.94899607942767, 0.005889281507656066
epoch 700, 71.03906600194541, 0.005268703898840885
epoch 800, 69.90909947928242, 0.004766444232602479
epoch 900, 69.21041203430285, 0.004351610095735422
[[10.77301113]
 [ 3.18765278]]
[[10.79155133]
 [ 3.19731374]]
