## Optimizers


In [1]:
import numpy as np

### Gradient Descent (Batch Gradient Descent)

In [30]:
def gradient_descent_update(parameters, gradients, learning_rate):
    """Take one plain gradient-descent step on every layer's weights and biases.

    The layer count is taken as half the number of entries in ``parameters``;
    layer ``i`` is read from keys ``'W<i>'`` / ``'b<i>'`` and its gradients
    from ``'dW<i>'`` / ``'db<i>'``.

    Args:
        parameters: dict of current weights and biases.
        gradients: dict of gradients matching ``parameters``.
        learning_rate: step size multiplying each gradient.

    Returns:
        The same ``parameters`` dict, with each entry rebound to its updated
        value.
    """
    num_layers = len(parameters) // 2
    for layer in range(1, num_layers + 1):
        # Weights first, then biases, for each layer.
        for kind in ("W", "b"):
            key = f"{kind}{layer}"
            step = learning_rate * gradients[f"d{kind}{layer}"]
            parameters[key] = parameters[key] - step
    return parameters

### Creating Random Mini Batches

In [31]:
def random_minibatches(X_train, Y_train, minibatch_size):
    """Shuffle the training examples and split them into mini-batches.

    Examples are stored as columns: the number of examples ``m`` is read from
    ``X_train.shape[1]`` and both arrays are shuffled along axis 1 with the
    same random permutation, so every input column stays paired with its
    label column.

    Args:
        X_train: input array indexed as ``[:, example]``.
        Y_train: label array indexed as ``[:, example]``. A 2-D array keeps
            its row count, so multi-row labels (e.g. one-hot) are supported.
        minibatch_size: number of examples per mini-batch; must be >= 1.

    Returns:
        A list of ``(minibatch_X, minibatch_Y)`` tuples. Every batch holds
        ``minibatch_size`` columns except possibly the last, which holds the
        remainder when ``m`` is not a multiple of ``minibatch_size``.

    Raises:
        ValueError: if ``minibatch_size`` is less than 1.
    """
    if minibatch_size < 1:
        raise ValueError("minibatch_size must be a positive integer")

    m = X_train.shape[1]

    # A single permutation, used for both arrays, keeps X and Y aligned.
    # NumPy accepts the index array directly; no list conversion is needed.
    permutation = np.random.permutation(m)
    shuffled_X = X_train[:, permutation]
    shuffled_Y = Y_train[:, permutation]
    if shuffled_Y.ndim != 2:
        # Higher-rank labels keep the historical behaviour of being
        # flattened into a single row of m labels.
        shuffled_Y = shuffled_Y.reshape((1, m))

    # Stepping by minibatch_size yields all complete batches followed by the
    # shorter remainder batch (slices are clipped at m automatically).
    return [
        (
            shuffled_X[:, start:start + minibatch_size],
            shuffled_Y[:, start:start + minibatch_size],
        )
        for start in range(0, m, minibatch_size)
    ]
    
    
    

### Momentum

### Initialize Velocities 

In [32]:
def initialize_velocity(parameters):
    """Create zero-filled momentum velocities matching every parameter.

    The layer count is taken as half the number of entries in ``parameters``.
    For each layer ``i`` the result holds ``'dW<i>'`` and ``'db<i>'`` arrays
    of zeros with the shapes of ``parameters['W<i>']`` and
    ``parameters['b<i>']``.

    Args:
        parameters: dict of weights and biases keyed ``'W<i>'`` / ``'b<i>'``.

    Returns:
        dict of zero arrays keyed ``'dW<i>'`` / ``'db<i>'``.
    """
    layer_count = len(parameters) // 2
    velocity = {}
    for idx in range(1, layer_count + 1):
        for kind in ("W", "b"):
            source = parameters[f"{kind}{idx}"]
            velocity[f"d{kind}{idx}"] = np.zeros(source.shape)
    return velocity
        
        

### Update Parameters with Momentum

In [22]:
def momentum_update(parameters, gradients, learning_rate, V, beta1):
    """Perform one gradient-descent-with-momentum step on every layer.

    For each layer the velocity becomes the exponentially weighted average
    ``beta1 * V + (1 - beta1) * gradient``, and the parameter then moves by
    ``learning_rate`` times that new velocity.

    Args:
        parameters: dict of weights and biases keyed ``'W<i>'`` / ``'b<i>'``.
        gradients: dict of gradients keyed ``'dW<i>'`` / ``'db<i>'``.
        learning_rate: step size applied to the velocity.
        V: dict of current velocities keyed like ``gradients``.
        beta1: weight given to the previous velocity.

    Returns:
        ``(parameters, V)`` -- the same two dicts, with their entries rebound
        to the updated values.
    """
    layer_count = len(parameters) // 2
    for idx in range(1, layer_count + 1):
        w_key, b_key = f"W{idx}", f"b{idx}"
        dw_key, db_key = f"dW{idx}", f"db{idx}"

        # Refresh both velocities before touching the parameters.
        V[dw_key] = beta1 * V[dw_key] + (1 - beta1) * gradients[dw_key]
        V[db_key] = beta1 * V[db_key] + (1 - beta1) * gradients[db_key]

        # Step along the smoothed direction.
        parameters[w_key] = parameters[w_key] - learning_rate * V[dw_key]
        parameters[b_key] = parameters[b_key] - learning_rate * V[db_key]

    return parameters, V

### RMS Prop

### Initialize Squared Velocities

In [25]:
def initialize_rms_prop(parameters):
    """Create the zero-filled squared-gradient accumulators used by RMSProp.

    The layer count is taken as half the number of entries in ``parameters``.
    For each layer ``i`` the result holds ``'dW<i>'`` and ``'db<i>'`` arrays
    of zeros shaped like ``parameters['W<i>']`` and ``parameters['b<i>']``.

    Args:
        parameters: dict of weights and biases keyed ``'W<i>'`` / ``'b<i>'``.

    Returns:
        dict of zero arrays keyed ``'dW<i>'`` / ``'db<i>'``.
    """
    layer_count = len(parameters) // 2
    key_pairs = [
        (f"d{kind}{idx}", f"{kind}{idx}")
        for idx in range(1, layer_count + 1)
        for kind in ("W", "b")
    ]
    return {
        grad_key: np.zeros(parameters[param_key].shape)
        for grad_key, param_key in key_pairs
    }
        
        

### Update Parameters with RMS Prop

In [26]:
def rms_prop_update(parameters, gradients, learning_rate, S, beta2, epsilon = 1e-8):
    """Perform one RMSProp step on every layer.

    For each layer the accumulator becomes
    ``beta2 * S + (1 - beta2) * gradient**2`` and the parameter moves by the
    raw gradient scaled with ``learning_rate / (sqrt(S) + epsilon)``.

    Args:
        parameters: dict of weights and biases keyed ``'W<i>'`` / ``'b<i>'``.
        gradients: dict of gradients keyed ``'dW<i>'`` / ``'db<i>'``.
        learning_rate: base step size.
        S: dict of squared-gradient averages keyed like ``gradients``.
        beta2: weight given to the previous squared-gradient average.
        epsilon: small constant added to the square root in the denominator.

    Returns:
        ``(parameters, S)`` -- the same two dicts, with their entries rebound
        to the updated values.
    """
    layer_count = len(parameters) // 2

    for idx in range(1, layer_count + 1):
        w_key, b_key = f"W{idx}", f"b{idx}"
        dw_key, db_key = f"dW{idx}", f"db{idx}"

        # Refresh the running averages of the squared gradients.
        S[dw_key] = beta2 * S[dw_key] + (1 - beta2) * np.square(gradients[dw_key])
        S[db_key] = beta2 * S[db_key] + (1 - beta2) * np.square(gradients[db_key])

        # Per-element step size shrinks where recent gradients were large.
        w_scale = learning_rate / (np.sqrt(S[dw_key]) + epsilon)
        parameters[w_key] = parameters[w_key] - w_scale * gradients[dw_key]
        b_scale = learning_rate / (np.sqrt(S[db_key]) + epsilon)
        parameters[b_key] = parameters[b_key] - b_scale * gradients[db_key]

    return parameters, S

### ADAM

### Adam Initialization

In [37]:
def initialize_adam(parameters):
    """Create the zero-filled first- and second-moment estimates for Adam.

    The layer count is taken as half the number of entries in ``parameters``.
    Both returned dicts hold, for each layer ``i``, ``'dW<i>'`` and
    ``'db<i>'`` arrays of zeros shaped like ``parameters['W<i>']`` and
    ``parameters['b<i>']``. The two dicts do not share any arrays.

    Args:
        parameters: dict of weights and biases keyed ``'W<i>'`` / ``'b<i>'``.

    Returns:
        ``(V, S)`` -- two dicts of zero arrays keyed ``'dW<i>'`` / ``'db<i>'``.
    """
    layer_count = len(parameters) // 2
    first_moment = {}
    second_moment = {}
    for idx in range(1, layer_count + 1):
        for kind in ("W", "b"):
            shape = parameters[f"{kind}{idx}"].shape
            grad_key = f"d{kind}{idx}"
            # Separate allocations so the two moment estimates never alias.
            first_moment[grad_key] = np.zeros(shape)
            second_moment[grad_key] = np.zeros(shape)

    return first_moment, second_moment

### Update Parameters with Adam

In [35]:
def adam_update(parameters, gradients, learning_rate, V, S, t,  beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8):
    """Perform one Adam step on every layer.

    For each parameter the first moment ``V`` and second moment ``S`` are
    refreshed as exponentially weighted averages of the gradient and of the
    squared gradient. Both are then bias-corrected by dividing by
    ``1 - beta**t``, and the parameter moves by
    ``learning_rate * V_corrected / (sqrt(S_corrected) + epsilon)``.

    Args:
        parameters: dict of weights and biases keyed ``'W<i>'`` / ``'b<i>'``.
        gradients: dict of gradients keyed ``'dW<i>'`` / ``'db<i>'``.
        learning_rate: base step size.
        V: dict of first-moment estimates keyed like ``gradients``.
        S: dict of second-moment estimates keyed like ``gradients``.
        t: 1-based count of Adam steps taken so far, including this one.
        beta1: weight given to the previous first-moment estimate.
        beta2: weight given to the previous second-moment estimate.
        epsilon: small constant added to the square root in the denominator.

    Returns:
        ``(parameters, V, S)`` -- the same three dicts, with their entries
        rebound to the updated values.

    Raises:
        ValueError: if ``t`` is less than 1. With ``t == 0`` both
            bias-correction denominators are zero, which would silently fill
            the parameters with NaN/inf.
    """
    # Validate before touching V or S so a bad call leaves state untouched.
    if t < 1:
        raise ValueError("t must be >= 1 (Adam step counter is 1-based)")

    num_layers = len(parameters) // 2

    # The bias-correction denominators depend only on t, so compute them once.
    first_correction = 1 - beta1 ** t
    second_correction = 1 - beta2 ** t

    for layer in range(1, num_layers + 1):
        for kind in ("W", "b"):
            param_key = f"{kind}{layer}"
            grad_key = f"d{kind}{layer}"
            grad = gradients[grad_key]

            # Raw (biased) moment estimates are what persist between steps.
            V[grad_key] = beta1 * V[grad_key] + (1 - beta1) * grad
            S[grad_key] = beta2 * S[grad_key] + (1 - beta2) * np.square(grad)

            # Corrected estimates are only needed for this step's update.
            v_hat = V[grad_key] / first_correction
            s_hat = S[grad_key] / second_correction

            step_scale = learning_rate / (np.sqrt(s_hat) + epsilon)
            parameters[param_key] = parameters[param_key] - step_scale * v_hat

    return parameters, V, S