In [1]:
import numpy as np

In [None]:
class Optimizer_SDG:
    """Gradient-descent optimizer with optional learning-rate decay and momentum.

    When momentum is enabled, the previous update of every layer is kept on
    the layer object itself (``weight_momentums`` / ``bias_momentums``).
    """

    def __init__(self, learning_rate=1, learning_rate_decay=0, momentum=0):
        """Record the hyper-parameters and start the step counter at zero.

        learning_rate       -- base step size
        learning_rate_decay -- decay coefficient; a falsy value disables decay
        momentum            -- fraction of the previous update that is carried
                               over; a falsy value disables momentum
        """
        self.steps = 0
        self.momentum = momentum
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate

    def pre_update_params(self):
        """Recompute ``current_learning_rate`` from the step counter (decay only)."""
        if not self.learning_rate_decay:
            return
        decay_factor = 1 / (1 + self.learning_rate_decay * self.steps)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Apply one update to ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` as the gradients.
        """
        step_size = self.current_learning_rate

        if not self.momentum:
            # Plain gradient step
            weight_step = -step_size * layer.dweights
            bias_step = -step_size * layer.dbiases
        else:
            # Lazily create the momentum buffers the first time a layer is seen
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            # The new step is the decayed previous step minus the gradient
            # step; it is also remembered as the next "previous step"
            weight_step = self.momentum * layer.weight_momentums - step_size * layer.dweights
            layer.weight_momentums = weight_step

            bias_step = self.momentum * layer.bias_momentums - step_size * layer.dbiases
            layer.bias_momentums = bias_step

        layer.weights += weight_step
        layer.biases += bias_step

    def post_update_params(self):
        """Advance the step counter used by the decay schedule."""
        self.steps += 1

In [None]:
class Optimizer_AdaGrad:
    """AdaGrad optimizer with optional learning-rate decay.

    Each layer accumulates the sum of its squared gradients in
    ``weight_cache`` / ``bias_cache`` (stored on the layer), and every step is
    divided by the square root of that running sum.
    """

    def __init__(self, learning_rate=1, learning_rate_decay=0, epsilon=1e-7):
        """Record the hyper-parameters and start the step counter at zero.

        learning_rate       -- base step size
        learning_rate_decay -- decay coefficient; a falsy value disables decay
        epsilon             -- small constant added to the denominator
        """
        self.steps = 0
        self.epsilon = epsilon
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate

    def pre_update_params(self):
        """Recompute ``current_learning_rate`` from the step counter (decay only)."""
        decay = self.learning_rate_decay
        if decay:
            self.current_learning_rate = self.learning_rate * (1 / (1 + decay * self.steps))

    def _scaled_step(self, gradient, cache):
        """Return the gradient step divided by ``sqrt(cache) + epsilon``."""
        return -self.current_learning_rate * gradient / (np.sqrt(cache) + self.epsilon)

    def update_params(self, layer):
        """Apply one update to ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` as the gradients.
        """
        # Lazily create the accumulators the first time a layer is seen
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Accumulate squared gradients (the sum only ever grows)
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2

        layer.weights += self._scaled_step(layer.dweights, layer.weight_cache)
        layer.biases += self._scaled_step(layer.dbiases, layer.bias_cache)

    def post_update_params(self):
        """Advance the step counter used by the decay schedule."""
        self.steps += 1

In [None]:
class Optimizer_RMSprop:
    """RMSprop optimizer with optional learning-rate decay.

    Each layer keeps an exponential moving average of its squared gradients
    in ``weight_cache`` / ``bias_cache`` (stored on the layer), and every step
    is divided by the square root of that average.
    """

    def __init__(self, learning_rate=0.001, learning_rate_decay=0, epsilon=1e-7, rho=0.9):
        """Record the hyper-parameters and start the step counter at zero.

        learning_rate       -- base step size
        learning_rate_decay -- decay coefficient; a falsy value disables decay
        epsilon             -- small constant added to the denominator
        rho                 -- weight given to the previous cache value
        """
        self.rho = rho
        self.epsilon = epsilon
        self.steps = 0
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate

    def pre_update_params(self):
        """Recompute ``current_learning_rate`` from the step counter (decay only)."""
        if not self.learning_rate_decay:
            return
        shrink = 1 / (1 + self.learning_rate_decay * self.steps)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Apply one update to ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` as the gradients.
        """
        missing_cache = not hasattr(layer, 'weight_cache')
        if missing_cache:
            # First visit to this layer: start both averages at zero
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        old_share = self.rho
        new_share = 1 - self.rho

        # Blend the previous average with the current squared gradient
        layer.weight_cache = old_share * layer.weight_cache + new_share * layer.dweights ** 2
        layer.bias_cache = old_share * layer.bias_cache + new_share * layer.dbiases ** 2

        step_size = self.current_learning_rate
        weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
        layer.weights += -step_size * layer.dweights / weight_scale
        bias_scale = np.sqrt(layer.bias_cache) + self.epsilon
        layer.biases += -step_size * layer.dbiases / bias_scale

    def post_update_params(self):
        """Advance the step counter used by the decay schedule."""
        self.steps += 1

In [None]:
class Optimizer_Adam:
    """Adam optimizer with optional learning-rate decay.

    For every layer it keeps two exponential moving averages on the layer
    object itself: the gradient (``weight_momentums`` / ``bias_momentums``)
    and the squared gradient (``weight_cache`` / ``bias_cache``). Both are
    bias-corrected with the step counter before being used.
    """

    # Per-layer state buffers this optimizer needs. All four must be present
    # before an update can be computed.
    _STATE_ATTRS = ('weight_momentums', 'weight_cache',
                    'bias_momentums', 'bias_cache')

    def __init__(self,learning_rate=0.001, learning_rate_decay = 0,epsilon = 1e-7, beta_1 = 0.9, beta_2 = 0.999):
        """Record the hyper-parameters and start the step counter at zero.

        learning_rate       -- base step size
        learning_rate_decay -- decay coefficient; a falsy value disables decay
        epsilon             -- small constant added to the denominator
        beta_1              -- weight given to the previous momentum value
        beta_2              -- weight given to the previous cache value
        """
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.steps = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        """Recompute ``current_learning_rate`` from the step counter (decay only)."""
        # Update current learning rate if there is a decay function
        if self.learning_rate_decay:
            self.current_learning_rate = self.learning_rate * (1/ (1+self.learning_rate_decay*self.steps))

    def update_params(self,layer):
        """Apply one Adam update to ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` as the gradients.

        The state buffers are created lazily. If the layer is missing ANY of
        the four buffers (for example it only carries ``weight_cache`` /
        ``bias_cache`` left behind by a cache-only optimizer), all four are
        reset to zero, because the bias correction below assumes the averages
        started from zero. Checking only ``weight_cache`` would instead skip
        initialisation and fail with ``AttributeError`` on the momentums.
        """
        if not all(hasattr(layer, name) for name in self._STATE_ATTRS):
            # Store momentums and caches in the layer
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)

            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        # steps starts at 0, so the correction exponent is steps + 1
        momentum_correction = 1 - self.beta_1 ** (self.steps + 1)
        cache_correction = 1 - self.beta_2 ** (self.steps + 1)

        # Update momentums
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1-self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1-self.beta_1) * layer.dbiases

        # Corrected momentums (1 - b_1^step)
        weight_momentums_corrected = layer.weight_momentums / momentum_correction
        bias_momentums_corrected = layer.bias_momentums / momentum_correction

        # Update Cache
        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights ** 2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases ** 2

        # Corrected cache (1 - b_2^step)
        weight_cache_corrected = layer.weight_cache / cache_correction
        bias_cache_corrected = layer.bias_cache / cache_correction

        layer.weights +=  -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases +=  -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

    def post_update_params(self):
        """Advance the step counter used by decay and bias correction."""
        self.steps += 1