<a href="https://colab.research.google.com/github/anubhavgupta1/DeepLearning/blob/master/6_Gradient_Descent_algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Publisher : Anubhav Gupta
Contact : anubhav.gupta62@gmail.com, +91- 90418 -28524

**OUTLINE**

1. Modified SN Class
2. Overall setup - What is data, model task
3. Plotting Functions - 3d, Contour
4. Individual algorithms and how they perform



In [0]:
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import matplotlib.colors
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import animation, rc
import numpy as np
from IPython.display import HTML


In [0]:
my_cmap = matplotlib.colors.LinearSegmentedColormap.from_list("",["red","yellow","green"])


In [0]:
class SN:

  def __init__(self,w_init,b_init,algo):
    self.w = w_init
    self.b = b_init
    self.w_h = []
    self.b_h = []
    self.e_h = []
    self.algo = algo

  def sigmoid(self,x,w=None,b=None):
    if w is None:
      w = self.w
    if b is None:
      b = self.b
    return 1.0/(1.0 + np.exp(-(w*x + b )))
  
  def error(self,X,Y,w=None,b=None):
    if w is None:
      w = self.w
    if b is None:
      b = self.b
    err = 0
    for x,y in zip(X,Y):
      err += 0.5 * (self.sigmoid(x,w,b)-y) ** 2
    return err
  
  def grad_w(self,x,y,w=None,b=None):
    if w is None:
      w = self.w
    if b is None:
      b = self.b
    y_pred = self.sigmoid(x,w,b)
    return (y_pred - y) * y_pred * (1-y_pred) * x
  
  def grad_b(self,x,y,w=None,b=None):
    if w is None:
      w = self.w
    if b is None:
      b = self.b
    y_pred = self.sigmoid(x,w,b)
    return (y_pred - y) * y_pred * (1-y_pred)
  
  def append_log(self):
    self.w_h.append(self.w)
    self.b_h.append(self.b)
    self.e_h.append(self.error(self.X,self.Y)) 
  
  def fit(self, X,Y,epochs =100, eta = 0.01,gamma = 0.9,mini_batch_size = 100, eps= 1e-8, beta = 0.9, beta1 = 0.9,beta2=0.9 ):
    self.w_h = []
    self.b_h = []
    self.e_h = []
    self.X = X
    self.Y = Y

    if self.algo == 'GD':
      for i in range(epochs):
        dw , db = 0 , 0
        for x , y in zip(X,Y):
          dw += self.grad_w(x,y)
          db += self.grad_b(x,y)
        
        m = X.shape[0]
        self.w -= eta * (dw/m)
        self.b -= eta * (db/m)
        self.append_log()

    elif self.algo == 'MOMENTUM':
      v_w_prev, v_b_prev = 0,0
      for i in range(epochs):
        dw , db = 0 , 0
        for x , y in zip(X,Y):
          dw += self.grad_w(x,y)
          db += self.grad_b(x,y)
        
        m = X.shape[0]
        v_w = gamma * v_w_prev + eta * (dw/m)
        v_b = gamma * v_b_prev + eta * (db/m)
        self.w -= v_w
        self.b -= v_b
        v_w_prev = v_w
        v_b_prev = v_b
        self.append_log()

    elif self.algo == 'NAG':
      v_w_prev, v_b_prev = 0,0
      for i in range(epochs):
        dw , db = 0 , 0
        v_w = gamma * v_w_prev  
        v_b = gamma * v_b_prev
        for x , y in zip(X,Y):
          dw += self.grad_w(x,y,self.w - v_w,self.b - v_b)
          db += self.grad_b(x,y,self.w - v_w,self.b - v_b)
        
        m = X.shape[0]
        v_w = gamma * v_w_prev + eta * (dw/m)
        v_b = gamma * v_b_prev + eta * (db/m)
        self.w -= v_w
        self.b -= v_b
        v_w_prev = v_w
        v_b_prev = v_b
        self.append_log()

    elif self.algo == 'MiniBatch':
      for i in range(epochs):
        dw , db = 0 , 0
        points_seen = 0 
        for x , y in zip(X,Y):
          dw += self.grad_w(x,y)
          db += self.grad_b(x,y)
          points_seen += 1
          if points_seen % mini_batch_size == 0:
            self.w -= eta * (dw/mini_batch_size)
            self.b -= eta * (db/mini_batch_size)
            self.append_log()
            dw , db = 0 , 0

    elif self.algo == 'AdaGrad':
      v_w,v_b = 0,0
      for i in range(epochs):
        dw , db = 0 , 0
        for x , y in zip(X,Y):
          dw += self.grad_w(x,y)
          db += self.grad_b(x,y)
        v_w += dw ** 2
        v_b += db ** 2
        m_w = np.sqrt(v_w) + eps
        m_b = np.sqrt(v_b) + eps
        self.w -= eta * (dw/m_w)
        self.b -= eta * (db/m_b)
        self.append_log() 

    elif self.algo == 'RMSProp':
      v_w,v_b = 0,0
      for i in range(epochs):
        dw , db = 0 , 0
        for x , y in zip(X,Y):
          dw += self.grad_w(x,y)
          db += self.grad_b(x,y)
        v_w = beta* v_w + (1 - beta) * (dw ** 2)
        v_b = beta* v_b + (1 - beta) * (db ** 2)
        m_w = np.sqrt(v_w) + eps
        m_b = np.sqrt(v_b) + eps
        self.w -= eta * (dw/m_w)
        self.b -= eta * (db/m_b)
        self.append_log()

    elif self.algo == 'Adam':
      v_w,v_b = 0,0
      m_w,m_b = 0,0
      num_updates = 0
      for i in range(epochs):
        dw , db = 0 , 0
        for x , y in zip(X,Y):
          dw += self.grad_w(x,y)
          db += self.grad_b(x,y)
          num_updates += 1
          m_w = beta1 * m_w + (1 - beta1) * (dw)
          m_b = beta1 * m_b + (1 - beta1) * (db)
          v_w = beta2 * v_w + (1 - beta2) (dw ** 2)
          v_b = beta2 * v_b + (1 - beta2) (db ** 2)
          m_w_c = m_w / (1-np.power(beta1,num_updates))
          m_b_c = m_b / (1-np.power(beta1,num_updates))
          v_w_c = v_w / (1-np.power(beta2,num_updates))
          v_b_c = v_b / (1-np.power(beta2,num_updates))
          n_w = np.sqrt(v_w_c) + eps
          n_b = np.sqrt(v_b_c) + eps
          self.w -= eta * (m_w_c/n_w)
          self.b -= eta * (m_b_c/n_b)
          self.append_log()      



In [0]:
X = np.asarray([3.5, 0.35, 3.2, -2.0, 1.5, -0.5])
Y = np.asarray([0.5, 0.50, 0.5,  0.5, 0.1,  0.3])

#algo = 'RMSProp'
algo = 'Adam'

w_init = -6
b_init = 4.0


epochs = 200
eta = 0.5
gamma = 0.9
mini_batch_size = 6
beta1 = 0.9
beta2 = 0.9

w_min = -7
w_max = 5
eps = 1e-8

b_min = -5
b_max = 5

animation_frames = 20

plot_2d = True
plot_3d = False

#Adgrad and NAG


X = np.asarray([3.5, 0.35, 3.2, -2.0, 1.5, -0.5])


Y = np.asarray([0.5, 0.50, 0.5,  0.5, 0.1,  0.3])


algo = 'NAG'

algo = 'AdaGrad'


w_init = -6
b_init = 4.0

epochs = 200
epochs = 100
eta = 0.5
gamma = 0.9
mini_batch_size = 6

w_min = -7
w_max = 5
eps = 1e-8

b_min = -5
b_max = 5

animation_frames = 20

plot_2d = True
plot_3d = False



#RmsProp and Adagrad


X = np.asarray([3.5, 0.35, 3.2, -2.0, 1.5, -0.5])
Y = np.asarray([0.5, 0.50, 0.5,  0.5, 0.1,  0.3])

algo = 'AdaGrad'
algo = 'RmsProp'


w_init = -6
b_init = 4.0


epochs = 100
eta = 0.1
gamma = 0.9
mini_batch_size = 6

w_min = -7
w_max = 5
eps = 1e-8

b_min = -5
b_max = 5

animation_frames = 20

plot_2d = True
plot_3

RmsProp does a weighted accumulation of old and current updates

#Adam and RmsProp
X = np.asarray([3.5, 0.35, 3.2, -2.0, 1.5, -0.5])
Y = np.asarray([0.5, 0.50, 0.5,  0.5, 0.1,  0.3])

algo = 'RMSProp'
algo = 'Adam'

w_init = -6
b_init = 4.0


epochs = 200
eta = 0.5
gamma = 0.9
mini_batch_size = 6
beta1 = 0.9
beta2 = 0.9

w_min = -7
w_max = 5
eps = 1e-8

b_min = -5
b_max = 5

animation_frames = 20

plot_2d = True
plot_3d = False

Adam Solves the drawback of RMSProp by having momentum component

In [124]:
sn = SN(w_init,b_init,algo)
sn.fit(X,Y,epochs = epochs, eta = eta,gamma=gamma,mini_batch_size=mini_batch_size,eps=eps,beta1=beta1,beta2=beta2)
plt.plot(sn.e_h,'r')
plt.plot(sn.w_h,'b')
plt.plot(sn.b_h,'g')
w_diff = [t - s for t, s in zip(sn.w_h, sn.w_h[1:])]
b_diff = [t - s for t, s in zip(sn.b_h, sn.b_h[1:])]
#plt.plot(w_diff, 'b--')
#plt.plot(b_diff, 'g--')
plt.show()

TypeError: ignored

In [0]:
def plot_animate_3d(i):
  i = int(i*(epochs/animation_frames))
  line1.set_data(sn.w_h[:i+1], sn.b_h[:i+1])
  line1.set_3d_properties(sn.e_h[:i+1])
  line2.set_data(sn.w_h[:i+1], sn.b_h[:i+1])
  line2.set_3d_properties(np.zeros(i+1) - 1)
  title.set_text('Epoch: {: d}, Error: {:.4f}'.format(i, sn.e_h[i]))
  return line1, line2, title

In [0]:
if plot_3d: 
  W = np.linspace(w_min, w_max, 256)
  b = np.linspace(b_min, b_max, 256)
  WW, BB = np.meshgrid(W, b)
  Z = sn.error(X, Y, WW, BB)

  fig = plt.figure(dpi=100) #no. of dots in plot  
  ax = fig.gca(projection='3d')
  surf = ax.plot_surface(WW, BB, Z, rstride=3, cstride=3, alpha=0.5, cmap=cm.coolwarm, linewidth=0, antialiased=False)
  cset = ax.contourf(WW, BB, Z, 25, zdir='z', offset=-1, alpha=0.6, cmap=cm.coolwarm)
  ax.set_xlabel('w')
  ax.set_xlim(w_min - 1, w_max + 1)
  ax.set_ylabel('b')
  ax.set_ylim(b_min - 1, b_max + 1)
  ax.set_zlabel('error')
  ax.set_zlim(-1, np.max(Z))
  ax.view_init (elev=25, azim=-75) # azim = -20
  ax.dist=12  
  title = ax.set_title('Epoch 0')

We want to move from dark red to blue valley

In [0]:
if plot_3d: 
  i = 0
  line1, = ax.plot(sn.w_h[:i+1], sn.b_h[:i+1], sn.e_h[:i+1], color='black',marker='.')
  line2, = ax.plot(sn.w_h[:i+1], sn.b_h[:i+1], np.zeros(i+1) - 1, color='red', marker='.')
  anim = animation.FuncAnimation(fig, func=plot_animate_3d, frames=animation_frames)
  rc('animation', html='jshtml')
  anim

In [0]:
if plot_2d: 
  W = np.linspace(w_min, w_max, 256)
  b = np.linspace(b_min, b_max, 256)
  WW, BB = np.meshgrid(W, b)
  Z = sn.error(X, Y, WW, BB)

  fig = plt.figure(dpi=100)
  ax = plt.subplot(111)
  ax.set_xlabel('w')
  ax.set_xlim(w_min - 1, w_max + 1)
  ax.set_ylabel('b')
  ax.set_ylim(b_min - 1, b_max + 1)
  title = ax.set_title('Epoch 0')
  cset = plt.contourf(WW, BB, Z, 25, alpha=0.6, cmap=cm.bwr)
  plt.show()

In [0]:
def plot_animate_2d(i):
  i = int(i*(epochs/animation_frames))
  line.set_data(sn.w_h[:i+1], sn.b_h[:i+1])
  title.set_text('Epoch: {: d}, Error: {:.4f}'.format(i, sn.e_h[i]))
  return line, title

In [0]:
if plot_2d: 
  i = 0
  line, = ax.plot(sn.w_h[:i+1], sn.b_h[:i+1], color='black',marker='.')
  anim = animation.FuncAnimation(fig, func=plot_animate_2d, frames=animation_frames)
  rc('animation', html='jshtml')
  anim

In [0]:
if algo == 'GD':
  print('algo = {}, eta = {}'.format(algo, eta))
elif algo == 'Momentum' or algo == 'NAG':
  print('algo = {}, eta = {}, gamma = {}'.format(algo, eta, gamma))
elif algo == 'MiniBatch':
  print('algo = {}, eta = {}, batch size = {}'.format(algo, eta, mini_batch_size))
elif algo == 'AdaGrad' or algo == 'RMSProp':
  print('algo = {}, eta = {}, eps = {}'.format(algo, eta, eps))
anim

GD and Momentum Inputs

X = np.asarray([0.5,2.5])

Y = np.asarray([0.2,0.9])

Gradient descent
#(W_init = -2, B_init = -2, epochs = 1000, algo 'GD' )
#(W_init = -3, B_init = 4, epochs = 1000, algo 'GD' )
#(W_init = -5, B_init = 4, epochs = 1000, algo 'GD' )
#(W_init = -4, B_init = 0, epochs = 1000, algo 'GD' )
#(W_init = -4, B_init = 0, epochs = 5000, algo 'GD' )
#(W_init = -4, B_init = 0, epochs = 10000, algo 'GD' )
Will increasing the epochs of training solve the problem of getting stuck at local optima?

Case1: Surely with more epochs the solution will move out of local optima and move to global optima.

Case2 : No, since the gradients are infinitesimal (very small) therefore the weights will not be updated irrespective number of epochs.

Ans : Both cases are possible it entire depends upon the nature of the surface where the solution is stuck. 

Drawback :
1. In regions of low gradients, its not able to reinforce the direction it should move. So, we move to momentum. 


Momentum

Drawback: 
#(W_init = -4, B_init = 0, epochs = 1000, algo 'Momentum' )
1. It causes overshooting with large and continuous increase in gradient along a direction. (overshoot and comeback)

#(W_init = -4, B_init = 0, epochs = 1000, algo 'Momentum', gamma = 0.8 )
2. It controls the speed of momentum


#(W_init = -3, B_init = 4, epochs = 1000, algo 'GD' )
#(W_init = -3, B_init = 4, epochs = 1000, algo 'Momentum' )
3. Overshooting Problem. In case of too many contour lines GD works better than momentum.

NAG 

Inputs:

X = np.asarray([3.5, 0.35, 3.2])

Y = np.asarray([0.49, 0.53, 0.52])

#(W_init = 2.1, B_init = 4.0, epochs = 1000, algo 'Momentum' )
1. In valley Oscillations are very large. 
#(W_init = 2.1, B_init = 4.0, epochs = 1000, algo 'NAG' )
2. Faster convergance and Reduced Overshoots.

GD and MiniBatch GD

Inputs for mini and adagrad

X = np.asarray([3.5, 0.35, 3.2, -2.0, 1.5, -0.5])

Y = np.asarray([0.5, 0.50, 0.5,  0.5, 0.1,  0.3])

mini_batch_size = 1
gamma = 0.9

#(W_init = 2.1, B_init = 4.0, epochs = 1000, algo 'MiniBatch' )
1. Lots of oscillations that filled color green.

All same 

mini_batch_size = 2
2. Lesser magnitude of oscillations or noise



mini_batch_size = 3
3. Lesser magnitude of oscillations or noise



mini_batch_size = 6
4. It becomes case of GD


5. If we make more updates then we need fewer number of epochs.
6. If I have large no. of data that probably may not fit, so it is better to divide data into batches. 


Adagrad

#(W_init = -6, B_init = 4, epochs = 1000, algo 'NAG' )
#(W_init = -6, B_init = 4, epochs = 1000, algo 'AdaGrad' )