Inspired from the following video: https://www.youtube.com/watch?v=Uz3B9fVb4LQ

# 0. Import Python libraries

In [2]:
#system
import sys
import time
#math
import numpy as np
import sympy as sy
#data
import pandas as pd
#vis
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# 1. $f$, $\nabla f$ and $\nabla^2 f$ definition

In [3]:
"""This section enables to express f (objective function), its gradient vector and hessian matrix symbolically and evaluate them at specific coordinates.
To do so, express analititically f by indexing the symbolic variable x. For instance x[0] represent first dimension, x[1] the second and so on.
grad_f_exp and hess_f_exp are only there to automatically differentiate f_exp. To evaluate the f, grad_f or hess_f functions, call them precising the desirated coordinates.
This evaluation use lambdify function and uses "numpy" format.

    Parameters
    ----------
    x : sympy.IndexedBase
        symbolic variable managing indexes. For instance, instead of declaring n variable for the n dimensions (x, y, z, ...) which isn't easily scalable, 
        here variables are automaticly indexed (x_1, x_2, ..., x_n).

    xk : np.array (float)
        Coordinates to evaluate the function.

    Returns
    -------
    numpy.float (scalar or numpy.array)
        Depending on the considered function, return a scalar, a (n, ) vector or a (n, n) array.

    Notes
    ------
    The use of specific function to express f, grad_f and hess_f is to improve scalability in the case of not having analitic expression.
    For instance, when only have a black box model which evaluate f, grad_f (and hess_f). 

    References
    ------
    https://docs.sympy.org/latest/index.html

    Examples
    ------
    >>>x = sy.IndexedBase('x')
    >>>xk = [-4, 3]
    >>>dk = grad_f(x, xk)
    """

def f_exp(x):
    return 3*x[0]**2 + 2*x[1]**2 + 20*sy.cos(x[0])*sy.cos(x[1])+40

def f(x, xk):
    return sy.lambdify(x, f_exp(x), "numpy")(xk)

def grad_f_exp(x, xk):
    return [sy.diff(f_exp(x), x[i]) for i in range(len(xk))]

def grad_f(x, xk):
    lambdify = [sy.lambdify(x, gf, "numpy") for gf in grad_f_exp(x, xk)]
    return np.array([lambdify[i](xk) for i in range(len(xk))])

def hess_f_exp(x, xk):
    return [[sy.diff(g, x[i]) for i in range(len(xk))] for g in grad_f_exp(x, xk)]

def hess_f(x, xk):
    lambdify = [[sy.lambdify(x, gf, "numpy") for gf in Hs] for Hs in hess_f_exp(x, xk)]
    return np.array([[lambdify[i][j](xk) for i in range(len(xk))] for j in range(len(xk))])

# 2. Armijo rule

In [4]:
class Steepest_descent_constant():
    """Steepest_descent_constant is a class implementing the Steepest_descent algorithm using a constant learning rate. It has 4 functions in addition to the __init__ function.
    - fit(): is the main function, ensuring the convergency and implementing the steepest descent algorithm.
    - stop_pos(), stop_grad() and stop_func() are respectively criteria for stopping on position, gradient and function. 
    Using absolute norm to compare all values to epsillon (hyperparameter).

    Notes
    ------
    The class respect a commun pattern for all gradient descent classes. To evaluate descent step call the function fit().

    References
    ------
    J. F. Bonnans, J. C. Gilbert, C. Lemaréchal, C. A. Sagastizábal, 2006. Numerical Optimization: Theoretical and Practical Aspects. https://doi.org/10.5860/choice.41-0357
    https://github.com/scikit-learn/scikit-learn

    Examples
    ------
    >>>gradsteepest = Steepest_descent_constant(x, a, x0, b=b)
    >>>xk, dk, fk = gradsteepest.fit()
    """
    def __init__(
        self,
        x,
        a,
        x0,
        *,
        b=1,
        stop='gradient',                 
        epsilon=.001,
        max_it=50,
        ):
        """
        Parameters
        ----------
        x : sympy.IndexedBase
            symbolic variable managing indexes
        a : float
            hyperparameter defining constante learning rate
        x0 : list(float)
            hyperparameter defining initial point for gradient descent
        b : int, optional
            hyperparameter defining learning rate damping factor, by default 1
        stop : str, optional
            hyperparameter defining stopping criteria ("gradient", "position", "function"), by default 'gradient'
        epsilon : float, optional
            hyperparameter defining stopping criteria threshold, by default .001
        max_it : int, optional
            hyperparameter defining maximum of iteration, by default 50
        """
        self.x = x
        self.a = a
        self.x0 = x0
        self.b = b
        self.stop = stop
        self.epsilon = epsilon
        self.max_it = max_it

        #init
        self.xk = np.array([x0])
        self.fk = np.array([f(x, x0)])
        self.dk = np.array([[np.inf, np.inf]])      #Init at np.inf to ensure not to converge at iteration 1
        
    def stop_pos(Steepest_descent_constant):
       if (np.abs(Steepest_descent_constant.xk[-1]-Steepest_descent_constant.xk[-2]) <= Steepest_descent_constant.epsilon).all():
          print(f'xk converged: {np.abs(Steepest_descent_constant.xk[-1]-Steepest_descent_constant.xk[-2])} < {Steepest_descent_constant.epsilon}')
          return True

    def stop_grad(Steepest_descent_constant):
        if (np.abs(Steepest_descent_constant.dk[-1]-Steepest_descent_constant.dk[-2]) <= Steepest_descent_constant.epsilon).all():
          print(f'grad converged: {np.abs(Steepest_descent_constant.dk[-1]-Steepest_descent_constant.dk[-2])} < {Steepest_descent_constant.epsilon}')
          return True
        
    def stop_func(Steepest_descent_constant):
        if (np.abs(Steepest_descent_constant.fk[-1]-Steepest_descent_constant.fk[-2]) <= Steepest_descent_constant.epsilon).all():
          print(f'func converged: {np.abs(Steepest_descent_constant.fk[-1]-Steepest_descent_constant.fk[-2])} < {Steepest_descent_constant.epsilon}')
          return True
        
    def fit(Steepest_descent_constant):
        """
        First compute new direction at coordinates xk, then compute new position and finally evaluate f at new position.
        Iterate until convergency criteria is met or maximum iteration is reached.

        Returns
        -------
        list(np.array(, n), np.array(, n), np.array (, n)): [xk, dk, fk]
            Returns position, gradient and f value for each iterations.
        """
        it=0
        stop_criterion = {"position": Steepest_descent_constant.stop_pos, 
                            "gradient": Steepest_descent_constant.stop_grad, 
                            "function": Steepest_descent_constant.stop_func}

        #init
        #compute descent direction
        Steepest_descent_constant.dk = np.append(Steepest_descent_constant.dk, [- grad_f(Steepest_descent_constant.x, Steepest_descent_constant.xk[-1])/np.linalg.norm(grad_f(Steepest_descent_constant.x, Steepest_descent_constant.xk[-1]))], axis=0)
        #update new point
        Steepest_descent_constant.xk = np.append(Steepest_descent_constant.xk, [Steepest_descent_constant.x0 + Steepest_descent_constant.a*Steepest_descent_constant.dk[-1]], axis=0)
        #evaluate objectif function
        Steepest_descent_constant.fk = np.append(Steepest_descent_constant.fk, [f(Steepest_descent_constant.x, Steepest_descent_constant.xk[-1])], axis=0)
        
        while (it <= Steepest_descent_constant.max_it):
            Steepest_descent_constant.dk = np.append(Steepest_descent_constant.dk, [- grad_f(Steepest_descent_constant.x, Steepest_descent_constant.xk[-1])/np.linalg.norm(grad_f(Steepest_descent_constant.x, Steepest_descent_constant.xk[-1]))], axis=0)
            Steepest_descent_constant.xk = np.append(Steepest_descent_constant.xk, [Steepest_descent_constant.xk[-1] + Steepest_descent_constant.a*Steepest_descent_constant.b**it*Steepest_descent_constant.dk[-1]], axis=0)
            Steepest_descent_constant.fk = np.append(Steepest_descent_constant.fk, [f(Steepest_descent_constant.x, Steepest_descent_constant.xk[-1])], axis=0)
            
            #convergency test
            if stop_criterion[Steepest_descent_constant.stop]():
                print(f'converged at it: {it}')
                break
                
            it += 1
        print(f'not converged')
        return [Steepest_descent_constant.xk, Steepest_descent_constant.dk, Steepest_descent_constant.fk]

In [5]:
#symbolic variable definition
x = sy.IndexedBase('x')

#Steepest_descent_constant hyperparameters
a = .5              #learning rate
b = .95             #learning rate damping factor 
x0 = [-3, 4]
lim = [[-5, 5], [-5, 5]]

#Steepest_descent_constant fit
start = time.time()
gradsteepest = Steepest_descent_constant(x, a, x0, b=b)
xk, dk, fk = gradsteepest.fit()

#print output
print(f"t: {time.time() - start} s")
print(f"x*: {xk[-1]}")
print(f"grad*: {dk[-1]}")
print(f"f*: {fk[-1]}")

not converged
t: 0.33785390853881836 s
x*: [9.38471883e-04 2.59474606e+00]
grad*: [0.44420502 0.89592517]
f*: 36.38205387986426


# 3. Visualization

In [6]:
#
def visualization(lim, x, xk, fk, x_idx=0, y_idx=1):
    """visualization is a function, using plotly to represente dynamicaly the optimization iterations steps on f. Pandas df are used to facilitate plotly usage.

    Parameters
    ----------
    lim : list(n, 2)
        limite du domaine d'optimisation pour chachune des dimensions
    x : sympy.IndexedBase
        symbolic variable managing indexes.
    xk : nd array(float)
        array of xk iterations
    fk : nd array(float)
        array of evaluation of f at xk iterations
    x_idx : int, optional
        index of the first dimension, by default 0
    y_idx : int, optional
        index of the second dimension, by default 1

    References
    ------
    https://plotly.com/python/

    Examples
    ------
    >>>x_idx = 0
    >>>y_idx = 1
    >>>visualization(lim, x, xk, fk, x_idx, y_idx)
    """
    if x_idx >= np.shape(xk)[1] or y_idx >= np.shape(xk)[1]:
        print('---------- Dimension out of bound, please choose right dimensions ----------')
        sys.exit()
    #Surface plot
    X = np.linspace(lim[0][0], lim[0][1], 100)
    Y = np.linspace(lim[1][0], lim[1][1], 100)
    mesh_X, mesh_Y = np.meshgrid(X, Y)
    Z = f(x,[mesh_X,mesh_Y])

    #gradient descent plot
    grad_desc = pd.DataFrame(xk, columns=['x', 'y'])
    grad_desc['z'] = np.reshape(fk, (-1,1))

    fig = go.Figure()
    fig.add_surface(x=X, y=Y, z=Z)
    fig.update_traces(contours_z=dict(show=True, usecolormap=True,
                                    highlightcolor="limegreen", project_z=True))
    fig.update_layout(title='Optimization surface', autosize=False,
                    width=750, height=750
    )
    
    #iterations plot
    fig.add_scatter3d(
        x=grad_desc[grad_desc.columns[x_idx]],
        y=grad_desc[grad_desc.columns[y_idx]],
        z=grad_desc['z'],
        marker=dict(
            size=5,
            color=grad_desc['x'].index,
            colorscale='hot',
        ),
        line=dict(
            color='white',
            width=3
        ))
    
    fig.show()

In [7]:
x_idx = 0
y_idx = 1
visualization(lim, x, xk, fk, x_idx=x_idx, y_idx=y_idx)