In [1]:
import numpy as np
import tensorflow as tf
# Shared step size for the optimizer examples (1e-3 == 0.001).
learning_rate = 1e-3

# 随机梯度下降
- 为了最小化损失函数，梯度下降计算损失函数对参数向量的梯度，并向梯度下降的方向改变参数；
- 随机梯度下降，每一步从训练数据中随机选择一个样本，用来计算梯度
$$\theta\leftarrow\theta-\eta \nabla_\theta J(\theta)$$

In [2]:
class SGD:
    """Plain stochastic gradient descent: ``theta <- theta - lr * grad``."""

    def __init__(self, lr=0.01):
        """Store the learning rate ``lr`` used for every update step."""
        self.lr = lr

    def update(self, params, grads):
        """Apply one descent step to every entry of ``params``.

        Args:
            params: Mapping from parameter name to its current value. Each
                entry is updated with ``-=``.
            grads: Mapping indexed by the same keys, holding the gradient
                of each parameter.
        """
        for name in params.keys():
            step = self.lr * grads[name]
            params[name] -= step

In [4]:
# Keras SGD optimizer built with the shared `learning_rate`; binds `optimizer`.
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)

# Momentum
- 在梯度下降的基础上加入了动量；每一步将本地梯度添加到动量向量 $m$，加速训练

$$ m\leftarrow\beta m+\eta \nabla_\theta J(\theta)$$
$$\theta\leftarrow\theta-m$$

In [None]:
class Momentum:
    """SGD with momentum.

    Per-parameter update rule::

        v     <- momentum * v + lr * grad
        theta <- theta - v

    Args:
        lr: Learning rate.
        momemtum: Momentum coefficient. This misspelled keyword is the
            original one and is kept so existing calls keep working.
        momentum: Correctly spelled alias for ``momemtum``. When it is given
            (not ``None``) it takes precedence over ``momemtum``.
    """

    def __init__(self, lr=0.01, momemtum=0.9, momentum=None):
        self.lr = lr
        # The coefficient is stored under the original attribute name; the
        # ``momentum`` property below exposes the same value.
        self.momemtum = momemtum if momentum is None else momentum
        # Velocity per parameter, created lazily on the first update() call.
        self.v = None

    @property
    def momentum(self):
        """Momentum coefficient (correctly spelled view of ``momemtum``)."""
        return self.momemtum

    @momentum.setter
    def momentum(self, value):
        self.momemtum = value

    def update(self, params, grads):
        """Apply one momentum step to every entry of ``params``.

        Args:
            params: Mapping from parameter name to its current value. Each
                entry is updated with ``-=``.
            grads: Mapping indexed by the same keys, holding the gradient
                of each parameter.
        """
        if self.v is None:
            # Start every velocity at zero, shaped like its parameter.
            self.v = {key: np.zeros_like(val) for key, val in params.items()}

        for key in params.keys():
            self.v[key] = self.momemtum * self.v[key] + self.lr * grads[key]
            params[key] -= self.v[key]

In [5]:
# Keras SGD with classical momentum, matching the Momentum rule in this
# section. Pass nesterov=True instead to get the Nesterov variant.
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                    momentum=0.9,
                                    nesterov=False)

# Nesterov
- Nesterov 为了加速收敛，先按照之前的动量向前走一步，在该位置计算梯度，然后再按这个梯度更新参数
$$m\leftarrow\beta m+\eta \nabla_\theta J(\theta-\beta m)$$
$$\theta\leftarrow\theta-m$$

In [6]:
class Nestrov:
    """Nesterov accelerated gradient.

    Per-parameter update rule, where ``v`` is the velocity::

        v     <- momentum * v - lr * grad
        theta <- theta + momentum * v - lr * grad
    """

    def __init__(self, lr=0.01, momentum=0.9):
        """Store the learning rate and momentum coefficient."""
        self.lr = lr
        self.momentum = momentum
        # Velocity per parameter, created lazily on the first update() call.
        self.v = None

    def update(self, params, grads):
        """Apply one Nesterov step to every entry of ``params``.

        Args:
            params: Mapping from parameter name to its current value. Each
                entry is updated with ``+=``.
            grads: Mapping indexed by the same keys, holding the gradient
                of each parameter.
        """
        if self.v is None:
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params.keys():
            scaled_grad = self.lr * grads[name]
            self.v[name] = self.momentum * self.v[name] - scaled_grad
            # Uses the freshly updated velocity plus one more gradient step.
            params[name] += self.momentum * self.v[name] - scaled_grad

# Adagrad
- 自适应优化算法，通过每个参数的历史梯度，动态更新每一个参数的学习率，使得每个参数的更新率都能够逐渐减小。
    - 首先将梯度的平方累加进向量 $s$
    - 然后梯度向量缩小 $\sqrt{s+\epsilon}$，$\epsilon$ 是平滑项，为了避免除零发生，通常设置为 $10^{-10}$ 
    - 在较陡的方向，即梯度大的方向，降低更快
    $$s\leftarrow s+\nabla_\theta J(\theta) \otimes\nabla_\theta J(\theta)$$
$$\theta\leftarrow\theta-\eta\nabla_\theta J(\theta)\oslash\sqrt{s+\epsilon}$$
    - 在训练神经网络时，学习率往往衰减过快，导致在到达全局最优之前就过早停止训练
                   
          
- 其具体过程如下，对于任一参数 $w$:
    $$w^{t+1} = w^t - \frac{\eta^t}{\sigma^t} g^t$$
其中：$$\eta^t = \eta/\sqrt{t+1}$$   
$$g^t = \frac{\partial L(\theta ^t)}{\partial w}$$
$$\sigma^t  = \sqrt{\frac{1}{t+1}\sum_{i=0}^{t}(g^i)^2}$$
可得：
     
$\boxed{w^{t+1} = w^t - \frac{\eta}{\sqrt{\sum_{i=0}^{t}(g^i)^2}} g^t}$

即：
    
$w^{1} = w^0 - \frac{\eta}{\sqrt{(g^0)^2}} g^0$
      
$w^{2} = w^1 - \frac{\eta}{\sqrt{(g^0)^2+(g^1)^2}} g^1$

$w^{3} = w^2 - \frac{\eta}{\sqrt{(g^0)^2+(g^1)^2+(g^2)^2}} g^2$


对于二次方程：$y=ax^2+bx+c$，任一点 $x_0$，其最佳步长即为$|x_0+\frac{b}{2a}|$
   
由：$$\left|x_0+\frac{b}{2a}\right| = \left|\frac{2ax_0+b}{2a}\right| = \frac{\left|\frac{\partial y}{\partial x}\right|_{x=x_0}}{\frac{\partial^2y}{\partial x^2}|_{x=x_0}} $$
可得：任意点$x_0$一阶微分越大，二阶微分越小，其最佳步长越大


损失函数 $L$ 的二阶微分可以近似表示成历次一阶微分平方和的平方根 $\sqrt{\sum_{i=0}^{t}(g^i)^2}$。故 $\boxed{\frac{\eta}{\sqrt{\sum_{i=0}^{t}(g^i)^2}} g^t}$ 近似表示每次更新时的最佳步长

In [7]:
class AdaGrad:
    """AdaGrad: scales each step by the root of the accumulated squared gradients.

    Per-parameter update rule::

        h     <- h + grad * grad
        theta <- theta - lr * grad / (sqrt(h) + 1e-7)
    """

    def __init__(self, lr=0.01):
        """Store the base learning rate."""
        self.lr = lr
        # Sum of squared gradients per parameter, created on the first update().
        self.h = None

    def update(self, params, grads):
        """Apply one AdaGrad step to every entry of ``params``.

        Args:
            params: Mapping from parameter name to its current value. Each
                entry is updated with ``-=``.
            grads: Mapping indexed by the same keys, holding the gradient
                of each parameter.
        """
        if self.h is None:
            self.h = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params.keys():
            grad = grads[name]
            self.h[name] += grad * grad
            # The small constant keeps the division finite while h is zero.
            denom = np.sqrt(self.h[name]) + 1e-7
            params[name] -= self.lr * grad / denom

In [8]:
# Keras Adagrad optimizer with its hyperparameters spelled out explicitly;
# rebinds `optimizer`.
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.001,
                                        initial_accumulator_value=0.1,
                                        epsilon=1e-07,
                                        name='Adagrad')

# RMSprop
- `AdaGrad` 学习率会不断地衰退，在达到最优解之前学习率就已经太小了
- `RMSprop` 采用了使用指数衰减平均来慢慢丢弃先前的梯度历史，只累计最近迭代时的梯度，能够防止学习率过早地减小
$$s\leftarrow\beta s+(1-\beta)\nabla_\theta J(\theta) \otimes\nabla_\theta J(\theta)$$
$$\theta\leftarrow\theta-\eta\nabla_\theta J(\theta)\oslash\sqrt{s+\epsilon}$$

In [None]:
class RMSprop:
    """RMSprop: AdaGrad with an exponentially decaying squared-gradient average.

    Per-parameter update rule::

        h     <- decay_rate * h + (1 - decay_rate) * grad * grad
        theta <- theta - lr * grad / (sqrt(h) + 1e-7)
    """

    def __init__(self, lr=0.01, decay_rate=0.99):
        """Store the learning rate and the decay rate of the running average."""
        self.lr = lr
        self.decay_rate = decay_rate
        # Running average of squared gradients, created on the first update().
        self.h = None

    def update(self, params, grads):
        """Apply one RMSprop step to every entry of ``params``.

        Args:
            params: Mapping from parameter name to its current value. Each
                entry is updated with ``-=``.
            grads: Mapping indexed by the same keys, holding the gradient
                of each parameter.
        """
        if self.h is None:
            self.h = {name: np.zeros_like(value) for name, value in params.items()}

        keep = self.decay_rate
        for name in params.keys():
            grad = grads[name]
            # Decay the old average, then blend in the new squared gradient.
            self.h[name] *= keep
            self.h[name] += (1 - keep) * grad * grad
            params[name] -= self.lr * grad / (np.sqrt(self.h[name]) + 1e-7)

In [10]:
# Keras RMSprop optimizer with its hyperparameters spelled out explicitly;
# rebinds `optimizer`.
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001,
                            rho=0.9,
                            momentum=0.0,
                            epsilon=1e-07,
                            centered=False,
                            name='RMSprop')

# Adam 
- `Adam` 组合了 `Momentum` 和 `RMSProp` 两种算法思想
$$m\leftarrow\beta_1 m+(1-\beta_1)\nabla_\theta J(\theta)$$
$$s\leftarrow\beta_2 s+(1-\beta_2)\nabla_\theta J(\theta) \otimes\nabla_\theta J(\theta)$$
$$\hat{m}\leftarrow\frac{m}{1-\beta_1^T}$$
$$\hat{s}\leftarrow\frac{s}{1-\beta_2^T}$$
$$\theta\leftarrow\theta-\eta \hat{m}\oslash\sqrt{\hat{s}+\epsilon}$$
其中 $T$ 为当前迭代次数（从 1 开始）；$\hat{m}$、$\hat{s}$ 是偏差修正后的临时量，$m$、$s$ 本身不被覆盖。

In [13]:
class Adam:
    """Adam: momentum-style first moment plus RMSprop-style second moment.

    Bias correction of both moments is folded into a per-step learning rate
    ``lr_t = lr * sqrt(1 - beta2**t) / (1 - beta1**t)``.
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        """Store the learning rate and the two moment decay rates."""
        self.lr, self.beta1, self.beta2 = lr, beta1, beta2
        # Number of update() calls so far; drives the bias correction.
        self.iter = 0
        # First (m) and second (v) moment estimates, created on the first update().
        self.m = None
        self.v = None

    def update(self, params, grads):
        """Apply one Adam step to every entry of ``params``.

        Args:
            params: Mapping from parameter name to its current value. Each
                entry is updated with ``-=``.
            grads: Mapping indexed by the same keys, holding the gradient
                of each parameter.
        """
        if self.m is None:
            self.m = {name: np.zeros_like(value) for name, value in params.items()}
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        self.iter += 1
        first_correction = 1.0 - self.beta1**self.iter
        second_correction = 1.0 - self.beta2**self.iter
        lr_t = self.lr * np.sqrt(second_correction) / first_correction

        for name in params.keys():
            grad = grads[name]
            # Exponential moving averages, written as "move toward the new value".
            self.m[name] += (1 - self.beta1) * (grad - self.m[name])
            self.v[name] += (1 - self.beta2) * (grad**2 - self.v[name])

            params[name] -= lr_t * self.m[name] / (np.sqrt(self.v[name]) + 1e-7)

In [14]:
# Keras Adam optimizer with its hyperparameters spelled out explicitly;
# rebinds `optimizer`.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     beta_1=0.9,
                                     beta_2=0.999,
                                     epsilon=1e-07,
                                     amsgrad=False,
                                     name='Adam')