# 5、Adam

在 Gradient Descent 的基础上，做了如下几个方面的改进：

1、梯度方面增加了 momentum，使用累积梯度： $v \leftarrow \alpha v + (1 - \alpha)g$

2、同 RMSProp 优化算法一样，对学习率进行优化，使用累积平方梯度： $r \leftarrow \lambda r + (1 - \lambda)g^2$

3、偏差纠正： $\hat{v} = \frac{v}{1-\alpha^t}$, $\hat{r} = \frac{r}{1-\lambda^t}$

在如上3点改进的基础上，权重更新： $w \leftarrow w - \frac{\eta}{\sqrt{\hat{r}+\delta}} * \hat{v}$

<br>

> **<font color="green">| 为啥要偏差纠正</font>**
>
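> 第1次更新时， $v_1 \leftarrow \alpha v_0 + (1 - \alpha)g_1$，由于 $v_0$ 的初始值是0，且 $\alpha$ (即 $\beta_1$ )值一般会设置为接近于1，所以 $v_1 = (1 - \alpha)g_1$ 远小于 $g_1$，也就是说 $t$ 较小时， $v$ 的值是偏向于0的（ $r$ 同理）。除以 $1-\alpha^t$ 之后， $\hat{v}_1 = g_1$，这一偏差被抵消；随着 $t$ 增大， $1-\alpha^t \to 1$，纠正项的影响逐渐消失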

```python
def adam(learning_rate, beta1, beta2, epsilon, var, grad, m, v, t):
    """Apply a single Adam update and return the new state.

    Args:
        learning_rate: Step size multiplying the normalised update.
        beta1: Decay rate of the running average of the gradient.
        beta2: Decay rate of the running average of the squared gradient.
        epsilon: Constant added to the denominator after the square root.
        var: Current value of the variable being optimised.
        grad: Gradient of the objective with respect to ``var``.
        m: Running average of the gradient from the previous step.
        v: Running average of the squared gradient from the previous step.
        t: Step counter used in the bias correction; with ``t == 0`` both
            correction denominators are zero.

    Returns:
        A tuple ``(var, m, v)`` holding the updated variable and the two
        updated (not bias-corrected) running averages.
    """
    # Exponential moving averages of the gradient and of its square.
    first_moment = beta1 * m + (1 - beta1) * grad
    second_moment = beta2 * v + (1 - beta2) * grad * grad

    # Both averages start from the caller-supplied m and v; the division
    # rescales them by 1 - beta**t before they are used in the step.
    first_unbiased = first_moment / (1 - beta1 ** t)
    second_unbiased = second_moment / (1 - beta2 ** t)

    step = learning_rate * first_unbiased / (np.sqrt(second_unbiased) + epsilon)
    return var - step, first_moment, second_moment
```

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# ---------------------------
# Configure Matplotlib so CJK text in titles/labels renders correctly.
plt.rcParams.update({
    # Fonts tried in order for sans-serif text.
    'font.sans-serif': ['Arial Unicode MS', 'SimHei'],
    # Draw the minus sign as an ASCII hyphen instead of the Unicode minus.
    'axes.unicode_minus': False,
})
# ---------------------------

def loss_function(x, y):
    """Return the quadratic loss ``x**2 + 10 * y**2``.

    The expression uses only ``**``, ``*`` and ``+``, so it works for plain
    numbers and for any operands that support those operators elementwise.
    """
    x_term = x**2
    y_term = 10 * y**2
    return x_term + y_term

# Starting point shared by all three optimizers.
x_start = 40.0
y_start = 20.0

# Adam hyperparameters.
lr = 0.5         # learning rate
beta1 = 0.9      # decay rate of the first-moment (momentum) average
beta2 = 0.999    # decay rate of the second-moment (RMSProp-style) average
epsilon = 1e-8   # added to the denominator of the Adam step

# Number of update steps every optimizer takes.
num_epochs = 50

# 1. RMSProp (baseline for comparison)
rms_lr = 0.5
decay_rate = 0.9
x_rms, y_rms = x_start, y_start
r_x = r_y = 0  # decayed running averages of the squared gradients
result_rms = [(x_rms, y_rms)]

for _ in range(num_epochs):
    # Partial derivatives of x**2 + 10 * y**2 at the current point.
    g_x, g_y = 2 * x_rms, 20 * y_rms

    # Blend the new squared gradient into each running average.
    r_x = decay_rate * r_x + (1 - decay_rate) * g_x**2
    r_y = decay_rate * r_y + (1 - decay_rate) * g_y**2

    # Per-coordinate step: learning rate divided by the root of the average.
    x_rms = x_rms - (rms_lr / np.sqrt(r_x + 1e-10)) * g_x
    y_rms = y_rms - (rms_lr / np.sqrt(r_y + 1e-10)) * g_y
    result_rms.append((x_rms, y_rms))

# 2. Adagrad (baseline for comparison)
ada_lr = 1.5
x_ada, y_ada = x_start, y_start
r_x_ada = r_y_ada = 0  # running sums of the squared gradients
result_ada = [(x_ada, y_ada)]

for _ in range(num_epochs):
    g_x, g_y = 2 * x_ada, 20 * y_ada

    # Unlike RMSProp above, the squared gradients are summed without decay.
    r_x_ada = r_x_ada + g_x ** 2
    r_y_ada = r_y_ada + g_y ** 2

    x_ada = x_ada - (ada_lr / np.sqrt(r_x_ada + 1e-10)) * g_x
    y_ada = y_ada - (ada_lr / np.sqrt(r_y_ada + 1e-10)) * g_y
    result_ada.append((x_ada, y_ada))

# 3. Adam
x_adam, y_adam = x_start, y_start
m_x = m_y = 0  # first-moment estimates (momentum)
v_x = v_y = 0  # second-moment estimates (RMSProp-style)
result_adam = [(x_adam, y_adam)]

for t in range(1, num_epochs + 1):
    g_x, g_y = 2 * x_adam, 20 * y_adam

    # Biased first-moment estimates.
    m_x = beta1 * m_x + (1 - beta1) * g_x
    m_y = beta1 * m_y + (1 - beta1) * g_y

    # Biased second raw-moment estimates.
    v_x = beta2 * v_x + (1 - beta2) * g_x**2
    v_y = beta2 * v_y + (1 - beta2) * g_y**2

    # Bias correction: the averages start at zero, so they are rescaled by
    # 1 - beta**t, which depends only on the step counter t.
    m_correction = 1 - beta1**t
    v_correction = 1 - beta2**t
    m_hat_x = m_x / m_correction
    m_hat_y = m_y / m_correction
    v_hat_x = v_x / v_correction
    v_hat_y = v_y / v_correction

    # Parameter update; epsilon is added after the square root here.
    x_adam = x_adam - lr * m_hat_x / (np.sqrt(v_hat_x) + epsilon)
    y_adam = y_adam - lr * m_hat_y / (np.sqrt(v_hat_y) + epsilon)

    result_adam.append((x_adam, y_adam))

# --- 2D plot ---

# Split each list of (x, y) points into separate x and y coordinate lists.
x_traj_rms, y_traj_rms = map(list, zip(*result_rms))
x_traj_ada, y_traj_ada = map(list, zip(*result_ada))
x_traj_adam, y_traj_adam = map(list, zip(*result_adam))

# Evaluation grid (range kept fairly tight so the trajectories stay readable).
X_range = np.arange(-10, 50, 0.5)
Y_range = np.arange(-10, 30, 0.5)
X, Y = np.meshgrid(X_range, Y_range)
Z = loss_function(X, Y)

plt.figure(figsize=(12, 7))

# Labelled contour lines of the loss.
contour = plt.contour(X, Y, Z, levels=50, cmap='viridis', alpha=0.5)
plt.clabel(contour, inline=True, fontsize=8)

# One line per optimizer, drawn in this order so the legend order is stable.
for xs, ys, fmt, style in (
    (x_traj_rms, y_traj_rms, 'g--d', dict(label='RMSProp', alpha=0.6, markersize=5)),
    (x_traj_ada, y_traj_ada, 'b--o', dict(label='Adagrad', alpha=0.4, markersize=5)),
    (x_traj_adam, y_traj_adam, 'r-D', dict(label='Adam', linewidth=2, markersize=5)),
):
    plt.plot(xs, ys, fmt, **style)

# Mark the shared starting point and the minimum at the origin.
plt.plot(x_start, y_start, 'ko', markersize=10, label='Start')
plt.plot(0, 0, 'k*', markersize=15, label='Optimal (0,0)')

plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.title('Adam vs RMSProp vs Adagrad 优化轨迹对比', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)

# Zoom in on the region of interest.
plt.xlim(-5, 45)
plt.ylim(-5, 25)

plt.show()

# --- 3D plot ---

# Loss value at every point along each optimizer's trajectory.
z_traj_rms = [loss_function(px, py) for px, py in result_rms]
z_traj_ada = [loss_function(px, py) for px, py in result_ada]
z_traj_adam = [loss_function(px, py) for px, py in result_adam]

fig = plt.figure(figsize=(14, 9))
ax = fig.add_subplot(111, projection='3d')

# Semi-transparent loss surface.
surf = ax.plot_surface(X, Y, Z, cmap='viridis', alpha=0.3, edgecolor='none')

# Trajectories; only Adam gets an explicit zorder and a thicker line.
for xs, ys, zs, fmt, style in (
    (x_traj_rms, y_traj_rms, z_traj_rms, 'g--', dict(linewidth=2, label='RMSProp')),
    (x_traj_ada, y_traj_ada, z_traj_ada, 'b--', dict(linewidth=2, label='Adagrad')),
    (x_traj_adam, y_traj_adam, z_traj_adam, 'r-', dict(linewidth=3, label='Adam', zorder=10)),
):
    ax.plot(xs, ys, zs, fmt, **style)

# Markers for the starting point and the minimum at the origin.
start_loss = loss_function(x_start, y_start)
ax.scatter([x_start], [y_start], [start_loss], c='k', s=60, label='Start')
ax.scatter([0], [0], [0], c='k', marker='*', s=150, label='Optimal')

ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.set_zlabel('Loss')
ax.set_title('3D View: Adam 结合了 Momentum 和 RMSProp 的优势', fontsize=14)
ax.view_init(elev=35, azim=130)

fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5, label='Loss')
plt.legend()
plt.show()