# 4、RMSProp

RMSProp: Root Mean Square Propagation 均方根传播

RMSProp 在 AdaGrad 的基础上进一步改进了学习率的自适应方式：用梯度平方的指数加权移动平均代替直接累加，避免学习率随迭代次数增加而持续衰减到过小。

累积平方梯度：
$$ r \leftarrow \lambda r + (1 - \lambda)g^2 $$

权重更新：
$$ w \leftarrow w - \frac{\eta}{\sqrt{r+\delta}} \cdot g $$

其中， $g$为梯度， $r$为累积平方梯度(初始为0)， $\lambda$为衰减系数， $\eta$为学习率， $\delta$为小参数(避免分母为0)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# ---------------------------
# Configure Matplotlib so that Chinese text in titles/labels can be rendered:
# list sans-serif fonts that are tried in order, and disable the Unicode
# minus sign for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['Arial Unicode MS', 'SimHei'],
    'axes.unicode_minus': False,
})
# ---------------------------

def loss_function(x, y):
    """Return the quadratic loss f(x, y) = x^2 + 10 * y^2.

    Only ``**``, ``*`` and ``+`` are applied to the arguments, so any
    operands supporting those operators (e.g. scalars) are accepted.
    """
    x_term = x**2
    y_term = 10 * y**2
    return x_term + y_term

# Shared starting point for both optimizers
x_start, y_start = 40.0, 20.0

# RMSProp hyper-parameters
eta = 0.5         # learning rate
decay_rate = 0.9  # decay coefficient (lambda) of the squared-gradient average
epsilon = 1e-10   # small constant that keeps the denominator away from zero

num_epochs = 50

# 1. Plain gradient descent (no RMSProp) - baseline for comparison.
#    The gradient of f(x, y) = x^2 + 10*y^2 is (2x, 20y).
gd_lr = 0.05
x, y = x_start, y_start
result_gd = [(x, y)]

for epoch in range(num_epochs):
    grads_x, grads_y = 2 * x, 20 * y

    x = x - gd_lr * grads_x
    y = y - gd_lr * grads_y
    result_gd.append((x, y))

# 2. RMSProp
x_rms, y_rms = x_start, y_start
r_x = r_y = 0  # accumulated squared gradients, one per coordinate
result_rms = [(x_rms, y_rms)]

for epoch in range(num_epochs):
    # Gradient at the current point
    g_x, g_y = 2 * x_rms, 20 * y_rms

    # Accumulate squared gradients: r <- lambda * r + (1 - lambda) * g^2
    r_x = decay_rate * r_x + (1 - decay_rate) * g_x**2
    r_y = decay_rate * r_y + (1 - decay_rate) * g_y**2

    # Per-coordinate step size: eta / sqrt(r + epsilon)
    scale_x = eta / np.sqrt(r_x + epsilon)
    scale_y = eta / np.sqrt(r_y + epsilon)

    # Parameter update: w <- w - eta / sqrt(r + epsilon) * g
    x_rms = x_rms - scale_x * g_x
    y_rms = y_rms - scale_y * g_y

    result_rms.append((x_rms, y_rms))

# --- 2D plot: both trajectories drawn over the loss contours ---

# Split each trajectory into per-axis coordinate lists
x_traj_gd = [p[0] for p in result_gd]
y_traj_gd = [p[1] for p in result_gd]

x_traj_rms = [p[0] for p in result_rms]
y_traj_rms = [p[1] for p in result_rms]

# Evaluation grid for the loss surface (also reused by the 3D plot)
X_range = np.arange(-50, 50, 0.5)
Y_range = np.arange(-30, 30, 0.5)
X, Y = np.meshgrid(X_range, Y_range)
Z = loss_function(X, Y)

plt.figure(figsize=(10, 6))

# Contour lines of the loss
contour = plt.contour(X, Y, Z, levels=30, cmap='viridis', alpha=0.6)
plt.clabel(contour, inline=True, fontsize=8)

# Optimization trajectories
plt.plot(x_traj_gd, y_traj_gd, 'r--o', label=f'GD (lr={gd_lr})', alpha=0.5, markersize=4)
# Raw f-string: "\l" is not a valid escape sequence in a normal string
# literal (it triggers a SyntaxWarning on recent Python versions); the raw
# prefix keeps the backslash for mathtext and yields the same label text.
plt.plot(x_traj_rms, y_traj_rms, 'b-d', label=rf'RMSProp (lr={eta}, $\lambda$={decay_rate})', linewidth=2)

# Start point and optimum
plt.plot(x_start, y_start, 'go', markersize=10, label='Start')
plt.plot(0, 0, 'k*', markersize=15, label='Optimal (0,0)')

plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
# Plain string (no placeholders, so no f-prefix); "\n" is a real line break.
plt.title('RMSProp vs GD 优化轨迹对比\n函数: $f(x)=x_1^2 + 10x_2^2$', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# --- 3D plot: both trajectories drawn on the loss surface ---

# Loss value at every point of each trajectory
z_traj_gd = [loss_function(p[0], p[1]) for p in result_gd]
z_traj_rms = [loss_function(p[0], p[1]) for p in result_rms]

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

# Loss surface
surf = ax.plot_surface(X, Y, Z, cmap='viridis', alpha=0.4, edgecolor='none')

# Optimization trajectories
ax.plot(x_traj_gd, y_traj_gd, z_traj_gd, 'r--', linewidth=1, label='GD Path')
ax.plot(x_traj_rms, y_traj_rms, z_traj_rms, 'b-', linewidth=3, label='RMSProp Path')

# Markers for the start point and the optimum
ax.scatter([x_start], [y_start], [loss_function(x_start, y_start)], c='g', s=60, label='Start', zorder=20)
ax.scatter([0], [0], [0], c='k', marker='*', s=150, label='Optimal', zorder=20)

ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.set_zlabel('Loss')
ax.set_title('3D View: RMSProp 能够有效地调整学习步伐', fontsize=14)
ax.view_init(elev=30, azim=120)

fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5, label='Loss')
# Attach the legend to the 3D axes explicitly: the colorbar call above adds
# a second axes to the figure, so do not rely on pyplot's "current axes".
ax.legend()
plt.show()