pytorch的优化器：管理并更新模型中可学习参数的值，使得模型输出更接近真实标签  
导数：函数在指定坐标轴上的变化率  
梯度：一个向量，方向为方向导数取得最大值的方向，模为该方向导数的最大值  

基本属性  
* defaults:优化器超参数  
* state:参数的缓存，如momentum的缓存  
* param_groups:管理的参数组  
* _step_count:记录更新次数，学习率调整中使用  

基本方法：  
* zero_grad():清空所管理参数的梯度  
PyTorch 特性：张量的梯度会不断累加，不会自动清零，因此每次更新后需要手动调用 zero_grad()  
* step():执行一步更新  
* add_param_group():添加参数组  
* state_dict():获取优化器当前状态信息字典  
* load_state_dict():加载状态信息字典（与 state_dict() 配合使用：训练中保存当前状态信息，防止训练因意外终止后无法恢复）

In [17]:
import os
import torch
import torch.optim as optim
# 2x2 learnable tensor standing in for a model parameter.
weight = torch.randn((2, 2), requires_grad=True)
# Assign an all-ones gradient by hand (no backward() call is made in this cell).
weight.grad = torch.ones((2, 2))
# Plain SGD that manages only `weight`, with learning rate 0.1.
optimizer = optim.SGD([weight], lr=0.1)

In [18]:
# Display the initial weight values, before any optimizer step has run.
weight.data 

tensor([[-0.9556, -0.0451],
        [-0.4985,  0.6649]])

In [19]:
# One SGD update: with grad == 1 and lr == 0.1, every entry drops by 0.1.
optimizer.step()
weight.data

tensor([[-1.0556, -0.1451],
        [-0.5985,  0.5649]])

In [20]:
# Start of the zero_grad demonstration. This cell only re-displays the weights;
# nothing was called since the previous cell, so the values are unchanged.
weight.data

tensor([[-1.0556, -0.1451],
        [-0.5985,  0.5649]])

In [21]:
# Step again: the gradient has not been cleared, so each entry drops by another 0.1.
# The two id() values are shown side by side; in the recorded output they are equal,
# i.e. the optimizer's param group refers to the same tensor object as `weight`.
optimizer.step()
weight.data, id(optimizer.param_groups[0]['params'][0]), id(weight)

(tensor([[-1.1556, -0.2451],
         [-0.6985,  0.4649]]),
 4758795120,
 4758795120)

In [22]:
# The gradient is still all ones here: step() updated the weights but did not clear it.
weight.grad

tensor([[1., 1.],
        [1., 1.]])

In [23]:
# Clear the gradients of every parameter the optimizer manages.
# In the recorded output the gradient becomes an all-zero tensor.
# NOTE(review): newer PyTorch releases default to set_to_none=True, in which case
# weight.grad would display None instead - confirm against the installed version.
optimizer.zero_grad()
weight.grad

tensor([[0., 0.],
        [0., 0.]])

In [24]:
# add_param_group demo, part 1: show param_groups before adding anything.
# At this point there is a single group holding `weight` with lr=0.1.
optimizer.param_groups

[{'params': [tensor([[-1.1556, -0.2451],
           [-0.6985,  0.4649]], requires_grad=True)],
  'lr': 0.1,
  'momentum': 0,
  'dampening': 0,
  'weight_decay': 0,
  'nesterov': False}]

In [25]:
# add_param_group demo, part 2: register a second tensor as its own group.
w2 = torch.randn((3, 3), requires_grad=True)
# Only 'lr' is overridden for the new group; in the recorded output the remaining
# hyperparameters (momentum, dampening, weight_decay, nesterov) match the first group.
optimizer.add_param_group({"params": w2, 'lr': 0.0001})
optimizer.param_groups

[{'params': [tensor([[-1.1556, -0.2451],
           [-0.6985,  0.4649]], requires_grad=True)],
  'lr': 0.1,
  'momentum': 0,
  'dampening': 0,
  'weight_decay': 0,
  'nesterov': False},
 {'params': [tensor([[ 0.3344, -0.1886, -0.2457],
           [ 1.4255,  0.6913, -0.3347],
           [ 1.0530,  0.7267, -1.8213]], requires_grad=True)],
  'lr': 0.0001,
  'momentum': 0,
  'dampening': 0,
  'weight_decay': 0,
  'nesterov': False}]

In [26]:
# state_dict demo: rebuild the optimizer (now with momentum=0.9, managing only `weight`;
# the previous optimizer and its w2 group are discarded) and snapshot its state.
optimizer = optim.SGD([weight], lr=0.1, momentum=0.9)
# Before any step() the 'state' entry is empty; only 'param_groups' is populated.
opt_state_dict = optimizer.state_dict()
opt_state_dict

{'state': {},
 'param_groups': [{'lr': 0.1,
   'momentum': 0.9,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'params': [4758795120]}]}

In [28]:
# Run 10 updates so the optimizer creates its per-parameter momentum buffer.
# The buffer in the recorded output is all zeros - presumably because weight.grad
# was zeroed by the earlier zero_grad() call and never recomputed.
for i in range(10):
    optimizer.step()
optimizer.state_dict()

{'state': {4758795120: {'momentum_buffer': tensor([[0., 0.],
           [0., 0.]])}},
 'param_groups': [{'lr': 0.1,
   'momentum': 0.9,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'params': [4758795120]}]}

In [29]:
# Persist the optimizer state to a file in the current working directory.
# NOTE(review): os.path.join with a single argument returns the path unchanged.
torch.save(optimizer.state_dict(), os.path.join("optimizer_state_dict.pkl"))

In [40]:
# load_state_dict demo, part 1: build a fresh optimizer and read the saved state
# back from disk. The fresh optimizer's own 'state' is still empty at this point.
optimizer = optim.SGD([weight], lr=0.1, momentum=0.9)
state_dict = torch.load(os.path.join("optimizer_state_dict.pkl"))
optimizer.state_dict()

{'state': {},
 'param_groups': [{'lr': 0.1,
   'momentum': 0.9,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'params': [4758795120]}]}

In [39]:
# load_state_dict demo, part 2: restore the saved state into the fresh optimizer.
# The momentum buffer recorded before saving shows up again in 'state'.
optimizer.load_state_dict(state_dict)
optimizer.state_dict()

{'state': {4758795120: {'momentum_buffer': tensor([[0., 0.],
           [0., 0.]])}},
 'param_groups': [{'lr': 0.1,
   'momentum': 0.9,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'params': [4758795120]}]}

利用学习率控制更新的步伐

In [34]:
import numpy as np
import matplotlib.pyplot as plt
# Fix PyTorch's global random seed (to 1) for the cells that follow.
torch.manual_seed(1)

<torch._C.Generator at 0x10ce9b990>

In [45]:
def func(x_t):
    """Evaluate the toy objective y = (2 * x)^2 = 4 * x^2 element-wise.

    Args:
        x_t: tensor of input values.

    Returns:
        Tensor of the same shape as ``x_t`` holding the objective values.
    """
    doubled = 2 * x_t
    return doubled ** 2
# Starting point x = 2 for the descent below; requires_grad=True so that
# backward() on func(x) fills in x.grad.
x = torch.tensor([2.], requires_grad=True)

In [41]:
# Plot the objective on 100 evenly spaced points in [-3, 3].
# All pyplot calls below act on the implicit current axes.
x_t = torch.linspace(-3, 3, 100)
y = func(x_t)
plt.plot(x_t.numpy(), y.numpy(), label='y = 4 * x ^ 2')
plt.grid()
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

In [46]:
# Hand-written gradient descent on y = (2x)^2, starting from the current `x`.
# Records the iteration index, loss and x at every step, then plots
# (left) loss vs. iteration and (right) the visited points on the curve.
iter_rec, loss_rec, x_rec = list(), list(), list()
lr = 0.01
max_iteration = 20
for i in range(max_iteration):
    y = func(x)
    y.backward()  # fills x.grad with dy/dx (= 8x for this objective)
    print("Iter:{}, X:{:8}, X.grad:{:8}, loss:{:10}".format(i, x.detach().numpy()[0], x.grad.detach().numpy()[0], y.item()))
    x_rec.append(x.item())
    # Apply the update outside autograd tracking rather than going through `.data`.
    with torch.no_grad():
        x.sub_(lr * x.grad)
    # Gradients accumulate across backward() calls, so clear them every iteration.
    x.grad.zero_()
    iter_rec.append(i)
    # Store a plain Python float, as the later cells do: appending `y` itself would
    # keep an autograd-attached tensor alive and hand tensors that require grad to
    # matplotlib for conversion.
    loss_rec.append(y.item())
plt.subplot(121).plot(iter_rec, loss_rec, '-ro')
plt.xlabel("Iteration")
plt.ylabel("Loss value")
x_t = torch.linspace(-3, 3, 100)
y = func(x_t)
plt.subplot(122).plot(x_t.numpy(), y.numpy(), label="y = 4 * x ^ 2")
plt.grid()
# Re-evaluate the objective at every visited x to overlay the descent path.
y_rec = [func(torch.tensor(x_val)).item() for x_val in x_rec]
plt.subplot(122).plot(x_rec, y_rec, '-ro')
plt.legend()
plt.show()

Iter:0, X:     2.0, X.grad:    16.0, loss:      16.0
Iter:1, X:1.840000033378601, X.grad:14.720000267028809, loss:13.542400360107422
Iter:2, X:1.6928000450134277, X.grad:13.542400360107422, loss:11.462287902832031
Iter:3, X:1.5573760271072388, X.grad:12.45900821685791, loss:9.701680183410645
Iter:4, X:1.432785987854004, X.grad:11.462287902832031, loss:8.211503028869629
Iter:5, X:1.3181631565093994, X.grad:10.545305252075195, loss:6.950216293334961
Iter:6, X:1.2127101421356201, X.grad:9.701681137084961, loss:5.882663726806641
Iter:7, X:1.1156933307647705, X.grad:8.925546646118164, loss:4.979086399078369
Iter:8, X:1.0264378786087036, X.grad:8.211503028869629, loss:4.214298725128174
Iter:9, X:0.9443228244781494, X.grad:7.554582595825195, loss:3.5669822692871094
Iter:10, X:0.8687769770622253, X.grad:6.950215816497803, loss:3.0190937519073486
Iter:11, X:0.7992748022079468, X.grad:6.394198417663574, loss:2.555360794067383
Iter:12, X:0.7353328466415405, X.grad:5.882662773132324, loss:2.162857



In [51]:
# Multiple learning rates: run the same hand-written gradient descent from x = 2
# for 10 evenly spaced learning rates in [0.01, 0.2] and compare the loss curves.
iteration = 100
num_lr = 10
lr_min, lr_max = 0.01, 0.2
lr_list = np.linspace(lr_min, lr_max, num=num_lr).tolist()
loss_rec = [[] for _ in lr_list]  # one loss history per learning rate
iter_rec = list()  # reset here as in the original cell; not used below
for i, lr in enumerate(lr_list):
    x = torch.tensor([2.], requires_grad=True)  # restart from the same point
    # `_` instead of `iter`: the loop index is unused, and binding `iter` at
    # notebook top level would shadow the builtin for every later cell.
    for _ in range(iteration):
        y = func(x)
        y.backward()
        # Apply the update outside autograd tracking rather than going through `.data`.
        with torch.no_grad():
            x.sub_(lr * x.grad)
        x.grad.zero_()  # gradients accumulate, so clear them every iteration
        loss_rec[i].append(y.item())
for lr_value, loss_r in zip(lr_list, loss_rec):
    plt.plot(range(len(loss_r)), loss_r, label="LR: {}".format(lr_value))
plt.legend()
plt.xlabel("Iterations")
plt.ylabel("Loss value")
plt.show()

利用学习率控制更新的步伐

Momentum（动量，冲量）：结合当前梯度与上一次更新信息，用于当前更新

In [54]:
def exp_w_func(beta, time_list):
    """Return the exponentially-weighted-average weights (1 - beta) * beta**t.

    Args:
        beta: decay coefficient.
        time_list: iterable of exponents t (time steps back from the present).

    Returns:
        A list with one weight per entry of ``time_list``, in the same order.
    """
    scale = 1 - beta
    weights = []
    for t in time_list:
        weights.append(scale * np.power(beta, t))
    return weights
# Decay coefficient and the time steps 0..99 (as a plain Python list) used by
# the weight plots below.
beta = 0.9
num_point = 100
time_list = np.arange(num_point).tolist()

In [59]:
# Plot the weight assigned to each time step for the single beta defined above.
weights = exp_w_func(beta, time_list)
plt.plot(time_list, weights, '-ro', label="Beta: {}\n y = B^t * (1-B)".format(beta))
plt.xlabel("time")
plt.ylabel("weight")
plt.legend()
plt.title("exponentially weighted average")
plt.show()
# The weights form a truncated geometric series, so their sum is 1 - beta**num_point,
# which is just below 1 (0.99997... in the recorded output).
np.sum(weights)

0.9999734386011124

In [61]:
# Multiple betas: draw one weight curve per decay coefficient on shared axes.
beta_list = [0.98, 0.95, 0.9, 0.8]
w_list = [exp_w_func(b, time_list) for b in beta_list]
for b, w in zip(beta_list, w_list):
    plt.plot(time_list, w, label='Beta:{}'.format(b))
# The axis labels are the same for every curve, so set them once.
plt.xlabel("time")
plt.ylabel("weight")
plt.legend()
plt.show()

In [62]:
# SGD momentum
def func(x):
    """Evaluate the toy objective y = (2 * x)^2 = 4 * x^2 element-wise.

    Args:
        x: tensor of input values.

    Returns:
        Tensor of the same shape as ``x`` holding the objective values.
    """
    return (2 * x).pow(2)
# Compare SGD with momentum (lr=0.01, momentum=0.9) against plain SGD
# (lr=0.03, momentum=0) on the same objective, both starting from x = 2.
iteration = 100
m = 0.9
lr_list = [0.01, 0.03]
momentum_list = list()
loss_rec = [[] for _ in lr_list]  # one loss history per configuration
iter_rec = list()  # reset here as in the original cell; not used below
for i, lr in enumerate(lr_list):
    x = torch.tensor([2.], requires_grad=True)
    # Only the lr == 0.03 run is momentum-free; every other run uses m.
    momentum = 0. if lr == 0.03 else m
    momentum_list.append(momentum)
    optimizer = optim.SGD([x], lr = lr, momentum = momentum)
    # `_` instead of `iter`: the loop index is unused, and binding `iter` at
    # notebook top level would shadow the builtin for every later cell.
    for _ in range(iteration):
        y = func(x)
        y.backward()
        optimizer.step()
        optimizer.zero_grad()  # gradients accumulate, so clear them every iteration
        loss_rec[i].append(y.item())
for lr_value, momentum_value, loss_r in zip(lr_list, momentum_list, loss_rec):
    plt.plot(range(len(loss_r)), loss_r, label='LR:{} M:{}'.format(lr_value, momentum_value))
plt.legend()
plt.xlabel("Iterations")
plt.ylabel("Loss value")
plt.show()

optim.SGD  
主要参数：
* params:管理的参数组  
* lr:初始学习率  
* momentum:动量系数，β  
* weight_decay:L2正则化系数  
* nesterov:是否采用NAG