In [1]:
import torch 

# Show the installed PyTorch version; the bare last expression is rendered as the cell output.
torch.__version__

'2.5.1'

In [2]:
# Scalar: a tensor built from a single Python number.
scalar = torch.tensor(7)
# Bare last expression so the notebook displays the tensor.
scalar

tensor(7)

In [3]:
# Number of dimensions of `scalar` (defined in an earlier cell).
scalar.ndim

0

In [4]:
# Pull the single value out of the tensor as a plain Python number.
scalar.item()

7

In [5]:
# Vector: a tensor built from a flat list of numbers.
vector = torch.tensor([7, 7])
# Bare last expression so the notebook displays the tensor.
vector

tensor([7, 7])

In [7]:
# Random tensor with 3 rows and 4 columns, drawn by torch.rand.
random_tensor = torch.rand(3, 4)
# Display the values together with the element dtype.
(random_tensor, random_tensor.dtype)

(tensor([[0.5473, 0.2097, 0.6984, 0.8675],
         [0.6385, 0.5979, 0.2395, 0.0789],
         [0.8791, 0.2240, 0.9851, 0.4896]]),
 torch.float32)

In [9]:
import torch

# Toy objective minimised with plain SGD: loss = w_1^2 + 2 * w_2^2
w = torch.tensor([0.5, 0.5], requires_grad=True)
opti = torch.optim.SGD([w], lr=0.01)
loss = w[0].pow(2) + 2 * w[1].pow(2)

# backward() fills w.grad with d(loss)/dw = [2*w_1, 4*w_2].
loss.backward()
print("w的梯度, ", w.grad)
# Despite its label, this print runs after backward() and before the optimizer step.
print("before backbard w: ", w)
# step() applies w <- w - lr * w.grad.
opti.step()
print("after backward w: ", w)

w的梯度,  tensor([1., 2.])
before backbard w:  tensor([0.5000, 0.5000], requires_grad=True)
after backward w:  tensor([0.4900, 0.4800], requires_grad=True)


In [10]:
# What happens if the optimizer steps one extra time?
import torch

w = torch.tensor([0.5, 0.5], requires_grad=True)
opti = torch.optim.SGD([w], lr=0.01)
loss = w[0].pow(2) + 2 * w[1].pow(2)

loss.backward()
print("w的梯度, ", w.grad)
# Despite its label, this print runs after backward() and before the first optimizer step.
print("before backbard w: ", w)
# Each step() mechanically re-applies whatever is stored in w.grad. backward() is not
# run again, so the second step reuses the gradient computed at the original w.
for _step in range(2):
    opti.step()
    print("after backward w: ", w)

# Is this how parameters are updated in real training? No.
# After an update, the new parameters are run on the next batch of data, which gives a
# new loss, and a new gradient is computed from that loss.

w的梯度,  tensor([1., 2.])
before backbard w:  tensor([0.5000, 0.5000], requires_grad=True)
after backward w:  tensor([0.4900, 0.4800], requires_grad=True)
after backward w:  tensor([0.4800, 0.4600], requires_grad=True)


In [14]:
# After each parameter update, clear the gradient and recompute it from a fresh loss.

import torch

w = torch.tensor([0.5, 0.5], requires_grad=True)
opti = torch.optim.SGD([w], lr=0.01)

# Two identical training iterations: forward, backward, update, reset.
for _step in range(2):
    # Rebuild the loss from the current w so the gradient reflects the updated parameters.
    # No .clone() is needed: every forward pass builds a new graph from w, and the graph
    # from the previous iteration is never reused.
    loss = w[0] ** 2 + 2 * w[1] ** 2
    loss.backward()
    print("w的梯度, ", w.grad)
    # Despite its label, this print runs after backward() and before the optimizer step.
    print("before backbard w: ", w)
    opti.step()
    print("after backward w: ", w)
    # Reset the gradient so the next backward() does not accumulate onto this one.
    opti.zero_grad()


w的梯度,  tensor([1., 2.])
before backbard w:  tensor([0.5000, 0.5000], requires_grad=True)
after backward w:  tensor([0.4900, 0.4800], requires_grad=True)
w的梯度,  tensor([0.9800, 1.9200])
before backbard w:  tensor([0.4900, 0.4800], requires_grad=True)
after backward w:  tensor([0.4802, 0.4608], requires_grad=True)


In [15]:
# Weight decay

import torch

w = torch.tensor([0.5, 0.5], requires_grad=True)
opti = torch.optim.SGD([w], lr=0.01, weight_decay=1)
loss = w[0].pow(2) + 2 * w[1].pow(2)

loss.backward()
print("w的梯度, ", w.grad)
# Despite its label, this print runs after backward() and before the first optimizer step.
print("before backbard w: ", w)
# Two optimizer steps on the same stored gradient, each followed by a labelled print.
for step_label in ("after backward 1 step: ", "after backward 2 step: "):
    opti.step()
    print(step_label, w)

# Note that the printed gradient does not include the weight-decay term.
# Weight decay is the optimizer's concern, while backpropagation (computing the gradient)
# knows nothing about the optimizer. The decay is applied during the optimizer step.

w的梯度,  tensor([1., 2.])
before backbard w:  tensor([0.5000, 0.5000], requires_grad=True)
after backward 1 step:  tensor([0.4850, 0.4750], requires_grad=True)
after backward 2 step:  tensor([0.4702, 0.4502], requires_grad=True)


In [16]:
# SGD with momentum

import torch

w = torch.tensor([0.5, 0.5], requires_grad=True)
# loss = w_1^2 + 2 * w_2^2; indexing w already produces graph nodes, so no .clone() is needed.
loss = w[0] ** 2 + 2 * w[1] ** 2
opti = torch.optim.SGD([w], lr=0.01, momentum=0.1)

# backward() runs only once on this graph, so retain_graph is left at its default
# and the graph can be freed.
loss.backward()
print("w的梯度:", w.grad)
print("w:", w)

# First step: the momentum buffer starts out equal to the gradient, so the update
# matches plain SGD (w - lr * grad).
opti.step()
print("w:", w)

# Second step reuses the same w.grad. The buffer becomes 0.1 * buffer + grad = [1.1, 2.2],
# so this update is larger than the first one.
opti.step()
print("w:", w)

w的梯度: tensor([1., 2.])
w: tensor([0.5000, 0.5000], requires_grad=True)
w: tensor([0.4900, 0.4800], requires_grad=True)
w: tensor([0.4790, 0.4580], requires_grad=True)


In [17]:
# Trying a different optimizer.
# SGD applies the same learning rate to every parameter.
# Adagrad gives each parameter its own effective learning rate: the larger its gradients,
# the smaller the rate, and the smaller its gradients, the larger the rate.

import torch

w = torch.tensor([0.5, 0.5], requires_grad=True)
# loss = w_1^2 + 2 * w_2^2; indexing w already produces graph nodes, so no .clone() is needed.
loss = w[0] ** 2 + 2 * w[1] ** 2
opti = torch.optim.Adagrad([w], lr=0.01, lr_decay=0, weight_decay=0, initial_accumulator_value=0)

# Each graph is backpropagated exactly once, so retain_graph is left at its default.
loss.backward()
print("w的梯度:", w.grad)
print("w:", w)

# Both components move by the same amount here even though their gradients differ (1 vs 2).
opti.step()
print("w:", w)

# Clear the stored gradient so the next backward() does not accumulate onto it.
opti.zero_grad()

# Fresh forward/backward pass with the updated parameters.
loss = w[0] ** 2 + 2 * w[1] ** 2
loss.backward()
print("w的梯度:", w.grad)

opti.step()
print("w:", w)

w的梯度: tensor([1., 2.])
w: tensor([0.5000, 0.5000], requires_grad=True)
w: tensor([0.4900, 0.4900], requires_grad=True)
w的梯度: tensor([0.9800, 1.9600])
w: tensor([0.4830, 0.4830], requires_grad=True)
