# LEARNING PYTORCH WITH EXAMPLES

## 参考资料

> https://pytorch.org/tutorials/beginner/pytorch_with_examples.html  


In [1]:
import torch
print(torch.__version__)

import numpy as np
from collections import OrderedDict
import random

np.random.seed(0)
torch.random.seed()

1.5.0


147127536669100

## 张量

### Numpy: Array


In [2]:
# N: batch_size; D_in: dim of input
# H: dim of hidden; D_out: dim of output
N, D_in, H, D_out = 64, 1000, 100, 10

# 生成随机输入输出
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# 随机初始化权重
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

# 开始训练模型
learning_rate = 1e-6
for t in range(1, 501):
    # forward
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    # print loss
    loss = np.square(y_pred-y).sum()
    if t % 100 == 0:
        print('%3d %.4f' % (t, loss))
    # backward
    grad_y_pred = 2.*(y_pred-y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h<0] = 0
    grad_w1 = x.T.dot(grad_h)
    # update weight
    w1 -= learning_rate*grad_w1
    w2 -= learning_rate*grad_w2

100 876.6382
200 10.4804
300 0.1798
400 0.0034
500 0.0001


### PyTorch: Tensor

> `ndarray` -> `tensor`  
>> `ndarray.dot()` -> `tensor.mm()`  
>> `ndarray.maximum()` -> `tensor.clamp()`  
>> `ndarray.T` -> `tensor.t()`  
>> `ndarray.square()` -> `tensor.pow(2)`  
>> `ndarray.copy()` -> `tensor.clone()`  


In [3]:
dtype = torch.float
device = torch.device('cpu')
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N: batch_size; D_in: dim of input
# H: dim of hidden; D_out: dim of output
N, D_in, H, D_out = 64, 1000, 100, 10

# 生成随机输入输出
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# 随机初始化权重
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

# 开始训练模型
learning_rate = 1e-6
for t in range(1, 501):
    # forward
    h = x.mm(w1)
    # torch.clamp(input, min, max, out=None) -> Tensor
    # 将input张量每个元素夹紧到[min,max]区间内
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    # print loss
    loss = (y_pred-y).pow(2).sum()
    if t % 100 == 0:
        print('%3d %.4f' % (t, loss.item()))
    # backward
    grad_y_pred = 2.*(y_pred-y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h<0] = 0
    grad_w1 = x.t().mm(grad_h)
    # update weight
    w1 -= learning_rate*grad_w1
    w2 -= learning_rate*grad_w2

100 585.0891
200 4.3040
300 0.0422
400 0.0007
500 0.0001


## 自动微分

### 张量与自动微分


In [4]:
dtype = torch.float
device = torch.device('cpu')
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N: batch_size; D_in: dim of input
# H: dim of hidden; D_out: dim of output
N, D_in, H, D_out = 64, 1000, 100, 10

# 生成随机输入输出
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# 随机初始化权重
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# 开始训练模型
learning_rate = 1e-6
for t in range(1, 501):
    # forward
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    # print loss
    loss = (y_pred-y).pow(2).sum()
    if t % 100 == 0:
        print('%3d %.4f' % (t, loss.item()))   # `.item()`获取单个元素Tensor的标量值
    # backward
    loss.backward()
    # update weight
    with torch.no_grad():
        w1 -= learning_rate*w1.grad
        w2 -= learning_rate*w2.grad
        # manually zero gradients
        w1.grad.zero_()
        w2.grad.zero_()

100 622.4977
200 4.3769
300 0.0473
400 0.0009
500 0.0001


### 自定义Autograd函数

可通过继承 `torch.autograd.Function` 来实现自定义Autograd函数。


In [5]:
class MyReLU(torch.autograd.Function):
    """自定义ReLU"""

    @staticmethod
    def forward(ctx, inputs):
        ouputs = inputs.clamp(min=0)
        ctx.save_for_backward(inputs)
        return ouputs

    @staticmethod
    def backward(ctx, grad_outputs):
        inputs, = ctx.saved_tensors
        grad_inputs = grad_outputs.clone()
        grad_inputs[inputs<0] = 0
        return grad_inputs


class MyMM(torch.autograd.Function):
    """自定义矩阵乘法"""

    @staticmethod
    def forward(ctx, inputs, weights):
        # N*I I*O  N*O
        ouputs = inputs.mm(weights)
        ctx.save_for_backward(inputs, weights)
        return ouputs

    @staticmethod
    def backward(ctx, grad_outputs):
        inputs, weights = ctx.saved_tensors
        grad_inputs = grad_outputs.mm(weights.t())
        grad_weights = inputs.t().mm(grad_outputs)
        return grad_inputs, grad_weights

In [6]:
dtype = torch.float
device = torch.device('cpu')
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N: batch_size; D_in: dim of input
# H: dim of hidden; D_out: dim of output
N, D_in, H, D_out = 64, 1000, 100, 10

# 生成随机输入输出
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# 随机初始化权重
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# 开始训练模型
learning_rate = 1e-6
for t in range(1, 501):
    # 自定义Autograd函数的使用，调用`Function.apply()`方法
    mm = MyMM.apply
    relu = MyReLU.apply
    # forward
    y_pred = mm(relu(mm(x, w1)), w2)
    # print loss
    loss = (y_pred-y).pow(2).sum()
    if t % 100 == 0:
        print('%3d %.4f' % (t, loss.item()))   # `.item()`获取单个元素Tensor的标量值
    # backward
    loss.backward()
    # update weight
    with torch.no_grad():
        w1 -= learning_rate*w1.grad
        w2 -= learning_rate*w2.grad
        # manually zero gradients
        w1.grad.zero_()
        w2.grad.zero_()

100 433.6183
200 1.8319
300 0.0107
400 0.0002
500 0.0000


## nn.Module

`nn.Module`包定义了一组Modules，大致等效于神经网络的Layers，对神经网络进行更高级别的抽象。  
此外，`nn.Module`包还定义了一组常用的损失函数。  

### PyTorch: nn


In [7]:
dtype = torch.float
device = torch.device('cpu')
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N: batch_size; D_in: dim of input
# H: dim of hidden; D_out: dim of output
N, D_in, H, D_out = 64, 1000, 100, 10

# 生成随机输入输出
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# 将神经网络定义为Layers的序列（Sequence）
model = torch.nn.Sequential(OrderedDict([
    ('linear1', torch.nn.Linear(D_in, H)),
    ('relu', torch.nn.ReLU()),
    ('linear2', torch.nn.Linear(H, D_out)),
]))
print(model)

# 定义损失函数为均方差（Mean Squared Error, MSE）
# reduction = 'none'|'mean'|'sum'
loss_fn = torch.nn.MSELoss(reduction='sum')

# 开始训练模型
learning_rate = 1e-4
for t in range(1, 501):
    # forward pass
    y_pred = model(x)
    # compute loss
    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        print('%3d %.4f' % (t, loss.item()))
    # zero gradients
    model.zero_grad()
    # backprop
    loss.backward()
    # update weights
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate*param.grad

Sequential(
  (linear1): Linear(in_features=1000, out_features=100, bias=True)
  (relu): ReLU()
  (linear2): Linear(in_features=100, out_features=10, bias=True)
)
100 1.9401
200 0.0394
300 0.0021
400 0.0002
500 0.0000


### PyTorch: optim


In [8]:
dtype = torch.float
device = torch.device('cpu')
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N: batch_size; D_in: dim of input
# H: dim of hidden; D_out: dim of output
N, D_in, H, D_out = 64, 1000, 100, 10

# 生成随机输入输出
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# 将神经网络定义为Layers的序列（Sequence）
model = torch.nn.Sequential(OrderedDict([
    ('linear1', torch.nn.Linear(D_in, H)),
    ('relu', torch.nn.ReLU()),
    ('linear2', torch.nn.Linear(H, D_out)),
]))
print(model)

# 定义损失函数为均方差（Mean Squared Error, MSE）
# reduction = 'none'|'mean'|'sum'
loss_fn = torch.nn.MSELoss(reduction='sum')

# 开始训练模型
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(1, 501):
    # forward pass
    y_pred = model(x)
    # compute loss
    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        print('%3d %.4f' % (t, loss.item()))
    # zero gradients
    optimizer.zero_grad()
    # backprop
    loss.backward()
    # update weights
    optimizer.step()

Sequential(
  (linear1): Linear(in_features=1000, out_features=100, bias=True)
  (relu): ReLU()
  (linear2): Linear(in_features=100, out_features=10, bias=True)
)
100 49.7564
200 0.8396
300 0.0053
400 0.0000
500 0.0000


### 自定义nn.Module

可通过继承 `torch.nn.Module` 来实现自定义Modules。


In [9]:
# 定义一个简单双层网络模型 SimpleNet
class SimpleNet(torch.nn.Module):
    """自定义SimpleNet"""

    def __init__(self):
        super().__init__()
        # 初始化模型，定义好Layers
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
        self.relu = torch.nn.ReLU()

    def forward(self, inputs):
        # 重写`.forward()`方法，接收模型输入
        hidden = self.linear1(inputs)
        hidden_relu = self.relu(hidden)
        y_pred = self.linear2(hidden_relu)
        return y_pred

In [10]:
dtype = torch.float
device = torch.device('cpu')
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N: batch_size; D_in: dim of input
# H: dim of hidden; D_out: dim of output
N, D_in, H, D_out = 64, 1000, 100, 10

# 生成随机输入输出
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# 将神经网络定义为Layers的序列（Sequence）
model = SimpleNet()
print(model)

# 定义损失函数为均方差（Mean Squared Error, MSE）
# reduction = 'none'|'mean'|'sum'
loss_fn = torch.nn.MSELoss(reduction='sum')

# 开始训练模型
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(1, 501):
    # forward pass
    y_pred = model(x)
    # compute loss
    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        print('%3d %.4f' % (t, loss.item()))
    # zero gradients
    optimizer.zero_grad()
    # backprop
    loss.backward()
    # update weights
    optimizer.step()

SimpleNet(
  (linear1): Linear(in_features=1000, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=10, bias=True)
  (relu): ReLU()
)
100 47.0855
200 0.7062
300 0.0043
400 0.0000
500 0.0000


### 控制Flow和权重共享

动态网络模型：由于PyTorch使用的是动态图，因此可以轻易地实现动态网络模型，即模型的网络结构在迭代里可以动态地变化。


In [11]:
class DynamicNet(torch.nn.Module):
    """自定义动态网络"""

    def __init__(self):
        super().__init__()
        self.in_linear = torch.nn.Linear(D_in, H)
        self.mid_linear = torch.nn.Linear(H, H)
        self.out_linear = torch.nn.Linear(H, D_out)
        self.relu = torch.nn.ReLU()

    def forward(self, inputs):
        hidden_relu = self.relu(self.in_linear(inputs))
        # 随机确定0~2个隐层
        for _ in range(random.randint(0, 3)):
            hidden_relu = self.relu(self.mid_linear(hidden_relu))
        y_pred = self.out_linear(hidden_relu)
        return y_pred

In [12]:
dtype = torch.float
device = torch.device('cpu')
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N: batch_size; D_in: dim of input
# H: dim of hidden; D_out: dim of output
N, D_in, H, D_out = 64, 1000, 100, 10

# 生成随机输入输出
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# 将神经网络定义为Layers的序列（Sequence）
model = DynamicNet()
print(model)

# 定义损失函数为均方差（Mean Squared Error, MSE）
# reduction = 'none'|'mean'|'sum'
loss_fn = torch.nn.MSELoss(reduction='sum')

# 开始训练模型
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(1, 501):
    # forward pass
    y_pred = model(x)
    # compute loss
    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        print('%3d %.4f' % (t, loss.item()))
    # zero gradients
    optimizer.zero_grad()
    # backprop
    loss.backward()
    # update weights
    optimizer.step()

DynamicNet(
  (in_linear): Linear(in_features=1000, out_features=100, bias=True)
  (mid_linear): Linear(in_features=100, out_features=100, bias=True)
  (out_linear): Linear(in_features=100, out_features=10, bias=True)
  (relu): ReLU()
)
100 592.9760
200 300.9666
300 122.1668
400 16.4641
500 9.5828
