PyTorch Tensors are just like NumPy arrays, but they can also run on a GPU.   
The PyTorch Tensor API looks almost exactly like NumPy's.   
Here we fit a two-layer net using PyTorch Tensors:  

手工计算梯度

In [16]:
import torch

# Prefer the GPU, but fall back to the CPU so the example also runs on
# machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N samples with D_in features each, a hidden layer of width H, and D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predictions and loss.
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    loss = (y_pred - y).pow(2).sum()  # sum of squared errors (not averaged)

    # Backward pass: compute the gradients by hand with the chain rule.
    grad_y_pred = 2.0 * (y_pred - y)  # d(loss) / d(y_pred)
    grad_w2 = h_relu.t().mm(grad_y_pred)  # t() is the transpose
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # the clamp passes no gradient where its input was negative
    grad_w1 = x.t().mm(grad_h)

    # Plain gradient-descent update of both weight matrices.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

使用pytorch追踪计算图，自动求导    
创建tensor时，参数 requires_grad = True, 使得PyTorch建立计算图自动求导  

In [19]:
import torch

# Prefer the GPU, but fall back to the CPU so the example also runs on
# machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N samples with D_in features each, a hidden layer of width H, and D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
# requires_grad=True asks autograd to track operations on the weights.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(5):
    # Forward pass: PyTorch keeps track of these operations in the graph.
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    loss = (y_pred - y).pow(2).sum()

    # Compute d(loss)/d(w) for every tensor created with requires_grad=True.
    loss.backward()

    with torch.no_grad():  # the update itself must not be recorded in the graph
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Gradients accumulate across backward() calls unless they are reset.
        w1.grad.zero_()
        w2.grad.zero_()  # trailing underscore: in-place operation, no new tensor returned

PyTorch可以自己定义 autograd function  
自定义autograd function需要给tensors 写 forward 和 backward 静态方法  
ctx是上下文对象，用于在forward中保存backward需要的数据（如ctx.save_for_backward），backward返回的梯度要与forward的输入一一对应  

In [23]:
import torch


class MyReLU(torch.autograd.Function):
    """ReLU written as a custom autograd Function with explicit forward and backward."""

    @staticmethod
    def forward(ctx, x):
        """Return ``x`` with negative entries clamped to zero; ``x`` is saved for backward."""
        ctx.save_for_backward(x)
        relu_out = x.clamp(min=0)
        return relu_out

    @staticmethod
    def backward(ctx, grad_y):
        """Return a copy of ``grad_y`` with zeros wherever the saved input was negative."""
        (saved_input,) = ctx.saved_tensors
        # masked_fill is out-of-place, so the incoming gradient is left untouched.
        return grad_y.masked_fill(saved_input < 0, 0)
    
def my_relu(x):
    """Apply the custom ``MyReLU`` autograd Function to ``x`` and return the result."""
    # Custom Functions are invoked through the inherited ``apply`` entry point.
    relu_fn = MyReLU.apply
    return relu_fn(x)





# Prefer the GPU, but fall back to the CPU so the example also runs on
# machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N samples with D_in features each, a hidden layer of width H, and D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(5):
    h = x.mm(w1)
    h_relu = my_relu(h)  # use the custom autograd Function instead of clamp
    y_pred = h_relu.mm(w2)
    loss = (y_pred - y).pow(2).sum()

    loss.backward()

    with torch.no_grad():  # the update itself must not be recorded in the graph
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Reset the gradients so they do not accumulate into the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

PyTorch.nn   
更高级的封装   

In [29]:
import torch

# Prefer the GPU, but fall back to the CPU so the example also runs on
# machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N samples with D_in features each, a hidden layer of width H, and D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# The learnable weights now live inside the model's layers.
model = torch.nn.Sequential(
        torch.nn.Linear(D_in, H),
        torch.nn.ReLU(),
        torch.nn.Linear(H, D_out))
model.to(device)

learning_rate = 1e-2
for t in range(5):
    # The whole forward pass is handled by the torch.nn layers; nothing is computed by hand.
    y_pred = model(x)
    loss = torch.nn.functional.mse_loss(y_pred, y)  # the loss also comes from torch.nn

    loss.backward()

    with torch.no_grad():
        for param in model.parameters():  # iterate over every parameter of the model
            param -= learning_rate * param.grad
    model.zero_grad()   # reset the gradients before the next iteration

使用optim优化器   
weight的更新就不需要手动编写了；Adam等优化器还会根据梯度的统计量为每个参数自适应调整实际的更新步长  

In [30]:
import torch

# Prefer the GPU, but fall back to the CPU so the example also runs on
# machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N samples with D_in features each, a hidden layer of width H, and D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

model = torch.nn.Sequential(
        torch.nn.Linear(D_in, H),
        torch.nn.ReLU(),
        torch.nn.Linear(H, D_out))
model.to(device)
learning_rate = 1e-4

# Adam is usually the first optimizer to try.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(5):
    y_pred = model(x)
    loss = torch.nn.functional.mse_loss(y_pred, y)

    loss.backward()

    optimizer.step()  # update the parameters
    optimizer.zero_grad()  # reset the gradients; here equivalent to model.zero_grad()

继承torch.nn.Module模块来自定义模型，使得自定义的模型可以使用pytorch的autograd

In [35]:
import torch
class ParalleBlock(torch.nn.Module):
    """Two parallel linear layers whose outputs are multiplied element-wise and clamped at zero."""

    def __init__(self, D_in, D_out):
        """Create both linear branches, each mapping ``D_in`` features to ``D_out`` features."""
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, D_out)
        self.linear2 = torch.nn.Linear(D_in, D_out)

    def forward(self, x):
        """Feed ``x`` through both branches and return their clamped element-wise product."""
        branch_product = self.linear1(x) * self.linear2(x)
        return branch_product.clamp(min=0)
    
# Prefer the GPU, but fall back to the CPU so the example also runs on
# machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N samples with D_in features each, hidden width H, and D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

model = torch.nn.Sequential(
        ParalleBlock(D_in, H),  # custom torch.nn.Module used like any built-in layer
        ParalleBlock(H, H),
        torch.nn.Linear(H, D_out))
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for t in range(5):
    y_pred = model(x)
    loss = torch.nn.functional.mse_loss(y_pred, y)
    loss.backward()
    optimizer.step()  # update the parameters
    optimizer.zero_grad()  # reset the gradients; here equivalent to model.zero_grad()
    

### 读取 custom data
重写Dataset，读数据、改数据    
使用DataLoader，进行minibatching等    


In [None]:
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# Template (pseudo-code; the placeholder text below is not runnable as-is):
# subclass Dataset and implement __getitem__() and __len__().
class ReadMyDataset(Dataset):
    # Placeholder parameters: the dataset path, how to split the dataset, and so on.
    def __init__(self,数据集路径,怎么划分数据集xx,xxx):
    
    def __getitem__(self, index):
        # The raw data can be processed here, e.g. data augmentation,
        # image cropping / translation / flipping / normalization, point-cloud projection, etc.
        # Placeholder: return the sample at this index (its features, label, ...).
        return 这个index的sample，把sample的feature、label等返回
    
    def __len__(self):
        # Placeholder: return the total number of samples.
        return 一共多少sample

# Create an instance of the Dataset (the argument is a placeholder).
my_dataset = ReadMyDataset(xxxxxxx)

# Wrap the dataset in a DataLoader for minibatching, shuffling, multi-worker loading, ...
# NOTE(review): `shuffle=` and `num_workers=` are left blank as placeholders, and `args` /
# `train_sampler` are not defined in this snippet — fill them in before running.
# NOTE(review): the DataLoader documentation describes `sampler` as mutually exclusive
# with `shuffle` — confirm which of the two is intended here.
data_loader = DataLoader(
                my_dataset, 
                batch_size=args.batch_size, 
                shuffle=, 
                num_workers=, 
                pin_memory=True, 
                sampler=train_sampler)

# Consume the data: xx and xxx are the values returned by the Dataset's __getitem__().
for xx,xxx, in data_loader: 
    # model forward / backward pass goes here


### Pretrained Model
迁移学习 微调


In [36]:
import torch
import torchvision
# Load a ResNet-101 with pretrained weights as the starting point for
# transfer learning / fine-tuning.
# NOTE(review): newer torchvision releases reportedly deprecate `pretrained=` in
# favour of a `weights=` argument — confirm against the installed version.
resnet101 = torchvision.models.resnet101(pretrained=True)

### 可视化
#### Visdom
功能和matplotlib差不多，但可以远程显示
#### tensorboardX

### 动态计算图
Dynamic Computation Graphs
每一次迭代都要重新构建计算图
缺点是效率低

### 与其它框架的关系
PyTorch适合研究、论文复现，吸收了Caffe2，底层为C++   

Caffe2适合工业部署，底层为C++   

Tensorflow研究、生产都可，且有谷歌的生态、TPU加持，但更新快、文档跟不上，Tensorflow2的诞生让Tensorflow1的代码出了大量问题   

可以用ONNX（由微软和Facebook共同发起的开放模型格式）将A框架训练的模型用于B框架，例如可以将PyTorch的模型导出后用Caffe2运行     