## two main features
 - an n-dimensional tensor
 - auto differentiation

## 使用numpy实现两层神经网络
x --> z1 --> relu(z1) --> z2

### 基本流程
 - 定义x,y,w1,w2
 - 构建迭代（以下操作均在迭代中）
     - forward:计算y_pred
     - loss:计算loss
     - 计算梯度
     - 更新参数

In [16]:
import numpy as np
# Problem sizes: batch size, hidden width, input width, output width.
N, H, D_in, D_out = 60, 100, 1000, 10

# Random inputs and targets, followed by randomly initialised weights.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

epochs = 1000
lr = 1e-6

for i in range(epochs):
    # Forward pass: linear -> ReLU -> linear.
    hidden = np.dot(x, w1)
    hidden_relu = np.maximum(hidden, 0)
    y_pred = np.dot(hidden_relu, w2)

    # Sum-of-squares loss, reported every 100 epochs.
    residual = y_pred - y
    loss = np.square(residual).sum()
    if i % 100 == 0:
        print("epoch:{}, loss:{}".format(i, loss))

    # Backward pass: hand-written chain rule through both layers.
    grad_y_pred = 2 * residual
    grad_w2 = np.dot(hidden_relu.T, grad_y_pred)
    # The ReLU blocks the gradient wherever its input was negative.
    grad_hidden = np.where(hidden < 0, 0, np.dot(grad_y_pred, w2.T))
    grad_w1 = np.dot(x.T, grad_hidden)

    # Gradient-descent step on both weight matrices.
    w1 = w1 - lr * grad_w1
    w2 = w2 - lr * grad_w2
    

epoch:0, loss:27439754.59211147
epoch:100, loss:377.04451931385836
epoch:200, loss:1.2946316517472183
epoch:300, loss:0.007374963136117815
epoch:400, loss:4.963957402777351e-05
epoch:500, loss:3.578759414149514e-07
epoch:600, loss:2.6760459674019172e-09
epoch:700, loss:2.0504052634223703e-11
epoch:800, loss:1.6019997531989256e-13
epoch:900, loss:1.2737021935012425e-15


## 使用torch的tensor实现
- 和numpy的流程一致

### 基本流程
 - 定义x,y,w1,w2
 - 构建迭代（以下操作均在迭代中）
     - forward:计算y_pred
     - loss:计算loss
     - 计算梯度
     - 更新参数

### 几个函数
 - `torch.randn`
 - `x.mm()`
 - `h.clamp(min = 0)`
 - `h.clone()`
 - `y.pow(2)`
 - `loss.item()`

In [17]:
import torch

In [18]:
# 32-bit floats for every tensor; run on the first CUDA device when one is
# available and fall back to the CPU otherwise.
dtype = torch.float32
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [30]:
# Problem sizes: batch size, hidden width, input width, output width.
N, H, D_in, D_out = 60, 100, 1000, 10

# Random inputs and targets, followed by randomly initialised weights.
x = torch.randn(N, D_in, dtype=dtype, device=device)
y = torch.randn(N, D_out, dtype=dtype, device=device)

w1 = torch.randn(D_in, H, dtype=dtype, device=device)
w2 = torch.randn(H, D_out, dtype=dtype, device=device)

lr = 1e-6
epochs = 1000
for t in range(epochs):
    # Forward pass: linear -> ReLU -> linear.
    hidden = torch.mm(x, w1)
    hidden_relu = torch.clamp(hidden, min=0)
    y_pred = torch.mm(hidden_relu, w2)

    # Sum-of-squares loss as a Python float, reported every 100 steps.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    if t % 100 == 0:
        print(t, loss)

    # Backward pass written out by hand (no autograd involved here).
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(hidden_relu.t(), grad_y_pred)
    # The ReLU blocks the gradient wherever its input was negative.
    grad_hidden = torch.mm(grad_y_pred, w2.t()).masked_fill(hidden < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_hidden)

    # Gradient-descent step on both weight matrices.
    w1 = w1 - lr * grad_w1
    w2 = w2 - lr * grad_w2

0 31675672.0
100 599.44287109375
200 5.669238567352295
300 0.09221171587705612
400 0.002032718388363719
500 0.00016421520558651537
600 4.14446258218959e-05
700 1.7756441593519412e-05
800 1.0123154424945824e-05
900 6.781424872315256e-06


## 3. 自动微分实现两层神经网络

<font color = "#0000dd">自动微分引入后不用再手动计算梯度</font>

### 基本流程
 - 定义x,y,w1,w2
 - 构建迭代（以下操作均在迭代中）
     - forward:计算y_pred
     - loss:`计算loss`变成`定义loss`
     - `计算梯度`变为`loss.backward()`
     - 更新参数 <font color = "#dd0000">更新之后参数梯度清零</font>

### 计算图的说明
 - node --> tensor
 - edge --> function
 - `x.requires_grad = True`

In [33]:
import torch

In [34]:
# Tensors are 32-bit floats.
dtype = torch.float32
# Prefer the first GPU if CUDA is usable; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [35]:
# Problem sizes: batch size, hidden width, input width, output width.
N, H, D_in, D_out = 60, 100, 1000, 10

# Inputs and targets are plain tensors; only the weights are tracked by autograd.
x = torch.randn(N, D_in, dtype=dtype, device=device)
y = torch.randn(N, D_out, dtype=dtype, device=device)

w1 = torch.randn(D_in, H, dtype=dtype, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, dtype=dtype, device=device, requires_grad=True)

lr = 1e-6
epochs = 1000
for t in range(epochs):
    # Forward pass: linear -> ReLU -> linear.
    hidden_relu = x.mm(w1).clamp(min=0)
    y_pred = hidden_relu.mm(w2)

    # The loss stays a tensor so that backward() can be called on it;
    # .item() extracts the scalar value for printing.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 0:
        print(t, loss.item())

    # Autograd fills w1.grad and w2.grad.
    loss.backward()

    # Update in place without recording the update in the graph, then reset
    # each gradient because backward() accumulates into .grad.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= lr * weight.grad
            weight.grad.zero_()
    
    
    


0 30176984.0
100 706.0363159179688
200 10.640213966369629
300 0.21382303535938263
400 0.0048640393652021885
500 0.0002953430521301925
600 6.683335959678516e-05
700 2.9312617698451504e-05
800 1.709126081550494e-05
900 1.1490310498629697e-05


## 自定义微分函数

<font color = "#0000dd">自定义自动微分函数,代替上一个cell中的`clamp`函数</font><br/>

#### 定义一个新函数时定义
 - forward: input --> output 类似于y = y(x)
 - backward: 上一步输出的grad --> 经过这个函数新的grad 
 
![定义新函数的自动微分时各个输入输出的关系](./pytorch_autograd_function.jpeg)

### 基本流程与使用自动微分（上一个cell）的方法一样
 - 定义x,y,w1,w2
 - 构建迭代（以下操作均在迭代中）
     - forward:计算y_pred  <font color = "#dd0000">使用自定义的relu函数计算</font>
     - loss:`定义loss`
     - `loss.backward()`
     - 更新参数 <font color = "#dd0000">更新之后参数梯度清零</font>

 - `ctx.save_for_backward`
 - `ctx.saved_tensors`

In [40]:
class CustomRelu(torch.autograd.Function):
    """ReLU written as a custom autograd function with explicit forward and backward."""

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` with every entry below zero replaced by zero.

        Args:
            ctx: autograd context; the input is saved on it for ``backward``.
            input: tensor to rectify.

        Returns:
            A tensor with the same shape as ``input``, clamped from below at 0.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient with respect to the forward input.

        Args:
            ctx: autograd context holding the tensor saved in ``forward``.
            grad_output: gradient of the loss with respect to the forward output.

        Returns:
            A copy of ``grad_output`` with zeros wherever the saved input was
            negative.
        """
        (saved_input,) = ctx.saved_tensors
        was_negative = saved_input < 0
        # masked_fill returns a new tensor, so grad_output is left untouched.
        return grad_output.masked_fill(was_negative, 0)
        

In [41]:
# Problem sizes: batch size, hidden width, input width, output width.
N, H, D_in, D_out = 60, 100, 1000, 10

# Inputs and targets are plain tensors; only the weights are tracked by autograd.
x = torch.randn(N, D_in, dtype=dtype, device=device)
y = torch.randn(N, D_out, dtype=dtype, device=device)

w1 = torch.randn(D_in, H, dtype=dtype, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, dtype=dtype, device=device, requires_grad=True)

lr = 1e-6
epochs = 1000

# Custom autograd functions are invoked through their `apply` method.
relu = CustomRelu.apply

for t in range(epochs):
    # Forward pass, with the hand-written ReLU in place of clamp.
    hidden = x.mm(w1)
    y_pred = relu(hidden).mm(w2)

    # The loss stays a tensor; .item() extracts the scalar value for printing.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 0:
        print(t, loss.item())

    # Autograd runs CustomRelu.backward as part of this call.
    loss.backward()

    # Update in place without recording the update, then clear the gradients.
    with torch.no_grad():
        w1.sub_(lr * w1.grad)
        w2.sub_(lr * w2.grad)

        w1.grad.zero_()
        w2.grad.zero_()
    
    
    



0 28664512.0
100 316.8251647949219
200 0.9898596405982971
300 0.005892891436815262
400 0.00017391871369909495
500 3.390698111616075e-05
600 1.4100089174462482e-05
700 8.332450306625105e-06
800 5.6630774452059995e-06
900 4.236172117089154e-06


### TensorFlow 静态图
- 动态的图：对不同的输入进行不同的计算
- 静态的图：通过循环实现展开操作

In [42]:
import tensorflow as tf
import numpy as np

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [45]:
# Problem sizes: batch size, hidden width, input width, output width.
N, H, D_in, D_out = 60, 100, 1000, 10

# Placeholders are fed with concrete arrays when the graph is run.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Variables keep their values across session.run calls.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: linear -> ReLU (as an element-wise maximum with zero) -> linear.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum-of-squares loss.
residual = y - y_pred
loss = tf.reduce_sum(residual ** 2.0)

grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# In TensorFlow the parameter update is itself part of the computation graph.
lr = 1e-6
new_w1 = w1.assign(w1 - lr * grad_w1)
new_w2 = w2.assign(w2 - lr * grad_w2)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    feed = {x: x_value, y: y_value}

    # Fetching the assign ops alongside the loss is what applies the update.
    fetches = [loss, new_w1, new_w2]
    for t in range(500):
        loss_value = sess.run(fetches, feed_dict=feed)[0]
        if t % 100 == 0:
            print(loss_value)

24619446.0
200.91222
0.53667057
0.0024911969
8.858916e-05


## 使用nn模块实现
 - 不用编写层间函数
 - 给定每一层输入与输出的维度即可

### 基本流程
 - 定义x,y
 - 定义模型的结构
 - 定义loss
 - 构建迭代（以下操作均在迭代中）
     - forward:计算y_pred  <font color = "#dd0000">`model(x)`</font>
     -  <font color = "#dd0000">backward之前模型梯度清零</font>
     - `loss.backward()`
     - 更新参数 <font color = "#dd0000">更新之后不必参数梯度清零</font>

`model.zero_grad()`在pytorch中，参数在backward中积累梯度，在backward前清零

In [51]:
import torch 
# Problem sizes: batch size, hidden width, input width, output width.
N, H, D_in, D_out = 60, 100, 1000, 10
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Two linear layers with a ReLU in between; the layers create and own their
# weights and biases. The model is moved to the same device and dtype as x
# and y: nn.Linear creates its parameters on the CPU, so without this the
# forward pass fails with a device mismatch whenever `device` is a CUDA device.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device=device, dtype=dtype)

# Sum (not mean) of squared errors, matching the hand-written loss used earlier.
loss_fn = torch.nn.MSELoss(reduction="sum")

lr = 1e-6
for t in range(1000):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        print(t, loss.item())

    # Gradients accumulate across backward() calls, so clear them first.
    model.zero_grad()
    loss.backward()

    # Plain gradient-descent step, kept out of autograd tracking.
    with torch.no_grad():
        for param in model.parameters():
            param -= lr * param.grad


0 640.214599609375
100 589.9837036132812
200 546.4067993164062
300 508.3091735839844
400 474.4449157714844
500 444.0259094238281
600 416.2925720214844
700 391.2334899902344
800 368.28009033203125
900 347.1427001953125


## pytorch 优化器
 - 自动更新参数 以Adam为例
 - `optimizer.step()`来更新参数

### 基本流程
 - 定义x,y
 - 定义模型的结构
 - 定义loss
 - 定义优化器
 - 构建迭代（以下操作均在迭代中）
     - forward:计算y_pred  <font color = "#dd0000">`model(x)`</font>
     - loss:`定义loss`
     -  <font color = "#dd0000">backward之前模型梯度清零</font>
     - `loss.backward()`
     - 更新参数: `optimizer.step()`

In [63]:
import torch 
# Problem sizes: batch size, hidden width, input width, output width.
N, H, D_in, D_out = 60, 100, 1000, 10
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# The model is moved to the same device and dtype as x and y: nn.Linear
# creates its parameters on the CPU, so without this the forward pass fails
# with a device mismatch whenever `device` is a CUDA device. The move happens
# before the optimizer is built, so the optimizer sees the final parameters.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device=device, dtype=dtype)

# Sum (not mean) of squared errors.
loss_fn = torch.nn.MSELoss(reduction="sum")

lr = 1e-6
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for t in range(1000):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        print(t, loss.item())

    # Clear the gradients accumulated by the previous backward() call.
    optimizer.zero_grad()

    loss.backward()

    # Let the optimizer apply the parameter update.
    optimizer.step()



0 603.3372192382812
100 587.5303955078125
200 572.2085571289062
300 557.3739624023438
400 542.9630737304688
500 529.021484375
600 515.567626953125
700 502.46270751953125
800 489.7728576660156
900 477.42999267578125


## 自定义nn模型
 - 通过创建`nn.Module`的子类实现新的模型结构
 - 模型结构在`forward`中构造

#### 自定义模型等同于`torch.nn.Sequential`
 - 初始化的时候构造基本层(`torch.nn.Linear`)的实例
 - 在`forward`中构造模型结构（端到端：x经过forward直接得到y）

In [74]:
import torch
class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU (clamp at zero) between them."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers.

        Args:
            D_in: size of each input sample.
            H: size of the hidden layer.
            D_out: size of each output sample.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Compute the prediction for ``x``.

        Args:
            x: input tensor passed to the first linear layer.

        Returns:
            The output of the second linear layer.
        """
        hidden = self.linear1(x)
        # Clamping at zero from below is the ReLU non-linearity.
        hidden_relu = torch.clamp(hidden, min=0)
        return self.linear2(hidden_relu)

In [78]:
# Problem sizes: batch size, hidden width, input width, output width.
N, H, D_in, D_out = 60, 100, 1000, 10

# Random inputs and targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Custom module, summed squared-error loss and plain SGD.
model = TwoLayerNet(D_in, H, D_out)
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(1000):
    # Forward pass and loss, reported every 100 steps.
    loss = criterion(model(x), y)
    if t % 100 == 0:
        print(t, loss.item())

    # Clear old gradients, backpropagate, then update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 667.10009765625
100 3.46479868888855
200 0.05651705712080002
300 0.002222836948931217
400 0.00012843955482821912
500 9.304566447099205e-06
600 7.719827976870874e-07
700 7.197548512749563e-08
800 9.258323885319442e-09
900 2.3363508905305252e-09


## 控制流(自定义) + 参数分享 
-  中间层的使用次数是随机的

### 基本流程
 - 将自定义的模型替换`torch.nn.Sequential`，其他与使用优化器的流程一致

 - 自定义模型（类）
     - 初始化定义基本层
     - forward定义模型结构
 - 定义x,y
 -  <font color = "#dd0000">实例化自定的模型类</font>
 - loss函数
 - 优化器的定义
 - 构建迭代（以下操作均在迭代中）
     - forward:计算y_pred  <font color = "#dd0000">`model(x)`</font>
     - loss:`定义loss`
     -  <font color = "#dd0000">backward之前模型梯度清零</font>
     - `loss.backward()`
     - 更新参数 <font color = "#dd0000">更新之后不必参数梯度清零</font>

In [65]:
import random
import torch

class DynamicNet(torch.nn.Module):
    """Network whose shared middle layer is applied a random number of times."""

    def __init__(self, D_in, H, D_out):
        """Create the three linear layers used in the forward pass.

        Args:
            D_in: size of each input sample.
            H: size of the hidden layer.
            D_out: size of each output sample.
        """
        super().__init__()
        self.input_layer = torch.nn.Linear(D_in, H)
        self.middle_layer = torch.nn.Linear(H, H)
        self.output_layer = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Compute the prediction for ``x``.

        Each call builds its own computation graph: ordinary Python control
        flow decides how many times the middle layer runs. The count is drawn
        from ``random.randint(0, 3)``, and every repetition reuses the same
        ``middle_layer`` weights.
        """
        hidden = torch.clamp(self.input_layer(x), min=0)
        remaining = random.randint(0, 3)
        while remaining > 0:
            hidden = torch.clamp(self.middle_layer(hidden), min=0)
            remaining -= 1
        return self.output_layer(hidden)
    
    
    

In [67]:
# Problem sizes: batch size, hidden width, input width, output width.
N, H, D_in, D_out = 60, 100, 1000, 10

# Random inputs and targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Build the model.
model = DynamicNet(D_in, H, D_out)

# Summed squared-error loss, optimised by SGD with momentum.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

for t in range(1000):
    # Forward pass and loss, reported every 100 steps.
    loss = criterion(model(x), y)
    if t % 100 == 0:
        print(t, loss.item())

    # Clear old gradients, backpropagate, then update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    


0 618.580810546875
100 21.41787338256836
200 6.948872089385986
300 1.4741227626800537
400 0.9266403913497925
500 2.9530601501464844
600 0.2710525393486023
700 0.32364121079444885
800 0.03936019167304039
900 0.12787111103534698
