In [2]:
import torch
from torch import nn

# PyTorch源码
Pytorch源码:https://github.com/pytorch/pytorch

# Scalars
lower-case letters: $x\in R$

In [2]:
# uninitialized
torch.empty(1)

tensor([0.])

In [3]:
# randomly initialized
torch.rand(1)

tensor([0.7591])

In [4]:
torch.zeros(1,dtype=torch.long)

tensor([0])

In [5]:
torch.zeros(1,dtype=torch.float)

tensor([0.])

In [6]:
torch.tensor(1.337)

tensor(1.3370)

In [7]:
# map to Python built-in type(内置类型,例如int,char,float,double)
torch.tensor(1.5).item()

1.5

In [8]:
x=torch.tensor(1.3)
y=torch.tensor(2.7)
x+y

tensor(4.)

In [9]:
x-y

tensor(-1.4000)

In [10]:
x/y

tensor(0.4815)

In [11]:
x*y

tensor(3.5100)

In [12]:
# exponentiation
x**y

tensor(2.0307)

# Vectors
lower-case boldface letters: $\boldsymbol{x}\in \mathbb R^n$

In [13]:
x = torch.tensor([1, 2, 3])
y = torch.tensor([4, 5, 6])
x + y

tensor([5, 7, 9])

In [14]:
x * 2

tensor([2, 4, 6])

In [15]:
x**2

tensor([1, 4, 9])

In [16]:
x * y

tensor([ 4, 10, 18])

In [17]:
x**y

tensor([  1,  32, 729])

In [18]:
x.dot(y)

tensor(32)

In [19]:
# vector outer product
torch.ger(x,y)

tensor([[ 4,  5,  6],
        [ 8, 10, 12],
        [12, 15, 18]])

# Matrices
upper-case boldface letters: $\boldsymbol{X}\in \mathbb R^{m\times n}$

In [20]:
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
y = torch.tensor([[7, 8, 9], [1, 2, 3]])
x + y

tensor([[ 8, 10, 12],
        [ 5,  7,  9]])

In [21]:
# matrices' dim is always 2
x.dim()

2

In [22]:
x.shape

torch.Size([2, 3])

In [23]:
x.numel()

6

In [24]:
x*y

tensor([[ 7, 16, 27],
        [ 4, 10, 18]])

In [25]:
x.t()

tensor([[1, 4],
        [2, 5],
        [3, 6]])

In [26]:
x.mm(y.t())

tensor([[ 50,  14],
        [122,  32]])

# Tensors(附data_types)
upper-case Euler script letters: $\mathscr{X}\in \mathbb{R}^{a_1 \times \cdots \times a_n}$

In [27]:
# 3 layers, each layer the dim is (2*4)
torch.rand(3,2,4)

tensor([[[0.8257, 0.1172, 0.8344, 0.4760],
         [0.0617, 0.5030, 0.8404, 0.8235]],

        [[0.1758, 0.4977, 0.4014, 0.7472],
         [0.8146, 0.0580, 0.2153, 0.2999]],

        [[0.9888, 0.5547, 0.1264, 0.8322],
         [0.5742, 0.6390, 0.6083, 0.1054]]])

In [28]:
# uninitialized
torch.empty(2,3)

tensor([[0.0000e+00, 2.5244e-29, 0.0000e+00],
        [2.5244e-29, 5.6052e-45, 0.0000e+00]])

In [29]:
torch.zeros(2,3)

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [30]:
torch.ones(2,3)

tensor([[1., 1., 1.],
        [1., 1., 1.]])

In [31]:
# identity tensor
torch.eye(3,3)

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

In [32]:
torch.full((2,3),7)

tensor([[7., 7., 7.],
        [7., 7., 7.]])

In [33]:
# randomly initialized from [0,1]
torch.rand(2,3)

tensor([[0.5239, 0.2892, 0.9076],
        [0.9661, 0.8985, 0.0231]])

In [34]:
# randomly initialized from N(0,1)
torch.randn(2,3)

tensor([[ 0.2375, -0.3195, -0.4513],
        [-1.2769,  2.3078,  0.1242]])

In [35]:
# the low can be reach, but the high cannot
torch.randint(low=0,high=10,size=(2,3))

tensor([[7, 0, 2],
        [0, 8, 5]])

In [36]:
# random permutation of ints from 0 to n-1
torch.randperm(n=5)

tensor([4, 2, 0, 1, 3])

![title](1.png)

# Numpy Bridge

## Numpy to PyTorch

In [232]:
import numpy as np
n = np.ones(5)
torch.from_numpy(n)

tensor([1., 1., 1., 1., 1.], dtype=torch.float64)

In [234]:
n = np.ones(5)
x=torch.from_numpy(n)
# still point to the same memory!
np.add(n, 1, out=n)
x

tensor([2., 2., 2., 2., 2.], dtype=torch.float64)

## PyTorch to Numpy

In [39]:
x = torch.ones(5)
x.numpy()

array([1., 1., 1., 1., 1.], dtype=float32)

In [40]:
x = torch.ones(5)
n = x.numpy()
# still pointing to the same memory!
x.add_(1)
n

array([2., 2., 2., 2., 2.], dtype=float32)

# Basic Operations
As with scalars, vectors and matrices, we can perform element-wise operations:  +  -  *  **  /

In [41]:
x = torch.rand(2,3,2)
y = torch.rand(2,3,2)
x + y

tensor([[[1.6540, 0.8720],
         [0.7464, 0.6264],
         [0.9001, 1.2874]],

        [[1.2638, 1.6145],
         [0.6562, 0.8704],
         [0.4915, 1.3747]]])

Many operations have aliases(同名物) and in-place(原地) equivalents

In [42]:
z = torch.add(x,y)
# in-place: no extra memory allocation
x.add_(y)
# equal up to predefined tolerance
x.allclose(z)

True

# Tensor Contraction(张量收缩)
Generalization of vector-vector, matrix-vector, matrix-matrix product etc.<br>
$\mathscr{X}\in \mathbb R^{a_1 \times a_2 \times\cdots\times a_{n-1} \times a_n}$<br>
$\mathscr{Y}\in \mathbb R^{a_n \times a_{n+1} \times\cdots\times a_{n+m}}$<br>
$\mathscr{Z} = \mathscr{X} \times \mathscr{Y}\in \mathbb R^{a_1 \times a_2\times\cdots a_{n-1} \times a_{n+1}\times \cdots\times a_{n+m}}$<br>
For high-oder tensors @ is a batch-matrix multiplication (see 'einsum' for actual tensor contraction)!

In [43]:
x = torch.rand(2,3)
y = torch.rand(3)
x @ y

tensor([0.6339, 0.3071])

In [44]:
x = torch.rand(2,4)
y = torch.rand(4,3)
x @ y
# torch.mv(x,y) can calculate the case that x = torch.rand(2,4), y =torch.rand(4)

tensor([[1.1378, 0.5528, 0.9696],
        [1.1279, 1.0920, 1.1578]])

In [45]:
x = torch.rand(2, 3, 4)
y = torch.rand(4, 2)
x @ y
# 可以看出,x的2层2维矩阵和y的二维矩阵分别作了矩阵乘法，所以张量乘法事实上还是转化成了矩阵乘法

tensor([[[1.0894, 1.0698],
         [0.5501, 0.5071],
         [1.1137, 1.1044]],

        [[1.6371, 1.5839],
         [0.8233, 1.1714],
         [1.3226, 1.6934]]])

# Transpose

In [46]:
x = torch.arange(0, 6).view(2, 3)
# arange是按行先排列的
x.t()

tensor([[0, 3],
        [1, 4],
        [2, 5]])

In [47]:
# in low version, there is some mistake about allclose funtion
x.t().allclose(x.transpose(0, 1))

True

In [48]:
x.transpose(0, 1).allclose(x.transpose(1, 0))

True

In [49]:
x = torch.arange(0, 12).view(2, 3, 2)
x

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]]])

In [50]:
x.transpose(1,2)
# 两层不动，每一层代表的矩阵转置

tensor([[[ 0,  2,  4],
         [ 1,  3,  5]],

        [[ 6,  8, 10],
         [ 7,  9, 11]]])

# Sum, Min, Max, and Mean
Important: 关于那一维度操作,则是对那一维度变化，其他维度不变化的元素进行操作,求完之后进行squeeze

In [51]:
x=torch.arange(0,12).view(2,3,2)
x

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]]])

In [52]:
x.sum()

tensor(66)

In [53]:
x.sum(2)
# 按照第二维度求和，即按第二维度变动的元素求和，即按列求和(第0维为高,第一维为行,第二维为列)

tensor([[ 1,  5,  9],
        [13, 17, 21]])

In [54]:
x.min()

tensor(0)

In [55]:
# min(0),对应在每一层中找最小的,但min(1)表示每一列中找最小的，同时min(2)表示在每一行中找最小的
values, indices = x.min(1)
values

tensor([[0, 1],
        [6, 7]])

In [56]:
# indices(索引)
# min(0)对应indices(索引)表示的在每一层找的最小值的位置，min(1),min(2)以此类推
indices

tensor([[0, 0],
        [0, 0]])

In [57]:
x.max()

tensor(11)

In [58]:
values, indices = x.max(0)
values

tensor([[ 6,  7],
        [ 8,  9],
        [10, 11]])

In [59]:
# mean is not defined for Long tensors, so if you use x=x.long(), there will be a 
x = torch.arange(0,12,dtype=torch.float32).view(2,3,2)
x.mean()

tensor(5.5000)

In [60]:
# mean over last dimension, at first, x's dim is 2*3*2, after this step, dim becomes 2*3*1-squeeze-2*3
# x.mean(-1)=x.mean(2)
x.mean(-1)

tensor([[ 0.5000,  2.5000,  4.5000],
        [ 6.5000,  8.5000, 10.5000]])

# View & Reshape
view ans reshape share API<br>
view does not allocate new memory<br>
reshape works with non-contiguous(不连续的) tensors, but can copy memory<br>

In [61]:
# vector: [0,1,2,...,1]
x=torch.arange(0,12)
# viewed as a [2*3*2] tensor
x.view(2,3,2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]]])

In [62]:
# view as a [3*4] matrix
x.view(3,4)

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

In [63]:
# viewed as a [3*2*2] tensor via
# inferring one unspecified dimension
x.view(-1,2,2)

tensor([[[ 0,  1],
         [ 2,  3]],

        [[ 4,  5],
         [ 6,  7]],

        [[ 8,  9],
         [10, 11]]])

# Squeeze(压缩) & Unsqueeze(解压缩)
unsqueeze(i)使得原矩阵维度加1，比如原来是0维(1维)的，变成1维向量(2维矩阵),并且在第i个维度上填上1

In [64]:
x=torch.rand(3,1,2,1)
x

tensor([[[[0.4494],
          [0.4825]]],


        [[[0.0557],
          [0.7309]]],


        [[[0.8156],
          [0.0822]]]])

In [65]:
# squeeze function can remove all singleton dimensions
x.squeeze()

tensor([[0.4494, 0.4825],
        [0.0557, 0.7309],
        [0.8156, 0.0822]])

In [66]:
x=torch.rand(3)
x

tensor([0.1904, 0.9238, 0.8055])

In [67]:
# unsqueeze fucntion can expand it to 1*3 matrix / row-vector
x.unsqueeze(dim=0)

tensor([[0.1904, 0.9238, 0.8055]])

In [68]:
# unsqueeze in some case equiv.: expand using None indexing
y=x[None,:]
y

tensor([[0.1904, 0.9238, 0.8055]])

In [69]:
# equiv.: using view(here -1 can be infer by python, can is indeed 1)
z=x.view(-1,3)
y.allclose(z)

True

In [70]:
x=torch.arange(0,12).view(2,3,2)
y=torch.rand(6,2)

In [71]:
# x.view(y.size())=x.view_as(y)
x.view_as(y)

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11]])

In [72]:
x.view(y.size())

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11]])

# Expand & Repeat
expand and reshape share API<br>
reshape allocates new memory<br>
expand does not<br>
if possible, use expand but watch out for side effects<br>

In [73]:
x=torch.arange(0,3)
x=x.unsqueeze(1)
x

tensor([[0],
        [1],
        [2]])

In [74]:
#returns an expanded view of x, -1 is refered to be 3
x=x.expand(-1,4)
x

tensor([[0, 0, 0, 0],
        [1, 1, 1, 1],
        [2, 2, 2, 2]])

In [75]:
# 改一个expand后的矩阵的数字，相当于改一个expand之前的数字
x[0,1]=7
x

tensor([[7, 7, 7, 7],
        [1, 1, 1, 1],
        [2, 2, 2, 2]])

In [76]:
x=torch.arange(0,3)
x=x.unsqueeze(1)
x=x.repeat(1,4)
x

tensor([[0, 0, 0, 0],
        [1, 1, 1, 1],
        [2, 2, 2, 2]])

In [77]:
# 改一个repeat后的矩阵的数字,不相当于改一个repeat之前矩阵的数字
x[0,1]=7
x

tensor([[0, 7, 0, 0],
        [1, 1, 1, 1],
        [2, 2, 2, 2]])

In [78]:
x=torch.arange(0,6).unsqueeze(1)
y=torch.rand(6,3)
x.expand_as(y)

tensor([[0, 0, 0],
        [1, 1, 1],
        [2, 2, 2],
        [3, 3, 3],
        [4, 4, 4],
        [5, 5, 5]])

# Indexing & Advanced indexing

## Indexing

In [79]:
x=torch.arange(0,24).view(2,3,4)
x

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])

In [80]:
x[1]

tensor([[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]])

In [81]:
x[1,0]

tensor([12, 13, 14, 15])

In [82]:
x[1,0,3]

tensor(15)

## Advanced indexing

In [83]:
x

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])

In [84]:
# 1:表示下标1到这个这个维度的最后一个，1:-1表示下标1到这个维度的倒数第二个，最后一个到不了
x[1,1:,1:-1]

tensor([[17, 18],
        [21, 22]])

In [85]:
#x[:,indices]=x[:,indices,:], 但indices后面的:可以省略
#这里表示抽原来第一个维度为0,0,2,1的行排列起来
indices =torch.tensor([0,0,2,1])
x[:,indices]

tensor([[[ 0,  1,  2,  3],
         [ 0,  1,  2,  3],
         [ 8,  9, 10, 11],
         [ 4,  5,  6,  7]],

        [[12, 13, 14, 15],
         [12, 13, 14, 15],
         [20, 21, 22, 23],
         [16, 17, 18, 19]]])

# Gather
gather的两个输入x和index的矩阵大小必须一致

![title](2.png)

In [86]:
x=torch.tensor([[1,2],[3,4]])
index=torch.tensor([[0,0],[1,0]])
# [0,0]->[0,0],[0,1]->[0,0],[1,0]->[1,1],[1,1]->[1,0]
torch.gather(x,1,index)

tensor([[1, 1],
        [4, 3]])

In [87]:
x=torch.tensor([[1,2],[3,4]]) # equiv: y=torch.arange(1,5).view(2,2)
index=torch.tensor([[0,0],[1,0]])
# [0,0]->[0,0],[0,1]->[0,0],[1,0]->[1,0],[1,1]->[0,1]
torch.gather(x,0,index)

tensor([[1, 2],
        [3, 2]])

In [88]:
x=torch.arange(0,8).view(2,2,2)
x

tensor([[[0, 1],
         [2, 3]],

        [[4, 5],
         [6, 7]]])

In [89]:
index=torch.tensor([
    [[0,1],[1,0]],
    [[0,0],[1,0]]
])
# [0,0,0]->[0,0,0],[0,0,1]->[0,1,1],[0,1,0]->[0,1,0],[0,1,1]->[0,0,1]
torch.gather(x,1,index)

tensor([[[0, 3],
         [2, 1]],

        [[4, 5],
         [6, 5]]])

# Cat, Split, Chunk, and Stack

## Cat(catenate的缩写,表示连接)

In [90]:
x=torch.arange(0,6).view(3,2)
x

tensor([[0, 1],
        [2, 3],
        [4, 5]])

In [91]:
# dim=1表示cat函数将x按列排列起来
torch.cat([x,x],dim=1)

tensor([[0, 1, 0, 1],
        [2, 3, 2, 3],
        [4, 5, 4, 5]])

## Split(拆分)

In [92]:
x=torch.arange(0,24).view(4,6)
x

tensor([[ 0,  1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10, 11],
        [12, 13, 14, 15, 16, 17],
        [18, 19, 20, 21, 22, 23]])

In [93]:
# dim=1表示split函数将x按照一个个列看待，3表示3列为一个新的tensor
torch.split(x,3,dim=1)[1]

tensor([[ 3,  4,  5],
        [ 9, 10, 11],
        [15, 16, 17],
        [21, 22, 23]])

## Chunk(块)

In [94]:
# dim=1表示chunk函数将x按照一个个列看待，2表示分成2个新tensor
# 可以看出torch.chunk(x,2,dim=1)[1]=torch.split(x,3,dim=1)[1]
torch.chunk(x,3,dim=1)[1]

tensor([[ 2,  3],
        [ 8,  9],
        [14, 15],
        [20, 21]])

## Stack(叠)

In [95]:
x=torch.arange(0,6).view(2,3)
y=torch.arange(6,12).view(2,3)
# stack将tensor叠起来,几个叠起来就在最前面乘几,叠起来的东西必须维度一致
torch.stack([x,y,x])

tensor([[[ 0,  1,  2],
         [ 3,  4,  5]],

        [[ 6,  7,  8],
         [ 9, 10, 11]],

        [[ 0,  1,  2],
         [ 3,  4,  5]]])

In [96]:
A=torch.arange(20).view(5,4)
print(A)
b=torch.arange(4)
print(b)

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15],
        [16, 17, 18, 19]])
tensor([0, 1, 2, 3])


# Broadcasting
Two tensors are “broadcastable” if each tensor has at least one dimension and when iterating over the dimension sizes, starting at the trailing dimension, the dimension sizes must either be equal, one of them is 1, or one of them does not exist.

In [97]:
x = torch.arange(0,6).view(2,3)
y=torch.tensor([[-3],[3]])
print(x)
print(y)
# x and y are broadcastable
# 1st trailing dimension: y has size 1
# 2nd trailing dimension: both have size 2
x*y

tensor([[0, 1, 2],
        [3, 4, 5]])
tensor([[-3],
        [ 3]])


tensor([[ 0, -3, -6],
        [ 9, 12, 15]])

In [98]:
x=torch.empty(3,5,7,9)
y=torch.empty(3,5,7,9)
# same shape are always broadcastable

In [99]:
x = torch.empty((0,))
y=torch.empty(2,2)
# x and y are not broadcastable
# x dose not have at least one dimension

In [100]:
x = torch.empty(5,3,4,1)
y=torch.empty(    3,1,1)
# x and y are broadcastable
#这里说的第一个维度,是指最右边那个维度
# 1st trailing dimension: both have size 1
# 2nd trailing dimension: y has size 1
# 3rd trailing dimension: x size ==y size
# 4th trailing dimension: y dim doesn't exist

In [101]:
x=torch.empty(5,2,4,1)
y=torch.empty(  3,1,1)
# x and y are not broadcastable
# in the 3rd trailing dimension 2！=3

# Sparse Tensors

In [126]:
indices = torch.LongTensor([[2,4,7],[3,2,1]])
values=torch.FloatTensor([3,4,5])
# sparse, 利用两个tensor生成,其中一个表示值，另一个表示索引
x=torch.sparse.FloatTensor(
indices, values, torch.Size([10,100000000]))
print(x)
# dense
m = torch.rand(100000000,1)
# sparse.mm相当于普通的torch.mm。可以用来求梯度，但是会返回一个sparse的矩阵。
torch.sparse.mm(x,m)

tensor(indices=tensor([[2, 4, 7],
                       [3, 2, 1]]),
       values=tensor([3., 4., 5.]),
       size=(10, 100000000), nnz=3, layout=torch.sparse_coo)


tensor([[0.0000],
        [0.0000],
        [0.1181],
        [0.0000],
        [3.1860],
        [0.0000],
        [0.0000],
        [0.5532],
        [0.0000],
        [0.0000]])

In [131]:
indices=torch.LongTensor([[2,4,7],[3,2,1]])
values=torch.FloatTensor([3,4,5])
# sparse
sparse_x=torch.sparse.FloatTensor(
indices, values, torch.Size([10,100000000])
)
# dense
dense_x=torch.zeros(10,100000000)
dense_x[2,3]=3
dense_x[4,2]=4
dense_x[7,1]=5
# dense
m=torch.rand(100000000,1)

In [141]:
# 这在iPython中被称为线魔术。它们的独特之处在于它们的参数只延伸到当前行的末尾
# 并且魔法本身实际上是用于命令行开发的。  timeit用于计算代码的执行时间。
# 从下面的代码可以看出，torch.saprse.mm的运算速度要比torch.mm快很多

In [142]:
%%timeit
torch.mm(dense_x, m)

298 ms ± 7.94 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [143]:
%%timeit
torch.sparse.mm(sparse_x,m)

17.2 µs ± 146 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


# Einstein Summation Notation

In [148]:
x=torch.arange(6).reshape(2,3)
x

tensor([[0, 1, 2],
        [3, 4, 5]])

In [149]:
# matrix transpose
torch.einsum('ij->ji',[x])

tensor([[0, 3],
        [1, 4],
        [2, 5]])

In [150]:
#sum
torch.einsum('ij->',[x])

tensor(15)

In [156]:
#column sum
torch.einsum('ij->j',[x])

tensor([3, 5, 7])

In [157]:
# row sum
torch.einsum('ij->i',[x])

tensor([ 3, 12])

In [161]:
# matrix-vector multiplication
y=torch.arange(3)
print(x)
print(y)
torch.einsum('ik,k->i',[x,y])

tensor([[0, 1, 2],
        [3, 4, 5]])
tensor([0, 1, 2])


tensor([ 5, 14])

In [162]:
# matrix-matrix multiplication
x=torch.arange(6).reshape(2,3)
y=torch.arange(15).reshape(3,5)
torch.einsum('ik,kj->ij',[x,y])

tensor([[ 25,  28,  31,  34,  37],
        [ 70,  82,  94, 106, 118]])

In [165]:
# vector dot product
x=torch.arange(3)
y=torch.arange(3,6)
print(x)
print(y)
torch.einsum('i,i->',[x,y])

tensor([0, 1, 2])
tensor([3, 4, 5])


tensor(14)

In [166]:
# matrix dot prodcut
x=torch.arange(6).reshape(2,3)
y=torch.arange(6,12).reshape(2,3)
print(x)
print(y)
torch.einsum('ij,ij->',[x,y])

tensor([[0, 1, 2],
        [3, 4, 5]])
tensor([[ 6,  7,  8],
        [ 9, 10, 11]])


tensor(145)

In [168]:
# Hadamard Product
x=torch.arange(6).reshape(2,3)
y=torch.arange(6,12).reshape(2,3)
print(x)
print(y)
torch.einsum('ij,ij->ij',[x,y])

tensor([[0, 1, 2],
        [3, 4, 5]])
tensor([[ 6,  7,  8],
        [ 9, 10, 11]])


tensor([[ 0,  7, 16],
        [27, 40, 55]])

In [170]:
# vector outer product(向量外积)
x=torch.arange(3)
y=torch.arange(3,7)
print(x)
print(y)
torch.einsum('i,j->ij',[x,y])

tensor([0, 1, 2])
tensor([3, 4, 5, 6])


tensor([[ 0,  0,  0,  0],
        [ 3,  4,  5,  6],
        [ 6,  8, 10, 12]])

In [184]:
# matrix diagonal
x=torch.arange(0,16).view(4,4)
print(x)
torch.einsum('ii->i',[x])

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15]])


tensor([ 0,  5, 10, 15])

In [174]:
# batch matrix-multiplication
x=torch.randn(3,2,5)
y=torch.randn(3,5,3)
#第i层的两个矩阵做普通矩阵乘法
torch.einsum('ijk,ikl->ijl',[x,y])

tensor([[[-2.2339,  0.2009, -0.6290],
         [-4.6447, -4.2725, -0.0324]],

        [[-1.5242,  2.6211,  0.6357],
         [-0.4332, -1.0161, -2.9737]],

        [[-0.9112,  0.5484, -1.3976],
         [-2.1933, -0.7030, -3.6636]]])

In [177]:
# tensor contraction(张量收缩)
x=torch.randn(2,3,5,7)
y=torch.randn(11,13,3,17,5)
torch.einsum('pqrs,tuqvr->pstuv',[x,y]).shape

torch.Size([2, 7, 11, 13, 17])

In [181]:
#bilinear transformation
x=torch.randn(2,3)
y=torch.randn(4,3,7)
z=torch.randn(2,7)
#k,l只在左边出现,没在右边出现,所以运算过程中会被求和掉，最后是一个2*4的矩阵
torch.einsum('ik,jkl,il->ij',[x,y,z])

tensor([[-0.9392,  1.3396, -2.0761,  4.5961],
        [-4.0851, -3.0293, -2.3530, -3.8772]])

# PyTorch Variables

In [103]:
# variable
x = torch.tensor([-1.5, 1.2], requires_grad=True)
# constant
y = torch.tensor([1.0, -1.3])
# variable
z =torch.tensor([-2.0, 0.2], requires_grad=True)
# output tensor value, but also computation graph
x*y@z

tensor(2.6880, grad_fn=<DotBackward>)

# Gradients

In [104]:
x=torch.tensor([-1.5, 1.2], requires_grad=True)
y = torch.tensor([1.0, -1.3])
z =torch.tensor([-2.0, 0.2], requires_grad=True)
r=x*y@z
# 只要计算r的变量有一个是varibale,即requires_grad=True,则r作为一个tensor带有backward()方法
print(r)
# 在r没有执行backward()方法之前,x.grad没有被赋值
print(x.grad)
r.backward()
x.grad
# Torch的一大优势就是可以自动求梯度，甚至有一种观点认为Torch就是带有自动求梯度的numpy

tensor(2.6880, grad_fn=<DotBackward>)
None


tensor([-2.0000, -0.2600])

# Backpropagation

In [105]:
torch.manual_seed(7)
a= torch.rand(1,requires_grad=True)
b=torch.rand(1,requires_grad=True)
c=torch.rand(1,requires_grad=True)
d=torch.rand(1,requires_grad=True)
e=torch.rand(1,requires_grad=True)
# compare to: f=c*a*b+d*a*b, 结论:没有区别
f=c*a*b+d*a*b
f.backward()
print(c.grad)
print(e.grad)
c.grad.zero_() # c.grad.zero_()的解释见下
e=a*b
f=c*e+d*e
f.backward()  
print(c.grad)
print(e.grad)
#注意,e是中间产物,相当于神经网络里面每一层的output,它是不能被求梯度，也是不需要被求梯度

tensor([0.1063])
None
tensor([0.1063])
None


In [106]:
torch.manual_seed(7)
a= torch.rand(1,requires_grad=True)
b=torch.rand(1,requires_grad=True)
c=torch.rand(1,requires_grad=True)
d=torch.rand(1,requires_grad=True)
f=c*a*b+d*a*b
f.backward()
print(c.grad)
e=a*b
f=c*e+d*e
f.backward()
print(c.grad)
# 反向传播的时候,torch自动反向传播,得到的c.grad会被存下来,下次再反向传播,会被叠加上去,所以要进行清零：c.grad.zero_()

tensor([0.1063])
tensor([0.2127])


# PyTorch Autograd Function
参考资料:https://blog.csdn.net/tsq292978891/article/details/79364140

In [317]:
# 这是课上pdf上关于PyTorch Autograd Function的说明的改进版,函数用的是cos(x)而不用e^x

from torch.autograd import Function 
# Function源码见:https://github.com/0pytorch/pytorch/blob/master/torch/autograd/function.py

class MyExp(Function):    # MyExp类继承Function类
    @staticmethod         # 装饰器@staticmethod定义的是一种静态方法
    def forward(ctx, x):  # x,ctx也可以改成其他变量名，只不过这个ctx和下面backward的ctx占用同一片内存
        print(ctx)        # ctx是apply
        result = torch.cos(x)  # result的值是下一结点y的值
        ctx.save_for_backward(result, x)  # ctx存的值,会传到backward中去,对指数函数,tanh,sigmoid,函数值可以用来求导
                                          # 所以对这些函数,存入ctx的往往是result值,当然这里不是这种情况
        return result             # 返回的result值存入y中

    @staticmethod
    def backward(ctx, grad_output):  # 这里的grad_outputs是存进来的upsteram_gradient
        print("grad_output is",grad_output) # upstream_gradient默认值为tensor([1]),这可能与MyExp继承自Function有关
        result, x = ctx.saved_tensors  # 注意ctx.saved_tensors是一个list,所以如果左端只有一个result接收值,
                                       # 就需要左端写result, 来使result接受list中的元素值
        return -torch.sin(x) * grad_output  # 这里的-torch.sin(x)表示的是local_gradient


x = torch.tensor([0.], requires_grad=True)
y = MyExp.apply(x)  # 这里的apply相当于说启动MyExp中的forward方法,
                    # forward的非ctx参数是x,forward方法返回的值存入y中
print("y is",y)

y.backward()  # 这里的backward相当于说启动MyExp中的backward方法,
              # backward的非ctx参数是upstream_gradient
              # 如果backward方法未设置参数,则upstream_gradient默认值为1,如果设置了参数,则按照参数来
              # 注意,backward方法的返回值不是返回到y,而是返回到x.grad
x.grad        # x.grad的返回值是上面backward方法中的返回值

<torch.autograd.function.MyExpBackward object at 0x12ea93828>
y is tensor([1.], grad_fn=<MyExpBackward>)
grad_output is tensor([1.])


tensor([-0.])

# PyTorch Modules and examples

*所有网络成员都应该从nn.Module上继承并且重载forward方法<br>
*使用一个module提供了函数性:<br>
    *训练变量可追踪<br>
    *让你能能轻松在CPU和GPU之间跳转(见.to(device)方法)<br>
*要将一个可变tensor注册到一个module的参数中去,你需要用nn.Parameter去封装它<br>

In [344]:
# Linear Model Example
import torch.nn as nn # nn.Module是所有神经网络单元(neural network modules)的基类
class LinearModule(torch.nn.Module): # LinearModule继承了nn.Module
    def __init__(self, x_dim, y_dim):
        super(LinearModule, self).__init__() # 继承父类的构造函数,可以利用父类的变量了,LinearModule,slef可以省略
#         torch.nn.Module.__init__(self)       # super().__init__()相当于torch.nn.Module.__init__(self)
        self.W=nn.Parameter(torch.randn(y_dim,x_dim,requires_grad=True)) # 
        self.b=nn.Parameter(torch.randn(y_dim),requires_grad=True)
    def forward(self,x):
        return self.W@x+self.b 
# Some random input and output data
# x = torch.randn(5)
# y = torch.randn(2)

model=LinearModule(5,2)   # model是LinearModule的一个实例,实际上是调用了__init__方法(一种magic方法)

for param in model.parameters():  # model.parameters是参数
    print(param.size())
model(x)                  # 虽然model是一个实例,但这里实际上调用了__call__方法(一种magic方法)

torch.Size([2, 5])
torch.Size([2])


tensor([ 2.0095, -0.0135], grad_fn=<AddBackward0>)

# Function与Module的差异与应用场景
参考资料:https://blog.csdn.net/mdjxy63/article/details/79474966<br>
Function与Module都可以对pytorch进行自定义拓展，使其满足网络的需求，但这两者还是有十分重要的不同：<br>
Function一般只定义一个操作,因为其无法保存参数,因此适用于激活函数、pooling等操作;<br>
Module是保存了参数，因此适合于定义一层，如线性层，卷积层，也适用于定义一个网络<br>
Function需要定义三个方法：\_\_init\_\_,forward,backward(需要自己写求导公式);<br>
Module：只需定义\_\_init\_\_和forward,而backward的计算由自动求导机制构成<br>
可以不严谨的认为，Module是由一系列Function组成，因此其在forward的过程中，Function和Variable组成了计算图,在backward时,只需调用Function的backward就得到结果,因此Module不需要再定义backward.<br>
Module不仅包括了Function,还包括了对应的参数,以及其他函数与变量,这是Function所不具备的

# Gradient Checking
see torch.autograd.gradcheck

我们如何确定我们的反向传播正确实现了呢？用"有限差分近似"<br>
二阶中心差商(一维输出情形):
$$
\frac { \partial f ( \mathbf { x } ) } { \partial \mathbf { x } } \approx \frac { 1 } { 2 \epsilon } ( f ( \mathbf { x } + \epsilon ) -  f ( \mathbf { x } - \epsilon ) )
$$
多维输出情形:
$$
\boldsymbol { d } ^ { \top } \nabla f ( \boldsymbol { x } ) \approx \frac { 1 } { 2 \varepsilon } ( f ( \boldsymbol { x } + \varepsilon \cdot \boldsymbol { d } ) - f ( \boldsymbol { x } - \varepsilon \cdot \boldsymbol { d } ) )
$$
where, $d \in \mathbb { R } ^ { n }$ 是任意一个方向的向量,测试的时候,会多次尝试各种向量方向，有时候选择n个标准基向量,不过随机选择方向也足够好.注意一定要用中心差商.<br>
为什么不直接用有限差商代替梯度？<br>
对低维函数而言,我们可以用有限差商代替梯度.它也足够精确.所以我们不选用有限差商的原因不是因为它足够精确,而是它足够高效,因为有限差商的时间复杂度是自动求梯度的时间复杂度的n倍.
详细讨论见:https://timvieira.github.io/blog/post/2017/04/21/how-to-test-gradient-implementations/

# \_\_init\_\_方法
参考资料:<br>
https://blog.csdn.net/hellocsz/article/details/82795514<br>
https://www.cnblogs.com/insane-Mr-Li/p/9758776.html<br>
既然__init__方法也是类的一个方法,那用和不用__init__方法有什么区别呢?

In [321]:
# 不用__init__方法定义类
class Rectangle():
    def getPeri(self, a, b):

        return (a + b) * 2

    def getArea(self, a, b):

        return a * b


rect = Rectangle()

print(rect.getPeri(3, 4))

print(rect.getArea(3, 4))

print(rect.__dict__)  # 可以用__dict__查看实例的属性

#可以看出,不用__init__方法定义的类构造的实例是没有属性的,这个实例只象征能方法的集合.

14
12
{}


In [322]:
# 用__init__方法定义类
class Rectangle():
    def __init__(self, a, b):

        self.a = a

        self.b = b

    def getPeri(self):

        return (self.a + self.b) * 2

    def getArea(self):

        return self.a * self.b


rect = Rectangle(3, 4)

print(rect.getPeri())

print(rect.getArea())

print(rect.__dict__)

#可以看出,不用__init__方法定义的类构造的实例是有属性的,而且可以对自身属性做运算.

14
12
{'a': 3, 'b': 4}


# Python中的\*args and **Kargs
在方法的参数个数不固定的情况下,我们需要让我们的方法去调用可变长的参数(这样的方法泛用性好),这时就需要引入\*args和**Kargs<br>
参考资料:<br>
https://www.cnblogs.com/chaojiyingxiong/p/9223754.html<br>
https://www.cnblogs.com/yunguoxiaoqiao/p/7626992.html

In [311]:
# python中规定参数前带*的,称为可变位置参数,一般用*args来表示
# *args用来将参数打包成tuple给函数体(方法)调用
def Jiafa(*args):
    sum = 0
    for i in args:
        sum = sum + i
    print(sum)
Jiafa(1, 3, 5)
Jiafa(2, 4, 6, 8, )

9
20


In [312]:
# python中规定参数前带**的,称为可变关键字参数,一般用**Kargs来表示
# **kwargs 打包关键字参数成dict给函数体(方法)调用
def zidian(**kwargs):

    print(kwargs)

zidian(a=1,b=2,c =3)
zidian(a=1,b=2,c =3 ,d =4)

{'a': 1, 'b': 2, 'c': 3}
{'a': 1, 'b': 2, 'c': 3, 'd': 4}


# instance method,class method,static method
是否与类或者实例进行绑定，这就是实例方法，类方法，静态方法的区别。(详细讨论见参考资料)<br>
参考资料:https://blog.csdn.net/lihao21/article/details/79762681<br>
下面是一些例子

## instance method(实例方法)

In [304]:
class Kls(object):
    def __init__(self, data):
        self.data = data

    def printd(self):
        print(self.data)


ik1 = Kls('leo')
ik2 = Kls('lee')

ik1.printd()
ik2.printd()
#调用实例方法的时候,实例会作为参数传入方法

leo
lee


## class method(类方法)

In [307]:
class Kls(object):
    num_inst = 0

    def __init__(self):
        Kls.num_inst = Kls.num_inst + 1

    @classmethod       # 类方法修饰器
    def get_no_of_instance(cls):
        return cls.num_inst


ik1 = Kls()
ik2 = Kls()

print(ik1.get_no_of_instance())
print(Kls.get_no_of_instance())
# 调用实例方法的时候,类会作为参数传入方法,所以可以用类调用类方法,也可以用实例调用类方法

2
2


## static method(静态方法)

In [308]:
IND = 'ON'


class Kls(object):
    def __init__(self, data):
        self.data = data

    @staticmethod
    def checkind():
        return IND == 'ON'

    def do_reset(self):
        if self.checkind():
            print('Reset done for: %s' % self.data)

    def set_db(self):
        if self.checkind():
            print('DB connection made for: %s' % self.data)


ik1 = Kls(24)
ik1.do_reset()
ik1.set_db()
# 调用静态方法,不需要将实例或者类作为参数传入静态方法,所以可以认为静态方法是个纯函数,输入的变量就是你给的那些变量.
# 因为静态方法不需要将实例或者类作为参数传入,所以实例和类都可以调用静态方法
# 静态方法很像是在实例和类外面定义的函数,只是我们可以通过实例和类来使用这个函数而已

Reset done for: 24
DB connection made for: 24


# 类的继承,重写,super函数

## Object类-新式类，经典类
*新式类是指继承object类的类<br>
*经典类是指没有继承object类的类<br>
参考：<br>
https://www.cnblogs.com/attitudeY/p/6789370.html<br>
http://www.runoob.com/note/28629<br>
为什么要引入新式类呢？(因为使用经典类在多继承问题上会有bug),具体地:
![title](3.png)
BC 为A的子类,D为BC的子类,A中有save方法,C对其进行了重写<br>
在经典类中 调用D的save方法 搜索按深度优先 路径B-A-C， 执行的为A中save 显然不合理<br>
在新式类的 调用D的save方法 搜索按广度优先 路径B-C-A， 执行的为C中save<br>
但是,事实上,上面的问题只出现在python2.7中<br>
在python3中,新式类已经兼容了经典类了,即无论你是否继承object类,都是使用广度优先搜索,执行的是C中的save<br>
下面见实例:

In [284]:
# 实例
# 经典类
class A:
    def __init__(self):
        print('this is A')
    def save(self):
        print('Come from A')

class B(A):    # B继承类A,并且没有重写save方法
    def __init__(self):
        print('this is B')

class C(A):    # C继承类A,并且重写save方法
    def __init__(self):
        print('this is C')
    def save(self):
        print('Come from C')
        
class D(B,C):
    def __init__(self):
        print('this is D')
d1=D()      # 用类D初始化一个对象d1
d1.save()   # 若是python2.7的版本,这时候输出的是Come from A

this is D
Come from C


In [290]:
#新式类
class A(object):
    def __init__(self):
        print ('this is A')

    def save(self):
        print ('come from A')

class B(A):
    def __init__(self):
        print ('this is B')

class C(A):
    def __init__(self):
        print ('this is C')
    def save(self):
        print ('come from C')

class D(B,C):
    def __init__(self):
        print ('this is D')

d1=D()
d1.save()   

this is D
come from C


## 类的继承,类的重写

In [295]:
# 这个例子既包含类的普通继承,类的结构函数继承,又包含类的重写
class FooParent(object):        # FooParent(父类),object可以省略
    def __init__(self):
        self.parent = 'I\'m the parent.'
        print ('Parent')
    
    def bar(self,message):
        print ("%s from Parent" % message)
 
class FooChild(FooParent):     # FooChild(子类)继承FooParent(父类)
    def __init__(self):        # self是当前类的实例,为了方便在类定义中调用其自己类对应的值和函数,详见知乎专栏
        super(FooChild,self).__init__() # 调用父类,第一个括号的参数中的参数可以省略(python3版本下)
#         FooParent.__init__(self)     # 这行代码可以替换super(FooChild,self).__init__(),功能完全一样
        print ('Child')
        
    def bar(self,message):      #
        super().bar(message)            # 省略了第一个括号下的参数
        print ('Child bar fuction')
        print (self.parent)
 
if __name__ == '__main__':  # 只在运行本程序是调用下面的内容,在被当成库函数导入其他地方时不调用下面内容,详见知乎专栏-pyhton学习
    fooChild = FooChild()   
    fooChild.bar('HelloWorld')
#生成结果的过程:fooChild对象被初始化,由super所以调用了父类初始化函数,为self.parent赋值"I'm the parent"
#打印了Parent,接下来调用子类初始化函数打印了Child,接着fooChild传入参数HelloWorld并调用子类bar函数
#又由super所以传入HelloWorld调用了父类bar函数,打印了”HelloWorld from Parent“,继续往下,打印了“Child bar funciton”
#最后打印self.parent"I'm the parent"

Parent
Child
HelloWorld from Parent
Child bar fuction
I'm the parent.


In [289]:
# 这个例子只包含类的普通继承
class FooParent(object):        # FooParent(父类)
    def __init__(self):
        self.parent = 'I\'m the parent.' # 父类参数初始化
        print ('Parent')
    
    def parent_bar(self,message):  # 父类方法
        print ("%s from Parent" % message)
 
class FooChild(FooParent):     # FooChild(子类)继承FooParent(父类)
    def __init__(self):        # 专业术语:构造函数(初始化方法)
#         super(FooChild,self).__init__() # 只能通过super调用父类初始化方法
#         print (self.parent)   # 若没有申明调用父类初始化方法,则无法使用父类变量,即继承并没有将父类变量继承过来   
        print ('Child')
        
    def child_bar(self,message):      # 子类方法
        print ('Child bar fuction')
 
if __name__ == '__main__':  
    fooChild = FooChild()
    fooChild.parent_bar('HelloWorld')
    fooChild.child_bar('HelloWorld')
# 注意这里和上面不一样的一点是,上面定义和父类方法和子类方法是同名的,这时候,python默认先去调用子类的那个同名方法,
# 而父类的那个同名方法则无效,这就是类的重构,所以这时只能通过super来调用父类方法
# 而这是这两个方法通过直接继承的途径，直接调用父类.
# 虽然父类方法可以通过继承的方式被调用,但父类的初始化参数没有因为继承而直接被调用(原因是父类的初始化方法__init__根本没有启动)

Child
HelloWorld from Parent
Child bar fuction


#  Loss Functions:$\quad$ $L(f_{\theta}(x),y)$
* Least squares[nn.MSELoss]
$$\frac{1}{n}(f_{\theta}(x)-y)^2$$
$$\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
    l_n = \left( x_n - y_n \right)^2$$
$$  \ell(x, y) =
    \begin{cases}
        \operatorname{mean}(L), & \text{if}\; \text{reduction} = \text{True},\\
        \operatorname{sum}(L),  & \text{if}\; \text{reduction} = \text{False}.
    \end{cases} $$
* Logistic[nn.SoftMarginLoss]
$$log(1+exp(-yf_{\theta}(x))) $$
$$\text{loss}(x, y) = \sum_i \frac{\log(1 + \exp(-y[i]*x[i]))}{\text{x.nelement}()}$$
* Hinge loss[nn.MultiMarginLoss/nn.MultiLabelMarginLoss]
$$max(0,1-yf_{\theta}(x)) $$
$$\text{loss}(x, y) = \sum_{ij}\frac{\max(0, 1 - (x[y[j]] - x[i]))}{\text{x.size}(0)}$$
* Cross-entropy[nn.CrossEntropyLoss]
$$-[ylog(f_{\theta}(x))-(1-y)log(1-f_{\theta}(x))]   $$
$$\text{loss}(x, class) = -\log\left(\frac{\exp(x[class])}{\sum_j \exp(x[j])}\right)
                   = -x[class] + \log\left(\sum_j \exp(x[j])\right)$$
$$\text{loss}(x, class) = weight[class] \left(-x[class] + \log\left(\sum_j \exp(x[j])\right)\right)$$
* ……and many more: https://pytorch.org/docs/stable/nn.html#loss-functions

In [403]:
# Least squares[nn.MSELoss]

torch.manual_seed(1)

# reduction="mean"

input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5)
loss=nn.MSELoss(reduction='mean')    
print(sum(sum((input-target)*(input-target)/15)))
output = loss(input, target)
print(output)

# reduction="sum"
loss=nn.MSELoss(reduction='sum')    
print(torch.sum((input-target)*(input-target)))
output = loss(input, target)
print(output)

tensor(2.3955, grad_fn=<AddBackward0>)
tensor(2.3955, grad_fn=<MseLossBackward>)
tensor(35.9326, grad_fn=<SumBackward0>)
tensor(35.9326, grad_fn=<MseLossBackward>)


In [402]:
#  Logistic[nn.SoftMarginLoss]

torch.manual_seed(1)

# reduction="mean"

input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5)
loss=nn.SoftMarginLoss(reduction='mean')
# print(sum(sum((input-target)*(input-target)/15)))
print(sum(sum(torch.log(1+torch.exp(-input*target))))/15)  # sum默认是求和一个维度
output = loss(input, target)
print(output)
output.backward()

input.grad.zero_()    # 不进行这个步骤,后面得到的input.grad会叠加上这里得到的

# reduction="sum"

loss=nn.SoftMarginLoss(reduction='sum')
print(torch.sum(torch.log(1+torch.exp(-input*target))))   # torch.sum默认是求和所有
output = loss(input, target)
print(output)
output.backward()

input.grad

tensor(0.7085, grad_fn=<DivBackward0>)
tensor(0.7085, grad_fn=<SoftMarginLossBackward>)
tensor(10.6279, grad_fn=<SumBackward0>)
tensor(10.6279, grad_fn=<SoftMarginLossBackward>)


tensor([[-0.1219,  0.2224,  0.2594,  1.1426,  0.0599],
        [-2.3126,  0.1061, -0.5628,  0.2709, -0.1776],
        [ 0.1891, -0.7172, -1.5706, -0.2456,  0.5956]])

In [431]:
# Hinge loss[nn.MultiMarginLoss/nn.MultiLabelMarginLoss]
# reduction属性也有mean和sum,和上面类似,这里不再重复写了
input = torch.randn(3, 5, requires_grad=True)
# target = torch.randn(3, 5)   这是分类问题,不应该用随机生成float类型,而应该用randint
target=torch.randint(0,2,(3,5))
print(target)
loss=nn.MultiLabelMarginLoss()
output = loss(input, target)
print(output)

tensor([[0, 1, 1, 0, 0],
        [1, 0, 1, 1, 1],
        [1, 0, 0, 1, 1]])
tensor(2.5594, grad_fn=<MultilabelMarginLossBackward>)


In [432]:
# Cross-entropy[nn.CrossEntropyLoss]
# reduction属性也有mean和sum,和上面类似,这里不再重复写了
input = torch.randn(3, 5, requires_grad=True)
# target = torch.randn(3, 5)   这是分类问题,不应该用随机生成float类型,而应该用randint
target = torch.empty(3, dtype=torch.long).random_(5)
print(target)
loss=nn.CrossEntropyLoss()
output = loss(input, target)
print(output)

tensor([2, 0, 4])
tensor(1.7507, grad_fn=<NllLossBackward>)


In [7]:
a=torch.rand(3,4)
print(a)
a.sum(0)

tensor([[0.7653, 0.2033, 0.4984, 0.2717],
        [0.3400, 0.4109, 0.2760, 0.0834],
        [0.6182, 0.2758, 0.8907, 0.3001]])


tensor([1.7235, 0.8901, 1.6652, 0.6551])