# softmax函数解析

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import numpy as np

## 从tensor 和 Tensor 开始

### torch.Tensor 默认生成 float32 类型的张量，而 torch.tensor 会根据输入数据推断类型，整数输入需要显式指定浮点 dtype

In [16]:
# Integer NumPy input; the legacy torch.Tensor constructor turns it into a float tensor.
num1 = np.array([1, 2, 3])
exp1 = torch.Tensor(num1)
# Softmax along the only axis of the 1-D tensor.
torch.softmax(exp1, dim=0)

tensor([0.0900, 0.2447, 0.6652])

In [17]:
# The dtype is passed explicitly so the integer literals are stored as floats.
exp1 = torch.tensor([1, 2, 3], dtype=torch.float32)
torch.softmax(exp1, dim=0)

tensor([0.0900, 0.2447, 0.6652])

In [12]:
# Element-wise sigmoid of the same vector, for comparison with softmax.
torch.sigmoid(exp1)



tensor([0.7311, 0.8808, 0.9526])

## 手动算softmax函数

In [77]:
# Softmax by hand: exponentiate every entry, then divide by the sum of the exponentials.
np.exp(num1) / np.exp(num1).sum()

array([0.09003057, 0.24472847, 0.66524096])

In [97]:
# Module form of softmax; the result should match the values computed above.
softmax = torch.nn.Softmax(dim=0)
y = softmax(exp1)
y

tensor([0.0900, 0.2447, 0.6652])

## logsoftmax函数

In [105]:
# Log-softmax along axis 0 of the 1-D vector.
logsoftmax = torch.nn.LogSoftmax(dim=0)
y1 = logsoftmax(exp1)
y1

tensor([-2.4076, -1.4076, -0.4076])

## 负对数似然函数

### 负对数似然（NLLLoss）就是取出a中对应target位置上的值，并取负号；比如target1 = 0，那么就取a[0][0]再乘以-1

In [108]:
# One sample with three class scores, fed to NLLLoss as-is.
a = torch.tensor([[1.0, 2.0, 3.0]])
nll = nn.NLLLoss()

# Targets are class indices stored as int64 (long) tensors, one per class.
target1, target2, target3 = (torch.tensor([k], dtype=torch.long) for k in range(3))

# Each loss is the negated entry of `a` at the target column.
n1 = nll(a, target1)  # tensor(-1.)
n2 = nll(a, target2)  # tensor(-2.)
n3 = nll(a, target3)  # tensor(-3.)

In [109]:
# Show the three NLL values computed in the previous cell side by side.
print(n1, n2, n3)

tensor(-1.) tensor(-2.) tensor(-3.)


In [119]:
# Check that CrossEntropyLoss on raw scores equals LogSoftmax followed by NLLLoss.
a = torch.tensor([[1.0, 2.0, 3.0]])
target = torch.tensor([2], dtype=torch.long)
# dim=1 normalises across the class axis of the (batch, classes) input.
# Leaving dim unset relies on a deprecated implicit choice and raises a UserWarning.
logsoftmax = nn.LogSoftmax(dim=1)
nll = nn.NLLLoss()
ce = nn.CrossEntropyLoss()

# CrossEntropyLoss applied directly to the scores.
cel = ce(a, target)
print(cel)
# prints: tensor(0.4076)

# LogSoftmax + NLLLoss applied in two explicit steps.
lsm_a = logsoftmax(a)
nll_lsm_a = nll(lsm_a, target)
print(nll_lsm_a)
# prints: tensor(0.4076)

tensor(0.4076)
tensor(0.4076)


  del sys.path[0]


## 总结

nn.LogSoftmax是在softmax的基础上取自然对数

nn.NLLLoss是负对数似然损失，但PyTorch的实现只是把输入中对应target位置上的数取出来再加个负号（本身不做log运算），因此需要先经过LogSoftmax再送入NLLLoss；LogSoftmax + NLLLoss 的组合等价于 nn.CrossEntropyLoss;

BCELoss是二分类的交叉熵损失，Pytorch实现中和多分类有区别;

https://zhuanlan.zhihu.com/p/411194855

BCELoss需要的y_true和y_pred的shape必须相同，在单标签2分类情况下，要么都是(batch,1)，要么都是(batch)。

解法：y = y.reshape(-1)，或者y = y.reshape(-1,1) 可以放在model内部，也可以放在训练主循环上。
BCELoss需要的y_true和y_pred的数据类型都必须是浮点数，而且要相同，要么都是float32，要么都是float64。但是我们的y_true很有可能通过read_csv读进来时是int类型。

解法：我的习惯是在dataloader就用astype(float32)将数据类型和数据的维度设置好，你也可以放在训练主循环上。

In [114]:
# Cross-entropy computed by hand against a one-hot target.
# One-hot encoding of the three classes:
#   0 -> 1 0 0
#   1 -> 0 1 0
#   2 -> 0 0 1
Y = np.array([1, 0, 0])

# Two candidate probability vectors for the same target (class 0).
Y_pred1 = np.array([0.7, 0.2, 0.1])
Y_pred2 = np.array([0.1, 0.3, 0.6])
# With a one-hot Y, only the log-probability of the true class contributes to the sum.
print("loss1 = ", (-Y * np.log(Y_pred1)).sum())
print("loss2 = ", (-Y * np.log(Y_pred2)).sum())

# PyTorch counterpart: softmax + cross-entropy (LogSoftmax + NLLLoss) in one module.
loss = torch.nn.CrossEntropyLoss()

loss1 =  0.35667494393873245
loss2 =  2.3025850929940455


In [43]:
# Convert the NumPy target and prediction arrays into float32 tensors.
# The original leading expression `-Y * np.log(Y_pred1)` was discarded without
# being displayed or stored, so it has been dropped.
y1 = torch.tensor(Y, dtype=torch.float32)
y_pred = torch.tensor(Y_pred1, dtype=torch.float32)
# Legacy type string of the converted prediction tensor.
y_pred.type()

'torch.FloatTensor'

In [45]:

# Single-sample CrossEntropyLoss example; uses the `loss` module created in an earlier cell.
# The target has one entry per batch element, each a class index with
# 0 <= value < nClasses (here 0-2). It is a class index, not a one-hot vector.
# Plain tensors replace the deprecated Variable wrapper; no gradient is requested.
Y = torch.tensor([0], dtype=torch.long)

# The input has size nBatch x nClasses = 1 x 3.
# Y_pred* are logits (not softmax outputs).
Y_pred1 = torch.tensor([[2.0, 1.0, 0.1]])
Y_pred2 = torch.tensor([[0.5, 2.0, 0.3]])

l1 = loss(Y_pred1, Y)
l2 = loss(Y_pred2, Y)

# detach() replaces the discouraged .data attribute and prints the same value.
print("PyTorch Loss1 = ", l1.detach(), "\nPyTorch Loss2=", l2.detach())

# Predicted class = index of the largest logit in each row.
print("Y_pred1=", Y_pred1.argmax(dim=1))
print("Y_pred2=", Y_pred2.argmax(dim=1))

PyTorch Loss1 =  tensor(0.4170) 
PyTorch Loss2= tensor(1.8406)
Y_pred1= tensor([0])
Y_pred2= tensor([1])


In [47]:
# Batched CrossEntropyLoss example; uses the `loss` module created in an earlier cell.
# The target has one entry per batch element, each a class index with
# 0 <= value < nClasses (here 0-2). It is a class index, not a one-hot vector.
Y = torch.tensor([2, 0, 1], dtype=torch.long)

# The input has size nBatch x nClasses = 3 x 3.
# Y_pred* are logits (not softmax outputs).
Y_pred1 = torch.tensor([[0.1, 0.2, 0.9], [1.1, 0.1, 0.2], [0.2, 2.1, 0.1]])

Y_pred2 = torch.tensor([[0.8, 0.2, 0.3], [0.2, 0.3, 0.5], [0.2, 0.2, 0.5]])

l1 = loss(Y_pred1, Y)
l2 = loss(Y_pred2, Y)

# detach() replaces the discouraged .data attribute and prints the same value.
print("Batch Loss1 = ", l1.detach(), "\nBatch Loss2=", l2.detach())

# Small integer array converted to a tensor; `b` is inspected in the next cell.
a = np.array([[1, 2, 3], [2, 4, 5]])
b = torch.from_numpy(a)

Batch Loss1 =  tensor(0.4966) 
Batch Loss2= tensor(1.2389)


In [49]:
# Row-wise maximum of `b`: prints both the max values and their column indices.
print(torch.max(b, dim=1))

torch.return_types.max(
values=tensor([3, 5]),
indices=tensor([2, 2]))


In [50]:
# Fully connected layer mapping 32 input features to 2 outputs (randomly initialised).
linear = torch.nn.Linear(in_features=32, out_features=2)

In [51]:
# Batch of 3 random samples with 32 features each.
inputs = torch.rand((3, 32))

In [52]:
# Forward pass of `inputs` through the `linear` layer defined above.
outputs = linear(inputs)

In [53]:
outputs

tensor([[ 0.1629, -0.4582],
        [ 0.0217, -0.8183],
        [-0.0483, -0.6999]], grad_fn=<AddmmBackward0>)

In [54]:
# Element-wise sigmoid of the linear layer's outputs.
activation = torch.sigmoid(outputs)

In [55]:
activation

tensor([[0.5406, 0.3874],
        [0.5054, 0.3061],
        [0.4879, 0.3318]], grad_fn=<SigmoidBackward0>)

In [11]:
# Softmax across axis 1 so each row of `outputs` sums to 1.
# dim is given explicitly: omitting it relies on a deprecated implicit choice
# and raises a UserWarning.
F.softmax(outputs, dim=1)

  """Entry point for launching an IPython kernel.


tensor([[0.5013, 0.4987],
        [0.5417, 0.4583],
        [0.5213, 0.4787]], grad_fn=<SoftmaxBackward0>)

In [12]:
# ReLU clamps every negative entry of `outputs` to zero.
torch.relu(outputs)

tensor([[0., 0.],
        [0., 0.],
        [0., 0.]], grad_fn=<ReluBackward0>)

In [13]:
# Float literals, so no explicit dtype is needed here.
a1 = torch.tensor([1.0, 2.0, 3.0])

In [14]:
# Softmax over the single axis of the 1-D tensor.
# dim is given explicitly: omitting it relies on a deprecated implicit choice
# and raises a UserWarning.
F.softmax(a1, dim=0)

  """Entry point for launching an IPython kernel.


tensor([0.0900, 0.2447, 0.6652])

## argmax函数（soft-argmax：用softmax概率对下标加权求期望）

In [57]:
# Example scores; the largest value sits at index 3.
x = np.array([-2.0, 3.0, -9.4, 5.0, 0.0])
x

array([-2. ,  3. , -9.4,  5. ,  0. ])

In [59]:
# Soft-argmax: weight each index 0..4 by its softmax probability and sum.
ex = (np.exp(x) / np.exp(x).sum() * np.arange(5)).sum()
ex

2.766691018220332

## softmax

In [61]:
# Scores with one dominant entry (9.1) to feed into a hand-written softmax.
x = np.array([0.5, 2.0, 1.4, 3.0, 9.1, -1.0])

In [63]:
# Hand-written softmax: exponentials divided by their total.
y = np.exp(x) / np.exp(x).sum()

In [64]:
# The softmax entries should add up to 1, up to floating-point rounding.
y.sum()

0.9999999999999999

In [14]:
# Random normal tensor with three axes.
x = torch.randn(10, 3, 100)
# The recorded cell output is the tensor's size, so the shape is displayed explicitly.
x.shape

torch.Size([10, 3, 100])

In [42]:
# Uniform random tensor of size 2 x 3 x 4.
x = torch.rand(2, 3, 4)

In [43]:
x

tensor([[[0.1092, 0.6180, 0.7480, 0.8949],
         [0.9619, 0.0366, 0.4140, 0.2731],
         [0.3046, 0.3404, 0.3474, 0.3618]],

        [[0.6344, 0.1076, 0.8911, 0.0222],
         [0.7427, 0.7286, 0.8148, 0.7073],
         [0.8111, 0.8410, 0.5518, 0.0945]]])

In [59]:
# Softmax taken along axis 1, so values sharing the other indices sum to 1 across that axis.
prob = torch.nn.Softmax(dim=1)
prob(x)

tensor([[[0.2192, 0.4316, 0.4191, 0.4709],
         [0.5143, 0.2413, 0.3001, 0.2528],
         [0.2665, 0.3270, 0.2808, 0.2763]],

        [[0.3023, 0.2023, 0.3790, 0.2464],
         [0.3369, 0.3765, 0.3511, 0.4888],
         [0.3608, 0.4212, 0.2699, 0.2648]]])

In [60]:
# Two identical rows of scores, stored as a float tensor.
y = torch.tensor([[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]])

In [61]:
prob(y)

tensor([[0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652]])