# Softmax and Cross-Entropy

In [1]:
import numpy as np
import torch
import torch.nn as nn

NumPy implementation

In [3]:
def softmax(x):
    """Compute the softmax of ``x`` along axis 0 in a numerically stable way.

    Args:
        x: Array-like of raw scores. A 1-D input is normalised as a single
            vector; for an N-D input every slice along axis 0 is normalised
            independently.

    Returns:
        np.ndarray with the same shape as ``x`` whose entries along axis 0
        are non-negative and sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; return an empty array of the matching shape.
        return np.exp(x)
    # softmax(x) == softmax(x - c) for any constant c. Subtracting the max
    # keeps every exponent <= 0, so np.exp cannot overflow to inf (which
    # would otherwise yield inf / inf = nan for large scores).
    shifted = x - np.max(x, axis=0, keepdims=True)
    exps = np.exp(shifted)  # computed once and reused for the denominator
    return exps / np.sum(exps, axis=0, keepdims=True)

In [4]:
# Raw scores for three classes, turned into probabilities.
x = np.array([2.0, 1.0, 0.1])
outputs = softmax(x)
print('softmax numpy:', outputs)

softmax numpy: [0.65900114 0.24243297 0.09856589]


PyTorch implementation

In [5]:
# Same scores as a torch tensor; dim=0 normalises over its only axis.
x = torch.tensor([2.0, 1.0, 0.1])
outputs = x.softmax(dim=0)
print(outputs)

tensor([0.6590, 0.2424, 0.0986])


Cross Entropy Loss

In [6]:
def cross_entropy(actual, predicted):
    """Cross-entropy between a target distribution and predicted probabilities.

    Computes ``-sum(actual * log(predicted))``. The result is the summed
    loss; it is not divided by the number of entries.

    Args:
        actual: Array-like of target weights (e.g. a one-hot vector).
        predicted: Array-like of predicted probabilities, broadcastable
            against ``actual``.

    Returns:
        The loss as a NumPy float. It is ``inf`` when a class with non-zero
        target weight is assigned probability 0.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    actual, predicted = np.broadcast_arrays(actual, predicted)
    # Terms with actual == 0 contribute nothing (0 * log(p) is taken as 0).
    # Skipping them avoids 0 * log(0) = 0 * -inf = nan, which would
    # otherwise turn the whole sum into nan whenever an irrelevant class is
    # predicted with probability exactly 0.
    relevant = actual != 0
    return -np.sum(actual[relevant] * np.log(predicted[relevant]))

In [7]:
# The target for the NumPy loss must be one-hot encoded:
#   class 0 -> [1, 0, 0]
#   class 1 -> [0, 1, 0]
#   class 2 -> [0, 0, 1]
Y = np.array([1, 0, 0])

In [9]:
# Predictions must be probabilities (one value per class, summing to 1).
Y_pred_good = np.array([0.7, 0.2, 0.1])
Y_pred_bad = np.array([0.1, 0.3, 0.6])

l1 = cross_entropy(Y, Y_pred_good)
l2 = cross_entropy(Y, Y_pred_bad)

In [10]:
# One loss per line, rounded to four decimals.
print(f'Loss1 numpy:{l1:.4f}', f'Loss2 numpy:{l2:.4f}', sep='\n')

Loss1 numpy:0.3567
Loss2 numpy:2.3026


PyTorch implementation

In [20]:
# nn.CrossEntropyLoss applies nn.LogSoftmax followed by nn.NLLLoss
# (negative log-likelihood loss), so we do not implement softmax ourselves:
#   - no softmax in the last layer;
#   - Y holds class labels, not one-hot vectors;
#   - Y_pred holds raw scores (logits).
loss = nn.CrossEntropyLoss()
Y = torch.tensor([0])

# Predictions are shaped n_samples x n_classes = 1 x 3. view(1, -1) reshapes
# the 1-D tensor into a 2-D tensor with a batch size of 1 and lets PyTorch
# infer the number of columns (n_classes).
Y_pred_good = torch.tensor([2.0, 1.0, 0.1]).view(1, -1)
Y_pred_bad = torch.tensor([0.5, 2.0, 0.3]).view(1, -1)


In [21]:
# Loss of the good and the bad prediction against the same target.
l1 = loss(Y_pred_good, Y)
l2 = loss(Y_pred_bad, Y)

In [22]:
# .item() extracts the Python float from each scalar loss tensor.
print(l1.item(), l2.item(), sep='\n')

0.4170299470424652
1.840616226196289


In [25]:
# The predicted class is the index of the largest logit in each row (dim=1).
# torch.argmax returns those indices directly, so there is no need to call
# torch.max and discard the maximum values through a throwaway `_` variable.
predictions1 = torch.argmax(Y_pred_good, dim=1)
predictions2 = torch.argmax(Y_pred_bad, dim=1)
print(predictions1)
print(predictions2)

tensor([0])
tensor([1])


In [26]:
# Three samples, one class label per sample. The matching predictions are
# shaped n_samples x n_classes = 3 x 3.
Y = torch.tensor([2, 0, 1])

In [28]:
# One row of logits per sample. The nested lists already give a 3 x 3
# tensor, so no reshaping is needed.
Y_pred_good = torch.tensor([
    [0.1, 1.0, 2.1],
    [2.0, 1.0, 0.1],
    [1.0, 2.0, 0.1],
])
Y_pred_bad = torch.tensor([
    [0.5, 2.0, 0.3],
    [0.5, 2.0, 0.3],
    [0.5, 2.0, 0.3],
])


In [29]:
# Loss of the good and the bad batch of predictions against the same labels.
l1 = loss(Y_pred_good, Y)
l2 = loss(Y_pred_bad, Y)

In [30]:
# .item() extracts the Python float from each scalar loss tensor.
print(l1.item(), l2.item(), sep='\n')

0.40603378415107727
1.407282829284668


In [31]:
# The predicted class is the index of the largest logit in each row (dim=1).
# torch.argmax returns those indices directly, so there is no need to call
# torch.max and discard the maximum values through a throwaway `_` variable.
predictions1 = torch.argmax(Y_pred_good, dim=1)
predictions2 = torch.argmax(Y_pred_bad, dim=1)
print(predictions1)
print(predictions2)

tensor([2, 0, 1])
tensor([1, 1, 1])


Multiclass classification

In [41]:
class NeuralNet2(nn.Module):
    """Multiclass classifier: Linear -> ReLU -> Linear, returning raw logits.

    ``forward`` applies no softmax, so the output should be paired with a
    loss that applies it internally (e.g. ``nn.CrossEntropyLoss``).

    Args:
        input_size: Number of input features.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output scores (one per class).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the unnormalised class scores for ``x``."""
        hidden = self.relu(self.linear1(x))
        # No softmax at the end: the logits are returned as they are.
        return self.linear2(hidden)

In [42]:
# Three-class model; the loss applies softmax itself, so the model does not.
model = NeuralNet2(input_size=28 * 28, hidden_size=5, num_classes=3)
criterion = nn.CrossEntropyLoss()  # applies Softmax

Binary Classification

In [43]:
class NeuralNet1(nn.Module):
    """Binary classifier: Linear -> ReLU -> Linear(1) -> sigmoid.

    ``forward`` returns one sigmoid-activated value per sample, i.e. a
    probability rather than a raw logit.

    Args:
        input_size: Number of input features.
        hidden_size: Width of the hidden layer.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the sigmoid of the single output score for ``x``."""
        hidden = self.relu(self.linear1(x))
        logit = self.linear2(hidden)
        # Sigmoid at the end squashes the score into (0, 1).
        return torch.sigmoid(logit)

In [44]:
# Binary model: its sigmoid output is paired with binary cross-entropy.
model = NeuralNet1(input_size=28 * 28, hidden_size=5)
criterion = nn.BCELoss()