In [10]:
import os

# Fetch helper.py (the module `look` is imported from) next to the notebook
# on first run; later runs reuse the local copy.
download_name = "helper.py"
if not os.path.exists(download_name):
    import requests  # imported lazily: only needed when the helper is missing

    url = f"https://raw.githubusercontent.com/bzitko/nlp_repo/main/demos/{download_name}"
    # The context manager releases the connection even if the request fails,
    # and the timeout keeps the cell from hanging forever on a dead network.
    with requests.get(url, timeout=30) as response:
        # Fail loudly on 404/5xx instead of saving an error page as helper.py:
        # a bad file would never be re-downloaded because of the exists() guard.
        response.raise_for_status()
        payload = response.content
    # Only touch the filesystem once the download is known to be good.
    with open(download_name, "wb") as fp:
        fp.write(payload)

In [9]:
import torch
import torch.nn.functional as F
from helper import look

# Binary cross-entropy Loss

$L(\hat{y}, y) = -\frac{1}{n}\sum_{i=1}^{n}\left(y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i)\right)$

# Binary case

In [2]:
# Fixed seed so the sampled logits (and every number shown below) are reproducible.
torch.manual_seed(0)

# Binary setting ##############################################################
# Labels containing LaTeX commands (\hat, \sigma, \cdot) are raw strings so the
# backslashes stay literal without relying on Python's deprecated handling of
# invalid escape sequences.
look("# Binary case")
z = torch.randn(5)  # raw scores (logits), one per sample
look("$z=$", z)
yhat = torch.sigmoid(z)  # predicted probability of the positive class
look(r"$\hat{y} = \sigma(z)=$", yhat)
# Float targets: BCELoss / BCEWithLogitsLoss are called with y as float here.
y = torch.tensor([0.0, 1.0, 1.0, 0.0, 1.0])
look("$y=$", y)

# First compute the negative log-likelihoods using the derived formula
l = -(y * yhat.log() + (1 - y) * (1 - yhat).log())
look(r"**Negative log-likelihood** $-(y log(\hat{y}) + (1 - y) log(1 - \hat{y}))=$", l)
# Spell out the same computation element by element; each row ends in "=-" followed
# by the loss, i.e. the bracketed sum equals the negated per-sample loss.
for yhat_i, y_i, l_i in zip(yhat, y, l):
    look("&nbsp;&nbsp;&nbsp;&nbsp;", y_i.long(), r"$ \cdot log($", yhat_i, "$) + (1 - $", y_i.long(), r"$) \cdot log(1-$", yhat_i, "$)=-$", l_i)

# Observe that BCELoss and BCEWithLogitsLoss can produce the same results:
# BCELoss is fed the probabilities, BCEWithLogitsLoss is fed the raw logits z.
# reduction="none" keeps the per-sample losses so they can be compared with `l`.
l_BCELoss_nored = torch.nn.BCELoss(reduction="none")(yhat, y)
l_BCEWithLogitsLoss_nored = torch.nn.BCEWithLogitsLoss(reduction="none")(z, y)
look(r"**BCE** $L(\hat{y}, y)=$", l_BCELoss_nored)
look(r"**BCELogit** $L(\hat{y}, z)=$", l_BCEWithLogitsLoss_nored)




# Binary case

$z=$ $\begin{bmatrix} 1.54 & -0.293 & -2.18 & 0.568 & -1.08\end{bmatrix}$

$\hat{y} = \sigma(z)=$ $\begin{bmatrix} 0.824 & 0.427 & 0.102 & 0.638 & 0.253\end{bmatrix}$

$y=$ $\begin{bmatrix} 0.0 & 1.0 & 1.0 & 0.0 & 1.0\end{bmatrix}$

**Negative log-likelihood** $-(y log(\hat{y}) + (1 - y) log(1 - \hat{y}))=$ $\begin{bmatrix} 1.74 & 0.851 & 2.29 & 1.02 & 1.38\end{bmatrix}$

&nbsp;&nbsp;&nbsp;&nbsp; 0 $ \cdot log($ 0.824 $) + (1 - $ 0 $) \cdot log(1-$ 0.824 $)=-$ 1.74

&nbsp;&nbsp;&nbsp;&nbsp; 1 $ \cdot log($ 0.427 $) + (1 - $ 1 $) \cdot log(1-$ 0.427 $)=-$ 0.851

&nbsp;&nbsp;&nbsp;&nbsp; 1 $ \cdot log($ 0.102 $) + (1 - $ 1 $) \cdot log(1-$ 0.102 $)=-$ 2.29

&nbsp;&nbsp;&nbsp;&nbsp; 0 $ \cdot log($ 0.638 $) + (1 - $ 0 $) \cdot log(1-$ 0.638 $)=-$ 1.02

&nbsp;&nbsp;&nbsp;&nbsp; 1 $ \cdot log($ 0.253 $) + (1 - $ 1 $) \cdot log(1-$ 0.253 $)=-$ 1.38

**BCE** $L(\hat{y}, y)=$ $\begin{bmatrix} 1.74 & 0.851 & 2.29 & 1.02 & 1.38\end{bmatrix}$

**BCELogit** $L(\hat{y}, z)=$ $\begin{bmatrix} 1.74 & 0.851 & 2.29 & 1.02 & 1.38\end{bmatrix}$

# Multiclass case

In [6]:
# Multiclass setting ##########################################################
# Labels containing \hat are raw strings so the backslash stays literal without
# relying on Python's deprecated handling of invalid escape sequences.
# No seed is set here: the logits continue the RNG stream seeded in the binary cell.
look("# Multiclass case")
z = torch.randn(5, 3)  # raw scores (logits): 5 samples x 3 classes
look("$z=$", z)
yhat = torch.softmax(z, dim=-1)  # per-row class probabilities
look(r"$\hat{y} = softmax(z)=$", yhat)
y = torch.tensor([0, 2, 1, 1, 0])  # target class index per sample (int64 by default)
look("$y=$", y)

# Row indices used to pick, for every sample, the entry of its target class.
# Derived from y so the cell keeps working if the batch size changes.
sample_idx = torch.arange(y.shape[0])

# First compute the negative log-likelihoods using the derived formula
neg_log_yhat = -yhat.log()
look(r"$-log(\hat{y})=$", neg_log_yhat)
look(r"$-log(\hat{y})$", "masked with", "$y=$", neg_log_yhat[sample_idx, y])

# Same quantity computed via log_softmax directly on the logits.
neg_log_softmax = -torch.log_softmax(z, dim=-1)
look("$-log(softmax(z))=$", neg_log_softmax)
look("$-log(softmax(z))$", "masked with", "$y=$", neg_log_softmax[sample_idx, y])

# Observe that NLLLoss and CrossEntropyLoss can produce the same results:
# NLLLoss is fed log-probabilities, CrossEntropyLoss is fed the raw logits z.
# reduction="none" keeps the per-sample losses so they can be compared with the above.
l2_NLLLoss_nored = torch.nn.NLLLoss(reduction="none")(yhat.log(), y)
look(r"$NLLLoss(log(\hat{y}), y)$", l2_NLLLoss_nored)
l2_CrossEntropyLoss_nored = torch.nn.CrossEntropyLoss(reduction="none")(z, y)
look("$CELoss(z, y)$", l2_CrossEntropyLoss_nored)



# Multiclass case

$z=$ $\begin{bmatrix} 0.0627 & -0.766 & 1.1 \\ 2.76 & 0.175 & -0.932 \\ -1.51 & -0.661 & 1.32 \\ 0.0371 & -0.285 & -0.133 \\ 1.89 & 3.11 & -0.458\end{bmatrix}$

$\hat{y} = softmax(z)=$ $\begin{bmatrix} 0.235 & 0.103 & 0.662 \\ 0.909 & 0.0688 & 0.0227 \\ 0.0494 & 0.115 & 0.836 \\ 0.389 & 0.282 & 0.328 \\ 0.223 & 0.755 & 0.0213\end{bmatrix}$

$y=$ $\begin{bmatrix} 0 & 2 & 1 & 1 & 0\end{bmatrix}$

$-log(\hat{y})=$ $\begin{bmatrix} 1.45 & 2.28 & 0.412 \\ 0.0959 & 2.68 & 3.78 \\ 3.01 & 2.16 & 0.179 \\ 0.943 & 1.27 & 1.11 \\ 1.5 & 0.281 & 3.85\end{bmatrix}$

$-log(\hat{y})$ masked with $y=$ $\begin{bmatrix} 1.45 & 3.78 & 2.16 & 1.27 & 1.5\end{bmatrix}$

$-log(softmax(z))=$ $\begin{bmatrix} 1.45 & 2.28 & 0.412 \\ 0.0959 & 2.68 & 3.78 \\ 3.01 & 2.16 & 0.179 \\ 0.943 & 1.27 & 1.11 \\ 1.5 & 0.281 & 3.85\end{bmatrix}$

$-log(softmax(z))$ masked with $y=$ $\begin{bmatrix} 1.45 & 3.78 & 2.16 & 1.27 & 1.5\end{bmatrix}$

$NLLLoss(log(\hat{y}), y)$ $\begin{bmatrix} 1.45 & 3.78 & 2.16 & 1.27 & 1.5\end{bmatrix}$

$CELoss(z, y)$ $\begin{bmatrix} 1.45 & 3.78 & 2.16 & 1.27 & 1.5\end{bmatrix}$