## The only imports we need

In [22]:
import torch
import torch.nn as nn
import torch.autograd as autograd
from torch.autograd import Variable

## Declare Inputs and targets

Every loss function needs inputs and targets

In [23]:
input_regression = torch.Tensor([1, 2, 3, 4, 5])
target_regression = torch.Tensor([1, 2, 3, 4, 6])

input_classification = torch.Tensor([[1, 2, 3], [1, 2, 3], [1, 2, 3]]).transpose(1, 0)
target_classification = torch.LongTensor([1, 2, 3, 4, 5]) # torch.LongTensor(3).random_(5)

print('input_regression:', input_regression)
print('input_classification:', input_classification)
print('target_regression:', target_regression)
print('target_classification:', target_classification)
# todo print these in numpy or clearer
# todo show math in markdown
# work through each methodically and clearly

# todo: mention "We won't be talking about models here", just purely inputs to loss functions to get the right intuitions
# todo show some loss curves and graphs and landscapes
# todo: stress the importance of this: Oriol says: "Architectures, Losses and inputs/outputs"
# e.g. due to the importance of the losses, I will focus a few videos on this. Anyone who studies and watches these will be very fluent in these.
# I'm also doing this for my own understanding
# todo add more english from official docstrings
# AKA utitlity function, criterion, objective, 

input_regression: 
 1
 2
 3
 4
 5
[torch.FloatTensor of size 5]

input_classification: 
 1  1  1
 2  2  2
 3  3  3
[torch.FloatTensor of size 3x3]

target_regression: 
 1
 2
 3
 4
 6
[torch.FloatTensor of size 5]

target_classification: 
 1
 2
 3
 4
 5
[torch.LongTensor of size 5]



## L1Loss AKA absolute loss AKA Laplace

$$ L = \sum_{i=0}^n \left| y_i - h(x_i) \right|$$

$$\frac{(1 - 1) + (2 - 2) + (3 - 3) + (4 - 4) + (6 - 5)}{5} =\frac{1}{5} = 0.2 $$


In [24]:
# L1Loss AKA absolute loss
#loss = nn.L1Loss(reduce=False) # Shows loss for each single value
#loss = nn.L1Loss()
loss = nn.L1Loss(size_average=False) # Doesn't average by the number of elements
input = autograd.Variable(input_regression, requires_grad=True) # todo requires necessary?
target = autograd.Variable(target_regression) # todo remove "autograd."
print(input, target)
output = loss(input, target)
#output.backward()
print('L1Loss: {}'.format(output))

# todo explain and show output gradients. 
# We want the user to get a real intuition for how the final layer is wrong.
# todo for first loss (this one) show the difference between reduce and size_average and compare with numerator of fraction
# todo add pros and cons for L1 vs L2 Losses
# Todo quantile regression loss and Squared loss (without importance weight aware updates)
# ridge and lasso regression. mention regularisation

Variable containing:
 1
 2
 3
 4
 5
[torch.FloatTensor of size 5]
 Variable containing:
 1
 2
 3
 4
 6
[torch.FloatTensor of size 5]

L1Loss: Variable containing:
 1
[torch.FloatTensor of size 1]



## MSE Loss AKA Euclidean Distance AKA AKA

$$ L = \sum_0^n (y_i - h(x_i))^2 $$

$$\frac{(1 - 1)^2 + (2 - 2)^2 + (3 - 3)^2 + (4 - 4)^2 + (6 - 5)^2}{5} =\frac{1}{5} = 0.2 $$

L2 Loss is top of fraction

In [25]:
# MSE Loss AKA AKA AKA
loss = nn.MSELoss()
input = autograd.Variable(input_regression, requires_grad=True)
target = autograd.Variable(target_regression)
output = loss(input, target)
output.backward()
print('MSELoss: {}'.format(output))

MSELoss: Variable containing:
 0.2000
[torch.FloatTensor of size 1]



In [26]:
# SmoothL1Loss AKA Huber loss # Creates a criterion that uses a squared term if the absolute element-wise error falls below 1 and an L1 term otherwise.

# Classification losses

## CrossEntropyLoss

https://rdipietro.github.io/friendly-intro-to-cross-entropy-loss/

In [60]:
# CrossEntropyLoss
loss = nn.CrossEntropyLoss()
#loss = nn.BCELoss()
#input = autograd.Variable(input_classification, requires_grad=True)
#target = autograd.Variable(target_classification)

import numpy as np

num_rows = 2
num_classes = 5
input = autograd.Variable(torch.randn(num_rows, num_classes).clamp(0.0001, 100), requires_grad=True)
target = autograd.Variable(torch.LongTensor(num_rows).random_(num_classes))
print(input, target)
output = loss(input, target)
output.backward()
#print(target.data.numpy().flatten()[0])
#print(-np.log(input[target.data.numpy().flatten()[0]]))
print('CrossEntropyLoss: {}'.format(output))

Variable containing:
 1.4432e-01  1.8557e+00  5.6689e-01  1.9026e-01  1.0000e-04
 1.0000e-04  2.1835e-01  1.0000e-04  1.0000e-04  7.3342e-02
[torch.FloatTensor of size 2x5]
 Variable containing:
 1
 2
[torch.LongTensor of size 2]

CrossEntropyLoss: Variable containing:
 1.1301
[torch.FloatTensor of size 1]



In [28]:
# NLLLoss
m = nn.LogSoftmax()
loss = nn.NLLLoss()
# input is of size N x C = 3 x 5
input = autograd.Variable(torch.randn(3, 5), requires_grad=True)
# each element in target has to have 0 <= value < C
target = autograd.Variable(torch.LongTensor([1, 0, 4]))
output = loss(m(input), target)
output.backward()
print('NLLLoss: {}'.format(output))

NLLLoss: Variable containing:
 1.7336
[torch.FloatTensor of size 1]





In [29]:
# PoissonNLLLoss # Negative log likelihood loss with Poisson distribution of target.
loss = nn.PoissonNLLLoss()
log_input = autograd.Variable(torch.randn(5, 2), requires_grad=True)
target = autograd.Variable(torch.randn(5, 2))
output = loss(log_input, target)
output.backward()
print('PoissonNLLLoss: {}'.format(output))

PoissonNLLLoss: Variable containing:
 2.4805
[torch.FloatTensor of size 1]



In [30]:
# NLLLoss2d # negative log likehood loss, but for image inputs. It computes NLL loss per-pixel.
m = nn.Conv2d(16, 32, (3, 3)).float()
loss = nn.NLLLoss2d()
# input is of size N x C x height x width
input = autograd.Variable(torch.randn(3, 16, 10, 10))
# each element in target has to have 0 <= value < C
target = autograd.Variable(torch.LongTensor(3, 8, 8).random_(0, 4))
output = loss(m(input), target)
output.backward()
print('NLLLoss2d: {}'.format(output))

NLLLoss2d: Variable containing:
1.00000e-03 *
 -8.1760
[torch.FloatTensor of size 1]



In [31]:
# KLDivLoss # The Kullback-Leibler divergence Loss

In [32]:
# BCELoss # Binary Cross Entropy
m = nn.Sigmoid()
loss = nn.BCELoss()
input = autograd.Variable(torch.randn(3), requires_grad=True)
target = autograd.Variable(torch.FloatTensor(3).random_(2))
output = loss(m(input), target)
output.backward()
print('BCELoss: {}'.format(output))

BCELoss: Variable containing:
 0.5804
[torch.FloatTensor of size 1]



In [33]:
# BCEWithLogitsLoss # This loss combines a Sigmoid layer and the BCELoss in one single class

In [34]:
# MarginRankingLoss # Creates a criterion that measures the loss given inputs x1, x2, two 1D mini-batch Tensor`s, and a label 1D mini-batch tensor `y with values (1 or -1).


In [35]:
# HingeEmbeddingLoss
'''                 { x_i,                  if y_i ==  1
loss(x, y) = 1/n {
                    { max(0, margin - x_i), if y_i == -1'''

'                 { x_i,                  if y_i ==  1\nloss(x, y) = 1/n {\n                    { max(0, margin - x_i), if y_i == -1'

In [36]:
#MultiLabelMarginLoss # multi-class multi-classification hinge loss (margin-based loss) 
# loss(x, y) = sum_ij(max(0, 1 - (x[y[j]] - x[i]))) / x.size(0)

In [37]:
# SoftMarginLoss # two-class classification logistic loss

In [38]:
# MultiLabelSoftMarginLoss # multi-label one-versus-all loss based on max-entropy

In [39]:
# CosineEmbeddingLoss

In [40]:
# MultiMarginLoss

In [41]:
# TripletMarginLoss
'''triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
input1 = autograd.Variable(torch.randn(100, 128))
input2 = autograd.Variable(torch.randn(100, 128))
input3 = autograd.Variable(torch.randn(100, 128))
output = triplet_loss(input1, input2, input3)
output.backward()'''

'triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)\ninput1 = autograd.Variable(torch.randn(100, 128))\ninput2 = autograd.Variable(torch.randn(100, 128))\ninput3 = autograd.Variable(torch.randn(100, 128))\noutput = triplet_loss(input1, input2, input3)\noutput.backward()'

# Loss functions in every single relevant framework

You should now understand 90% of the loss functions in the below frameworks. 

#### PyTorch Losses
http://pytorch.org/docs/master/nn.html#loss-functions

#### Torch Losses
https://github.com/torch/nn/blob/master/doc/criterion.md

#### Keras Losses
https://keras.io/losses/

#### TensorFlow Losses
https://www.tensorflow.org/api_docs/python/tf/losses

#### Gluon/MXNet Losses
https://mxnet.incubator.apache.org/api/python/gluon/loss.html

#### Chainer Losses
http://docs.chainer.org/en/stable/reference/functions.html#loss-functions

#### Caffe2 Losses
Couldn't find a good and simple list for Caffe or Caffe2

#### CNTK Losses
https://docs.microsoft.com/en-us/cognitive-toolkit/Loss-Functions-and-Metrics

#### DeepLearning4j Losses
https://deeplearning4j.org/features#lossobjective-functions

#### Lasagne Losses
http://lasagne.readthedocs.io/en/latest/modules/objectives.html

#### PaddlePaddle Losses
http://paddlepaddle.org/docs/develop/api/en/v2/config/layer.html?highlight=loss#cost-layers

### Other Resources:

https://en.wikipedia.org/wiki/Loss_function  
https://en.wikipedia.org/wiki/Loss_functions_for_classification  
http://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html  
https://davidrosenberg.github.io/ml2015/docs/3a.loss-functions.pdf  