In [1]:
import numpy as np
import torch
import pandas as pd
from IPython.display import display

min_value = 0.
max_value = 10.
batch_size = 12

In [2]:
targets = torch.linspace(min_value, max_value, steps=batch_size)

## MSE

In [3]:
predictions = torch.empty_like(targets).uniform_(min_value, max_value)
predictions[2] = targets[2]
predictions[6] = targets[6]

display(pd.DataFrame({
    'pred': predictions,
    'target': targets,
    'squared_error': (targets-predictions).pow(2),
    'squared_error_pt': torch.nn.functional.mse_loss(predictions, targets, reduction='none')
}).round(2).style.hide_index())
print('MSE', torch.nn.functional.mse_loss(predictions, targets, reduction='mean').item())

pred,target,squared_error,squared_error_pt
9.77,0.0,95.4,95.4
7.48,0.91,43.19,43.19
1.82,1.82,0.0,0.0
9.08,2.73,40.36,40.36
3.56,3.64,0.01,0.01
1.81,4.55,7.5,7.5
5.45,5.45,0.0,0.0
1.25,6.36,26.12,26.12
0.61,7.27,44.39,44.39
2.74,8.18,29.57,29.57


MSE 27.82337760925293


## Hard classification

$$\text{Entropy}(p) = - \sum_i p_i \log p_i$$
$$\text{CrossEntropy}(p, q) = - \sum_i p_i \log q_i$$

In [4]:
def entropy(p):
    log_p = torch.log(p).clamp(min=-1e16)
    return - torch.sum(p * log_p, dim=-1)

def cross_entropy(p, q):
    log_q = torch.log(q).clamp(min=-1e16)
    prod = torch.where(p == 0, torch.tensor(0.), p * log_q)
    return - prod.sum(dim=-1)

def KL(p, q):
    log_q_p = torch.log(q).clamp(min=-1e16) - torch.log(p).clamp(min=-1e16)
    prod = torch.where(p == 0, torch.tensor(0.), p * log_q_p)
    return - prod.sum(dim=-1)

In [5]:
num_bins = 5
bins = np.linspace(min_value, max_value, num=num_bins, endpoint=False)
target_idx = torch.from_numpy(np.digitize(targets, bins=bins) - 1)

predictions = torch.rand(len(targets), num_bins).softmax(dim=1)
predictions[2] = 0
predictions[2, target_idx[2]] = 1
predictions[6] = 0
predictions[6, target_idx[6]] = 1
pred_for_target = torch.gather(predictions, dim=1, index=target_idx.view(-1, 1)).squeeze()

display(pd.DataFrame({
    **{('pred', i): col for i, col in enumerate(predictions.unbind(dim=1))},
    ('pred', 'entropy'): entropy(predictions),
    ('target', ''): targets,
    ('target_idx', ''): target_idx,
    ('pred[target_idx]', ''): pred_for_target,
    ('cross_entropy', 'manual'): - pred_for_target.log(),
    ('cross_entropy', 'pytorch'): torch.nn.functional.nll_loss(predictions.log(), target_idx, reduction='none'),
}).round(3).style.hide_index())

print('Cross Entropy', torch.nn.functional.nll_loss(predictions.log(), target_idx, reduction='mean').item())

pred,pred,pred,pred,pred,pred,target,target_idx,pred[target_idx],cross_entropy,cross_entropy
0,1,2,3,4,entropy,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,manual,pytorch
0.213,0.195,0.253,0.161,0.178,1.597,0.0,0,0.213,1.546,1.546
0.232,0.173,0.164,0.268,0.163,1.587,0.909,0,0.232,1.461,1.461
1.0,0.0,0.0,0.0,0.0,-0.0,1.818,0,1.0,-0.0,-0.0
0.232,0.271,0.138,0.171,0.188,1.583,2.727,1,0.271,1.307,1.307
0.174,0.132,0.216,0.262,0.216,1.585,3.636,1,0.132,2.022,2.022
0.157,0.268,0.132,0.243,0.199,1.577,4.545,2,0.132,2.021,2.021
0.0,0.0,1.0,0.0,0.0,-0.0,5.455,2,1.0,-0.0,-0.0
0.164,0.283,0.153,0.159,0.242,1.576,6.364,3,0.159,1.837,1.837
0.145,0.357,0.143,0.179,0.176,1.54,7.273,3,0.179,1.722,1.722
0.251,0.168,0.241,0.172,0.167,1.592,8.182,4,0.167,1.787,1.787


Cross Entropy 1.3958719968795776


## Soft classification

$$\text{KL}(p, q) = - \sum_i p_i \log\frac{q_i}{p_i} = - \sum_i p_i (\log q_i - \log p_i) = \text{CrossEntropy}(p, q) - \text{Entropy}(p)$$

- If $p$ is one-hot encoded, $\text{Entropy}(p)=0$ so using KL or CrossEntropy is the same
- If $p$ uses soft targets, KL is better because it "removes" from the loss the irreducible part that is $\text{Entropy}(p)$. In this way, if the distributions are identical $\text{KL}(p,q)=0$

In [6]:
num_bins = 5
bins = np.linspace(min_value, max_value, num=num_bins, endpoint=False)
target_idx = torch.from_numpy(np.digitize(targets, bins=bins) - 1)

targets_onehot = torch.zeros(len(targets), num_bins)
targets_onehot.scatter_(src=torch.tensor(1), dim=1, index=target_idx.view(-1, 1));

In [7]:
predictions = torch.rand(len(targets), num_bins).softmax(dim=1)
predictions[2] = targets_onehot[2]
predictions[7] = (1 - .7) / (num_bins - 1)
predictions[7, target_idx[7]] = .7
predictions[8] = (1 - .9) / (num_bins - 1)
predictions[8, target_idx[8]] = .9

display(pd.DataFrame({
    **{('pred', i): col for i, col in enumerate(predictions.unbind(dim=1))},
    ('pred', 'entropy'): entropy(predictions),
    **{('target', i): col for i, col in enumerate(targets_onehot.unbind(dim=1))},
    ('target', 'entropy'): entropy(targets_onehot),
    ('cross_entropy', 'manual'): - (targets_onehot * torch.log(predictions).clamp(min=-1e16)).sum(dim=1),
    ('cross_entropy', 'pytorch'): torch.nn.functional.nll_loss(predictions.log(), target_idx, reduction='none'),
    ('KL', 'manual'): KL(targets_onehot, predictions),
    ('KL', 'pytorch'): torch.nn.functional.kl_div(predictions.log(), targets_onehot, reduction='none').sum(dim=1),
}).style
    .set_precision(3)
    .hide_index()
    .background_gradient(cmap='hot_r', axis=0, subset=[('pred', i) for i in range(num_bins)], low=0, high=1)
    .background_gradient(cmap='hot_r', axis=0, subset=[('target', i) for i in range(num_bins)], low=0, high=1)
    .set_properties(**{'border-right': '10px solid black'}, subset=[('pred', 'entropy')])
)

print('KL', torch.nn.functional.kl_div(predictions.log(), targets_onehot, reduction='batchmean').item())

pred,pred,pred,pred,pred,pred,target,target,target,target,target,target,cross_entropy,cross_entropy,KL,KL
0,1,2,3,4,entropy,0,1,2,3,4,entropy,manual,pytorch,manual,pytorch
0.168,0.288,0.167,0.216,0.161,1.58,1,0,0,0,0,0,1.78,1.78,1.78,1.78
0.199,0.126,0.221,0.29,0.163,1.57,1,0,0,0,0,0,1.61,1.61,1.61,1.61
1.0,0.0,0.0,0.0,0.0,-0.0,1,0,0,0,0,0,-0.0,-0.0,-0.0,0.0
0.247,0.169,0.116,0.242,0.225,1.58,0,1,0,0,0,0,1.78,1.78,1.78,1.78
0.179,0.237,0.125,0.258,0.202,1.58,0,1,0,0,0,0,1.44,1.44,1.44,1.44
0.186,0.278,0.17,0.176,0.19,1.59,0,0,1,0,0,0,1.77,1.77,1.77,1.77
0.246,0.219,0.229,0.154,0.153,1.59,0,0,1,0,0,0,1.47,1.47,1.47,1.47
0.075,0.075,0.075,0.7,0.075,1.03,0,0,0,1,0,0,0.357,0.357,0.357,0.357
0.025,0.025,0.025,0.9,0.025,0.464,0,0,0,1,0,0,0.105,0.105,0.105,0.105
0.234,0.127,0.188,0.139,0.311,1.55,0,0,0,0,1,0,1.17,1.17,1.17,1.17


KL 1.230258822441101


In [8]:
targets_soft = torch.conv1d(
    torch.nn.functional.pad(targets_onehot[:,None,:], (1,1), mode='replicate'), 
    weight=torch.tensor([0.27901, 0.44198, 0.27901])[None, None, :]
).squeeze(1)
targets_soft[10] = 0
targets_soft[10, target_idx[10]] = 1

In [9]:
predictions = torch.rand(len(targets), num_bins).softmax(dim=1)
predictions[2] = (1 - .7) / (num_bins - 1)
predictions[2, target_idx[2]] = .7
predictions[6] = (1 - .9) / (num_bins - 1)
predictions[6, target_idx[6]] = .9
predictions[8] = targets_soft[8]

display(pd.DataFrame({
    **{('pred', i): col for i, col in enumerate(predictions.unbind(dim=1))},
    ('pred', 'entropy'): - (predictions * torch.log(predictions).clamp(min=-1e16)).sum(dim=1),
    **{('target', i): col for i, col in enumerate(targets_soft.unbind(dim=1))},
    ('target', 'entropy'): - (targets_soft * torch.log(targets_soft).clamp(min=-1e16)).sum(dim=1),
    ('cross_entropy', 'manual'): - (targets_soft * torch.log(predictions).clamp(min=-1e16)).sum(dim=1),
    ('KL', 'manual'): - (targets_soft * (torch.log(predictions).clamp(min=-1e16) - torch.log(targets_soft).clamp(min=-1e16))).sum(dim=1),
    ('KL', 'pytorch'): torch.nn.functional.kl_div(predictions.log(), targets_soft, reduction='none').sum(dim=1),
}).style
    .set_precision(3)
    .hide_index()
    .background_gradient(cmap='hot_r', axis=0, subset=[('pred', i) for i in range(num_bins)], low=0, high=1)
    .background_gradient(cmap='hot_r', axis=0, subset=[('target', i) for i in range(num_bins)], low=0, high=1)
)

print('KL', torch.nn.functional.kl_div(predictions.log(), targets_soft, reduction='batchmean').item())

pred,pred,pred,pred,pred,pred,target,target,target,target,target,target,cross_entropy,KL,KL
0,1,2,3,4,entropy,0,1,2,3,4,entropy,manual,manual,pytorch
0.122,0.218,0.234,0.235,0.191,1.59,0.721,0.279,0.0,0.0,0.0,0.592,1.94,1.35,1.35
0.296,0.158,0.198,0.154,0.193,1.58,0.721,0.279,0.0,0.0,0.0,0.592,1.39,0.799,0.799
0.7,0.075,0.075,0.075,0.075,1.03,0.721,0.279,0.0,0.0,0.0,0.592,0.98,0.388,0.388
0.235,0.186,0.169,0.217,0.194,1.6,0.279,0.442,0.279,0.0,0.0,1.07,1.65,0.572,0.572
0.188,0.258,0.2,0.185,0.169,1.6,0.279,0.442,0.279,0.0,0.0,1.07,1.51,0.441,0.441
0.145,0.218,0.23,0.223,0.184,1.6,0.0,0.279,0.442,0.279,0.0,1.07,1.49,0.42,0.42
0.025,0.025,0.9,0.025,0.025,0.464,0.0,0.279,0.442,0.279,0.0,1.07,2.11,1.03,1.03
0.245,0.152,0.119,0.232,0.252,1.57,0.0,0.0,0.279,0.442,0.279,1.07,1.62,0.551,0.551
0.0,0.0,0.279,0.442,0.279,1.07,0.0,0.0,0.279,0.442,0.279,1.07,1.07,-0.0,0.0
0.113,0.136,0.258,0.277,0.216,1.55,0.0,0.0,0.0,0.279,0.721,0.592,1.46,0.869,0.869


KL 0.725271463394165


In [10]:
import pandas as pd
frequencies_local = pd.read_pickle('../data/training/frequencies_local.pkl')
frequencies_local

(-0.001, 0.05]     42477
(0.05, 0.1]       331950
(0.1, 0.15]       634572
(0.15, 0.2]       805349
(0.2, 0.25]       848747
(0.25, 0.3]       809202
(0.3, 0.35]       769907
(0.35, 0.4]       736137
(0.4, 0.45]       718695
(0.45, 0.5]       717642
(0.5, 0.55]       759861
(0.55, 0.6]       854934
(0.6, 0.65]       937608
(0.65, 0.7]       971129
(0.7, 0.75]       896157
(0.75, 0.8]       699870
(0.8, 0.85]       437687
(0.85, 0.9]       200822
(0.9, 0.95]        45025
(0.95, 1.0]        52314
Name: local_scores, dtype: int64

In [11]:
import numpy as np
bins = np.linspace(0, 1, num=5, endpoint=False)
print(bins)
indexes = np.searchsorted(v=x, a=bins, side='right') - 1
print(indexes)

print(np.array(x))
print(bins[indexes])

[0.  0.2 0.4 0.6 0.8]


NameError: name 'x' is not defined

In [12]:
frequencies_local.loc[torch.tensor([.11, .22])]

(0.1, 0.15]    634572
(0.2, 0.25]    848747
Name: local_scores, dtype: int64

In [12]:
torch.nn.functional.cross_entropy(
input=torch.tensor([
    [1, 5, 1.],
    [1, 2, 5],
    [5, 2, 1]
]),
target=torch.tensor([1, 2, -1]),
reduction='mean',
ignore_index=-1
)

tensor(0.0509)

In [11]:
class MseLoss(torch.nn.Module):
    """
    NaN targets are ignored when computing the mean
    """
    def forward(self, input, target, weight=None):
        losses = torch.nn.functional.mse_loss(input, target, reduction='none')
        if weight is not None:
            losses *= weight
        loss = torch.mean(losses[torch.isfinite(target)])
        return loss

target = torch.tensor([0., 0.15, np.nan, 0.25, 0.9, 1.])
pred = torch.zeros_like(target)

MseLoss()(pred, target)

tensor(0.3790)

In [17]:
class OneHotCrossEntropyLoss(torch.nn.Module):
    """
    Logits are unnormalized probabilities.
    Targets should be real valued, they will be bucketized on the fly.
    NaN targets will end up in the last bin, but then ignored when computing the mean.
    """
    def __init__(self, num_bins):
        super().__init__()
        self.bins = np.linspace(0, 1, num=num_bins, endpoint=False)
        
    def forward(self, logits, target, weight=None):
        target_bucketized = torch.from_numpy(np.searchsorted(v=target, a=self.bins, side='right') - 1)
        # cross_entropy = nll_loss(log_softmax(logits))
        losses = torch.nn.functional.cross_entropy(logits, target_bucketized, reduction='none')
        if weight is not None:
            losses *= weight
        loss = torch.mean(losses[torch.isfinite(target)])
        return loss

num_bins = 5
target = torch.tensor([0., 0.15, np.nan, 0.25, 0.9, 1.])
logits = torch.tensor([
    [0.3439, 0.5635, 0.0241, 0.4725, 0.1705],
    [0.6080, 0.6916, 0.8956, 0.8720, 0.1840],
    [0.1384, 0.1836, 0.4998, 0.7758, 0.6725],
    [0.0588, 0.5167, 0.9170, 0.0974, 0.4822],
    [0.2524, 0.0174, 0.2271, 0.6813, 0.1437],
    [0.5190, 0.6100, 0.2292, 0.7792, 0.5570]
])

OneHotCrossEntropyLoss(num_bins)(logits, target)

tensor(1.6406)

In [19]:
class SmoothedKLDivLoss(torch.nn.Module):
    def __init__(self, num_bins, smooth):
        super().__init__()
        self.num_bins = num_bins
        self.bins = np.linspace(0, 1, num=5, endpoint=False)
        self.smooth = smooth
        
    def forward(self, logits, target, weight=None):
        target_bucketized = torch.from_numpy(np.searchsorted(v=target, a=self.bins, side='right') - 1)
        target_smooth = torch \
            .full_like(logits, fill_value=self.smooth / self.num_bins) \
            .scatter_(dim=1, index=target_bucketized.unsqueeze(1), value=1 - self.smooth * (1 - 1 / self.num_bins))
        print(target)
        print(target_bucketized)
        print(target_smooth)
        losses = torch.nn.functional.kl_div(logits.log_softmax(dim=1), target_smooth, reduction='none').sum(dim=1)
        print(losses)
        if weight is not None:
            losses *= weight
        loss = torch.mean(losses[torch.isfinite(target)])
        return loss

weight = .2
num_bins = 5
target = torch.tensor([0., 0.15, np.nan, 0.25, 0.9, 1.])
logits = torch.tensor([
    [0.9439, 0.1635, 0.0241, 0.1725, 0.1705],
    [0.6080, 0.6916, 0.8956, 0.8720, 0.1840],
    [0.1384, 0.1836, 0.4998, 0.7758, 0.6725],
    [0.0588, 0.5167, 0.9170, 0.0974, 0.4822],
    [0.2524, 0.0174, 0.2271, 0.6813, 0.1437],
    [0.5190, 0.6100, 0.2292, 0.7792, 0.5570]
])

SmoothedKLDivLoss(num_bins, weight)(logits, target)

tensor([0.0000, 0.1500,    nan, 0.2500, 0.9000, 1.0000])
tensor([0, 0, 4, 1, 4, 4])
tensor([[0.8400, 0.0400, 0.0400, 0.0400, 0.0400],
        [0.8400, 0.0400, 0.0400, 0.0400, 0.0400],
        [0.0400, 0.0400, 0.0400, 0.0400, 0.8400],
        [0.0400, 0.8400, 0.0400, 0.0400, 0.0400],
        [0.0400, 0.0400, 0.0400, 0.0400, 0.8400],
        [0.0400, 0.0400, 0.0400, 0.0400, 0.8400]])
tensor([0.4910, 1.0122, 0.8053, 0.9168, 1.0714, 0.9489])


tensor(0.8881)

In [85]:
torch.nn.functional.cross_entropy(
input=torch.tensor([
    [1, 5, 1.],
    [1, 2, 5],
    [5, 2, 1]
]),
target=torch.tensor([1, 2, -1]),
reduction='none',
ignore_index=-1
).mean()

tensor(0.0340)

In [None]:
def mse(pred, target):
    return torch.nn.functional.mse_loss(pred, target, reduction='none')

def cross_entropy_onehot(pred, target)
    bins = np.linspace(0, 1, num=5, endpoint=False)
    target = np.searchsorted(v=target, a=bins, side='right') - 1
    return torch.nn.functional.cross_entropy(pred, target, reduction='none')

pred = torch.tensor([0., .15, .17, .27, .98, 1.])
target = torch.tensor([0., .15, .17, .27, .98, 1.])