This notebook gives a shallow comparaison the Cosine loss and the standard CrossEntropy loss, in terms of convergence speed.   
Both loss function try to minimize the loss of, let's say, only the last layer of a network.

In [5]:
''' Implementation of loss function that is based on the cosine function.
    It could be used instead of CrossEntropy loss function.
'''

from typing import Optional
from numpy import pi, cos
import torch
from torch import nn
from torch.nn.functional import nll_loss, softmax

@torch.jit.script
def cosine_loss(y: torch.Tensor, inds: torch.Tensor, alpha: float, beta: float, gamma: float, reduction: str, pi:float=pi):
    y = softmax(y, dim=1)
    y = - nll_loss(y, inds, reduction='none')
    y = torch.cos(pi/2 * alpha * y + beta) + gamma

    if reduction == 'sum':
        return y.sum()
    elif reduction == 'mean':
        return y.mean()
    return y


class Cosine(nn.Module):

    def __init__(self, alpha: float = .9, beta: float = 1., reduction: Optional[str] = 'mean'):
        ''' Initialisation of the loss function.
            The formula is :
                loss = torch.cos(HALF_PI * alpha * y + beta) + gamma
            where: gamma is a float that makes sure the loss is always positive, with loss(1)==0.
        Args: 
            - alpha, beta (floats): Two positive floats used to control: gradient step near 0 and 1.
                                    They need to be set carefully; always make sure that: 'beta <= pi*(1-alpha/2).'
            - reduction (string, optional): Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'. 
                                            'none': no reduction will be applied, 
                                            'mean': the mean of the output is taken, 
                                            'sum': the output will be summed.
        '''
        super().__init__()

        # assert alpha > 0 and beta > 0 and beta <= pi * \
        #     (1 - alpha /
        #      2), f'Make sure: {alpha=}>0, {beta=}>0, and {beta=}<{pi*(1-alpha/2)=} '

        self.reduction = reduction
        self.alpha = alpha
        self.beta = beta
        # Assert loss(1)==0
        self.gamma = - cos(pi/2 * alpha + beta)

    def forward(self, y: torch.Tensor, inds: torch.Tensor):
        ''' Compute the inverse sigmoid loss function

        Args:
            - y: A Float tensor with a shape (batch, C, d1, d2, ..., dK), C: number of classes.
            - inds: A Long tensor of indices, with a shape (batch, d1, d2, ..., dK), 
                    where each entry is non-negative and smaller than C.

        Returns:
            - loss: depending on the reduction, the resulting tensor could be a scalar tensor, 
                    or a tensor with the same shape as 'inds' if 'reduction' is 'none'.
        '''
        return cosine_loss(y, inds, self.alpha, self.beta, self.gamma, self.reduction)


time: 33.5 ms (started: 2021-06-28 18:55:52 +00:00)


In [6]:
_a = torch.rand(4, 5)
b = torch.tensor([1, 2, 3, 0])
_a

tensor([[0.7448, 0.4770, 0.9744, 0.7301, 0.8500],
        [0.6688, 0.7645, 0.6790, 0.0454, 0.1290],
        [0.4919, 0.2358, 0.9810, 0.9704, 0.2762],
        [0.3241, 0.8553, 0.7690, 0.9935, 0.6545]])

time: 221 ms (started: 2021-06-28 18:55:57 +00:00)


Making sure both loss function start from the same point in space.

In [8]:
loss1 = Cosine()
params1 = nn.Parameter(_a.clone().detach())

time: 3.65 ms (started: 2021-06-28 18:56:05 +00:00)


In [9]:
loss2 = nn.CrossEntropyLoss()
params2 = nn.Parameter(_a.clone().detach())

time: 1.93 ms (started: 2021-06-28 18:56:07 +00:00)


In [10]:
from tqdm.notebook import tqdm

def train(loss, params, epochs):
    opt = torch.optim.Adam([params], )
    with tqdm(total=epochs) as pbar:
        for i in range(epochs):
            opt.zero_grad()
            l = loss(params, b)
            l.backward()
            opt.step()
            if i%1000 == 0:
                print('loss =', l.item())
            pbar.update(1)
            pbar.set_postfix({'loss':l.item()})

time: 5.68 ms (started: 2021-06-28 18:56:08 +00:00)


In [11]:
epochs = 5000

time: 1.18 ms (started: 2021-06-28 18:56:11 +00:00)


In [12]:
train(loss1, params1, epochs)

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))

loss = 1.0307177305221558
loss = 0.3706229329109192
loss = 0.1132519394159317
loss = 0.0504530668258667
loss = 0.026487275958061218

time: 18.7 s (started: 2021-06-28 18:56:13 +00:00)


In [13]:
train(loss2, params2, epochs)

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))

loss = 1.6613107919692993
loss = 0.5666051506996155
loss = 0.21057072281837463
loss = 0.09622810781002045
loss = 0.05013938248157501

time: 17.3 s (started: 2021-06-28 18:56:43 +00:00)


* CE Loss yielded a solution with a 0.05 loss, but the solution of the Cosine loss has 0.016 value.
* Likewise, if we compare the Cosine loss values of both solution, Cosine's solution is better.

In [17]:
loss2(params1, b), params1

(tensor(0.0159, grad_fn=<NllLossBackward>), Parameter containing:
 tensor([[-2.1671,  3.3889, -1.9374, -2.1818, -2.0619],
         [-1.9968, -1.9011,  3.3446, -2.6202, -2.5366],
         [-2.0952, -2.3512, -1.6061,  3.5575, -2.3109],
         [ 3.3023, -2.1229, -2.2092, -1.9848, -2.3238]], requires_grad=True))

time: 6.32 ms (started: 2021-06-28 18:57:55 +00:00)


In [16]:
loss1(params2, b), params2

(tensor(0.0266, grad_fn=<MeanBackward0>), Parameter containing:
 tensor([[-1.8339,  3.0556, -1.6042, -1.8486, -1.7286],
         [-1.7676, -1.6719,  3.1154, -2.3910, -2.3074],
         [-1.9003, -2.1563, -1.4112,  3.3626, -2.1159],
         [ 2.9421, -1.7627, -1.8490, -1.6246, -1.9636]], requires_grad=True))

time: 7.28 ms (started: 2021-06-28 18:57:53 +00:00)
