In [1]:
import numpy as np
from scipy.optimize import linear_sum_assignment

In [2]:
def stabilize(confidence, very_small_value=1e-8, dtype=np.float32):
    """Clamp confidence values so that taking their logarithm stays finite.

    Values above 1.0 become 1.0 and values at or below 0.0 become
    ``very_small_value``; everything else (including NaN) passes through.

    The input is never modified: the clamping happens on a float64 working
    copy, so the caller's array keeps its values and integer inputs do not
    truncate ``very_small_value`` back to 0.

    :param confidence: array-like (or scalar) of confidence values
    :param very_small_value: replacement for values <= 0.0
    :param dtype: dtype of the returned array
    :return: a new array of ``dtype`` with the same shape as ``confidence``
    """
    values = np.asarray(confidence, dtype=np.float64)

    # np.where builds new arrays, which is what keeps the caller's data untouched
    capped = np.where(values > 1.0, 1.0, values)
    floored = np.where(capped <= 0.0, very_small_value, capped)

    return floored.astype(dtype)

In [None]:
    """Loss function for free-form text detection as per the paper:
       https://www.teklia.com/wp-content/uploads/2018/07/IJDAR2018_publishedVersion.pdf

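    :param predictions: float matrix [batch_size, num_predictions, 5]; the '5' is
        x, y, height, width & confidence of each prediction of each batch item
    :param truths: float matrix [batch_size, num_ground_truths, 4]; the '4' is
        x, y, height & width of each ground truth of each batch item
    :param alpha1: weight of the localization distance in the matching cost matrix
    :param alpha2: weight of the localization distance in the loss of the matched pairs
    :return: the loss summed over all batch items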
    """
    
ENH: Loss function for free-form text detection as per the Moysset et al. paper (IJDAR 2018).

In [None]:

def lossx(predictions, truths, alpha1=1.0, alpha2=1.0):
    """Matching-based loss for free-form text detection, summed over a batch.

    For every batch item the predictions are assigned one-to-one to the ground
    truths with the Hungarian algorithm, using
    ``alpha1 * ||box - truth|| - log(confidence)`` as the pairing cost.
    The item then contributes

    * ``alpha2 * ||box - truth|| - log(confidence)`` for every matched pair, and
    * ``-log(1 - confidence)`` for every prediction left without a ground truth
      (so unmatched predictions are pushed towards low confidence).

    Ground truths that stay unmatched (more truths than predictions) add nothing.

    NOTE(review): this notebook also defines ``detection_loss`` with the same
    signature; keep the two in sync or drop one of them.

    :param predictions: float matrix with the dimensions
        [batch_size, num_predictions, 5]; the '5' represents x, y, height,
        width & confidence of each prediction of each batch item.
        num_predictions is the same for all items of a batch.
    :param truths: one entry per batch item, each with the dimensions
        [num_ground_truths, 4]; the '4' represents x, y, height & width.
        num_ground_truths may differ between items, so ragged nested lists
        (or an object array of them) are accepted.
    :param alpha1: weight of the localization distance in the matching cost
    :param alpha2: weight of the localization distance in the final loss
    :return: the summed loss as a NumPy float64 scalar, or ``float('inf')``
        when no batch item has any prediction
    :raises AssertionError: if predictions and truths have different batch sizes
    """

    assert len(predictions) == len(truths), 'Batch size of predictions and ground truths do not match'

    the_loss = None

    for item_predictions, item_truths in zip(predictions, truths):

        # Convert every batch item on its own: a single np.array() over the
        # whole batch cannot represent differing ground-truth counts.
        # np.array() copies, so the caller's data is never written to, and the
        # float dtype keeps integer inputs from truncating the confidences.
        item_predictions = np.array(item_predictions, dtype=np.float64)
        item_truths = np.asarray(item_truths, dtype=np.float64)

        num_preds = len(item_predictions)
        num_truths = len(item_truths)

        if num_preds == 0:
            continue

        boxes = item_predictions[:, :4]

        # For numerical stability; stabilize() hands back float32, widen it
        # again so the logarithms of the matched terms are taken in float64.
        confidence = stabilize(item_predictions[:, 4]).astype(np.float64)
        one_minus_confidence = stabilize(1.0 - confidence)

        if the_loss is None:
            the_loss = np.float64(0.0)

        if num_truths == 0:
            # If there are no ground truths, we want all the predictions to have very low
            # confidence (and that's all for the loss for this batch item)
            the_loss += -np.log(one_minus_confidence).sum()
            continue

        # Create the cost matrix: predictions vs ground truths
        ######################################################

        # the confidence term only depends on the prediction, compute it once
        confidence_cost = -np.log(confidence)

        # distances[p, t] is the euclidean distance between prediction p and truth t
        distances = np.linalg.norm(boxes[:, np.newaxis, :] - item_truths[np.newaxis, :, :], axis=2)
        cost_matrix = alpha1 * distances + confidence_cost[:, np.newaxis]

        # run the hungarian algorithm
        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        # Calculate the loss
        ######################################################

        # which predictions map to a (any) ground truth?
        matched = np.zeros(num_preds, dtype=bool)
        matched[row_ind] = True

        # for all predictions which did not map to any truth, we want their confidence to be low
        the_loss += -np.log(one_minus_confidence[~matched]).sum()

        # now add the cost of the matches; summing keeps the loss a scalar
        # even when some ground truth received no prediction
        the_loss += (alpha2 * distances[row_ind, col_ind] + confidence_cost[row_ind]).sum()

    return the_loss if the_loss is not None else float('inf')

In [3]:

def detection_loss(predictions, truths, alpha1=1.0, alpha2=1.0):
    """Matching-based loss for free-form text detection, summed over a batch.

    For every batch item the predictions are assigned one-to-one to the ground
    truths with the Hungarian algorithm, using
    ``alpha1 * ||box - truth|| - log(confidence)`` as the pairing cost.
    The item then contributes

    * ``alpha2 * ||box - truth|| - log(confidence)`` for every matched pair, and
    * ``-log(1 - confidence)`` for every prediction left without a ground truth
      (so unmatched predictions are pushed towards low confidence).

    Ground truths that stay unmatched (more truths than predictions) add nothing.

    :param predictions: float matrix with the dimensions
        [batch_size, num_predictions, 5]; the '5' represents x, y, height,
        width & confidence of each prediction of each batch item.
        num_predictions is the same for all items of a batch (as determined by
        the image sizes in the batch and the number of bounding boxes we want
        to predict per location).
    :param truths: one entry per batch item, each with the dimensions
        [num_ground_truths, 4]; the '4' represents x, y, height & width.
        num_ground_truths may differ between images, so ragged nested lists
        (or an object array of them) are accepted.
    :param alpha1: weight of the localization distance in the matching cost
    :param alpha2: weight of the localization distance in the final loss
    :return: the summed loss as a NumPy float64 scalar, or ``float('inf')``
        when no batch item has any prediction
    :raises AssertionError: if predictions and truths have different batch sizes
    """

    assert len(predictions) == len(truths), 'Batch size of predictions and ground truths do not match'

    the_loss = None

    for item_predictions, item_truths in zip(predictions, truths):

        # Convert every batch item on its own: a single np.array() over the
        # whole batch cannot represent differing ground-truth counts.
        # np.array() copies, so the caller's data is never written to, and the
        # float dtype keeps integer inputs from truncating the confidences.
        item_predictions = np.array(item_predictions, dtype=np.float64)
        item_truths = np.asarray(item_truths, dtype=np.float64)

        num_preds = len(item_predictions)
        num_truths = len(item_truths)

        if num_preds == 0:
            continue

        boxes = item_predictions[:, :4]

        # For numerical stability; stabilize() hands back float32, widen it
        # again so the logarithms of the matched terms are taken in float64.
        confidence = stabilize(item_predictions[:, 4]).astype(np.float64)
        one_minus_confidence = stabilize(1.0 - confidence)

        if the_loss is None:
            the_loss = np.float64(0.0)

        if num_truths == 0:
            # If there are no ground truths, we want all the predictions to have very low
            # confidence (and that's all for the loss for this batch item)
            the_loss += -np.log(one_minus_confidence).sum()
            continue

        # Create the cost matrix: predictions vs ground truths
        ######################################################

        # the confidence term only depends on the prediction, compute it once
        confidence_cost = -np.log(confidence)

        # distances[p, t] is the euclidean distance between prediction p and truth t
        distances = np.linalg.norm(boxes[:, np.newaxis, :] - item_truths[np.newaxis, :, :], axis=2)
        cost_matrix = alpha1 * distances + confidence_cost[:, np.newaxis]

        # run the hungarian algorithm
        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        # Calculate the loss
        ######################################################

        # which predictions map to a (any) ground truth?
        matched = np.zeros(num_preds, dtype=bool)
        matched[row_ind] = True

        # for all predictions which did not map to any truth, we want their confidence to be low
        the_loss += -np.log(one_minus_confidence[~matched]).sum()

        # now add the cost of the matches; summing keeps the loss a scalar
        # even when some ground truth received no prediction
        the_loss += (alpha2 * distances[row_ind, col_ind] + confidence_cost[row_ind]).sum()

    return the_loss if the_loss is not None else float('inf')


In [4]:
# Mismatched batch sizes: one batch item of predictions against an empty batch of truths.
p = np.array([[[10, 20, 30, 40, 0.001], [1, 2, 3, 4, .001]]])
g = np.array([])

# The AssertionError is what this cell demonstrates, so catch and display it
# instead of letting it abort a "Restart & Run All".
try:
    detection_loss(p, g)
except AssertionError as exc:
    print('{}: {}'.format(type(exc).__name__, exc))
else:
    raise RuntimeError('detection_loss accepted mismatched batch sizes')

AssertionError: Batch size of predictions and ground truths do not match

In [5]:
# No ground truths: the loss reduces to -sum(log(1 - confidence)) over all predictions.
p = np.array([[[10, 20, 30, 40, 0.001], [1, 2, 3, 4, .001]]])
g = np.array([[]])

calculated = detection_loss(p, g)
expected = -np.log(1 - np.squeeze(p)[:, 4]).sum()
print(calculated)
print(expected)

# detection_loss runs the confidences through stabilize(), which returns float32,
# so the two numbers only agree approximately.
np.testing.assert_allclose(calculated, expected, rtol=1e-3)

0.0020009749568998814
0.0020010006671670687


In [6]:
# No predictions at all: no batch item contributes, so detection_loss falls back to infinity.
g = np.array([[[10, 20, 30, 40], [1, 2, 3, 4]]])
p = np.array([[]])

calculated = detection_loss(p, g)
print(calculated)

assert np.isinf(calculated)


inf


In [22]:
# A fully confident prediction sitting exactly on the ground truth, plus a
# zero-confidence leftover prediction, should cost nothing.
p = np.array([[[10, 20, 30, 40, 0.0], [1, 2, 3, 4, 1]]])
g = np.array([[[1., 2, 3, 4.]]])

calculated = np.round(detection_loss(p, g), 4)
expected = 0.0
print(calculated, expected)

assert calculated == expected



0.0 0.0


In [26]:
# Both predictions are equally confident; prediction 1 is the closer box, so it
# is matched to the ground truth and prediction 0 is the leftover.
prediction = np.array([[[10, 20, 30, 40, .9], [1, 2, 3, 40, .9]]])
truth = np.array([[[1., 2, 3, 4.]]])

calculated = detection_loss(prediction, truth)
expected = (
    np.linalg.norm(truth[0][0] - prediction[0][1][:4])  # distance of the matched pair
    + -np.log(prediction[0][1][4])                       # confidence of the matched prediction
    + -np.log(1 - prediction[0][0][4])                   # leftover prediction should be unconfident
)
print(expected, calculated)

# detection_loss works on float32-rounded confidences, hence the tolerance
np.testing.assert_allclose(calculated, expected, rtol=1e-4)

38.40794560865187 38.40794542869968


In [30]:
prediction = np.array([[[10, 20, 30, 40, 0.0001], [1, 2, 3, 4, .999]],
                       [[1, 1, 1, 2, .98], [3, 4, 5, 6, .88]]])

# The two batch items have different numbers of ground truths, so the batch is
# ragged. dtype=object keeps it a length-2 array of nested lists; recent NumPy
# releases refuse to build a ragged array implicitly.
truth = np.array([[[1., 2, 3, 4.]],
                  [[1, 1, 1, 2], [3, 4, 5, 6]]], dtype=object)

# Use this cell's own inputs (not p / g left over from an earlier cell).
calculated = detection_loss(prediction, truth)

# item 0: prediction 1 matches the only truth, prediction 0 is the leftover
expected = (
    np.linalg.norm(np.subtract(truth[0][0], prediction[0][1][:4]))
    + -np.log(prediction[0][1][4])
    + -np.log(1 - prediction[0][0][4])
)
# item 1: both predictions sit exactly on their ground truths
expected += np.linalg.norm(np.subtract(truth[1][0], prediction[1][0][:4])) + -np.log(prediction[1][0][4])
expected += np.linalg.norm(np.subtract(truth[1][1], prediction[1][1][:4])) + -np.log(prediction[1][1][4])

print(calculated, expected)

# detection_loss works on float32-rounded confidences, hence the tolerance
np.testing.assert_allclose(calculated, expected, rtol=1e-4)


0.1491365738218763 0.14913658416132122


In [None]:
# Two predictions sit exactly on the two ground truths; the third, low-confidence
# prediction is the leftover.
p = np.array([[[1, 1, 1, 2, 0.9], [3, 4, 5, 6, .9], [33, 44, 55, 66, .01]]])
g = np.array([[[1, 1, 1, 2], [3, 4, 5, 6]]])

# the loss function defined in this notebook is detection_loss (there is no `loss`)
calculated = detection_loss(p, g, alpha1=1)
print(calculated)