Functions

In [103]:
import math

def score_dcg(y, sorted1, k=5, option="sum"):
    """ DCG: Discount Cumulative Gain
    y: labels (0,1)
         - also called relevance
         - could also be multiple classes (0,1,2,3) as long as
    """
    if option=="sum":
        dcg = sum([y[sorted1[i]]/math.log(i+2, 2) for i in range(k)])
    if option=="np.sum":
        dcg = np.sum([y[sorted1[i]]/math.log(i+2, 2) for i in range(k)])
    elif option=="nolog":
        dcg = sum([y[sorted1[i]] for i in range(k)])
    
    return dcg
    
def score_ndcg(y, y_proba, k=5, option="sum"):
     """ NDCG Normalized Discount Cumulative Gain
     y: labels (0,1)
     y_proba: probabilities to be in class 1

     To do:
     1. sort targets by y_proba and get indexes
     1. sort targets by y and get indexes
     2. compute dcg for top k
     2. compute idealized dcg for top k
     Source: https://en.wikipedia.org/wiki/Discounted_cumulative_gain
     """
     if type(y) is np.ndarray:
         pass
     else:
         raise
            
     sorted1 = np.argsort(-y_proba)
     sorted2 = np.argsort(-y)

     dcg1 = score_dcg(y, sorted1, k)
     dcg2 = score_dcg(y, sorted2, k)

     ndcg = dcg1/dcg2 if dcg2 != 0 else np.nan
     return ndcg
    
def score_ndcg_mean(yLabels, yScores, k=5, discounted=True):
     """ Multi output version of score_ndcg
         - yLabels = the actual labels (0,1)
         - yScores = the predicted scores for label 1 (binding)
     """
     if not yLabels.ndim == 2:
         print("Error")
     ndcg = [score_ndcg(yLabels[i], yScores[i], k) for i in range(yLabels.shape[0])]
     return np.mean(ndcg)


In [72]:
import numpy as np
yLabels = np.random.normal(size=(4000,500))
yLabels=np.where(yLabels>0.1,0,1)
yScores = np.random.normal(size=(4000,500))

In [100]:
%timeit score_ndcg(yLabels[0], yScores[0], k=2, option="nolog")

32.2 µs ± 2.28 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [104]:
%timeit score_ndcg(yLabels[0], yScores[0], k=500, option="sum")
%timeit score_ndcg(yLabels[0], yScores[0], k=500, option="nolog")

%timeit score_ndcg(yScores[0], yScores[0], k=500, option="sum")
%timeit score_ndcg(yLabels[0], yScores[0], k=500, option="np.sum")
%timeit score_ndcg(yScores[0], yScores[0], k=500, option="np.sum")

2.3 ms ± 47.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.51 ms ± 335 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
603 µs ± 78 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.39 ms ± 199 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
573 µs ± 12.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [98]:
%timeit score_ndcg(yLabels[0], yScores[0], k=500, option="sum")
%timeit score_ndcg(yScores[0], yScores[0], k=500, option="sum")
%timeit score_ndcg(yScores[0], yScores[1], k=500, option="sum")
%timeit score_ndcg(yScores[0], yLabels[0], k=500, option="sum")

2.25 ms ± 78.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
587 ns ± 5.71 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
586 ns ± 12.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
584 ns ± 14.4 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [85]:
print(yLabels.shape)
print(yLabels.dtype)
print(yScores.shape)
print(yScores.dtype)

(4000, 500)
int64
(4000, 500)
float64


In [88]:
y1=yLabels[0]
y2=yScores[0]
%timeit [y1[sorted1[i]] for i in range(500)]
%timeit [y2[sorted1[i]] for i in range(500)]
%timeit sum([y1[sorted1[i]] for i in range(500)])

105 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
106 µs ± 1.47 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
125 µs ± 1.23 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [94]:
y2.dtype

dtype('float64')

In [71]:
f"{3+2} is 5"

'5 is 5'