In [144]:
import numpy as np
from scipy.stats import chi2, norm, expon

### Implement Ranking Metrics
#### Normalized Discounted Cumulative Gain
##### Algorithm: 
1) Calculate DCG (Discounted Cumulative Gain) at $k$:

$$ \text{DCG}_k = \sum_{i=1}^{k} \frac{rel_i}{\log_2(i + 1)} $$

2) Calculate ideal DCG (IDCG) — This is the DCG calculated for the ideal (descending sorted) relevance list:

$$ \text{IDCG}_k = \sum_{i=1}^{k} \frac{rel_i^{*}}{\log_2(i + 1)} $$

3) Calculate normalized DCG (NDCG):

$$ \text{NDCG}_k = \frac{\text{DCG}_k}{\text{IDCG}_k} $$

#### F_beta score

1) Compute precision at top k using the formula

$$
\text{Precision@k} = \frac{\text{number of recommended relevant items among top k}}{\text{number of recommended items k}}
$$

2) Compute recall at top k using the formula
$$
\text{Recall@k} = \frac{\text{number of recommended relevant items among top k}}{\text{number of all relevant items in the system}}
$$

3) Calculate the F_beta score at top k using the formula:

$$ F_\beta = (1 + \beta^2) \cdot \frac{\text{Precision@k} \cdot \text{Recall@k}}{(\beta^2 \cdot \text{Precision@k}) + \text{Recall@k}} $$


#### Reciprocal Rank

1) Find position of first relevant result
2) Calculate
$$ \text{RR} = \frac{1}{\text{rank of first relevant result}} $$

In [145]:
class Ranking:
    """Ranking-quality metrics for one ordered list of relevance labels.

    ``search_result`` holds the relevance of each returned item in ranked
    order (index 0 is the top result). NDCG uses the values directly as
    gains; precision, recall, F-beta and reciprocal rank treat the labels as
    binary (1 = relevant, 0 = not relevant).
    """

    def __init__(self, search_result):
        """Store the ranked relevance labels (list or 1-D array)."""
        self.result = search_result

    def _discounted_cumulative_gain(self, result, k):
        """Return DCG@k, the sum of rel_i / log2(i + 1) over the top k items.

        A list shorter than ``k`` is scored over the positions it actually
        has, and a non-positive ``k`` yields 0.0.
        """
        if k <= 0:
            return 0.0
        gains = np.asarray(result, dtype=float)[:k]
        # Size the discounts from the sliced gains, not from k, so that
        # k > len(result) does not produce mismatched array shapes.
        discounts = np.log2(np.arange(2, len(gains) + 2))
        return np.sum(gains / discounts)

    def normalized_discounted_cumulative_gain(self, k):
        """Return NDCG@k = DCG@k / IDCG@k, or 0.0 when IDCG@k is zero."""
        # The ideal ordering is the same labels sorted from most to least relevant.
        ideal_result = sorted(self.result, reverse=True)
        dcg = self._discounted_cumulative_gain(self.result, k)
        idcg = self._discounted_cumulative_gain(ideal_result, k)
        if idcg == 0:
            return 0.0
        return dcg / idcg

    def _relevant_in_top(self, k):
        """Return the sum of the relevance labels in the top ``k`` positions."""
        return np.sum(np.asarray(self.result)[:k])

    def _precision(self, k):
        """Return Precision@k (relevant items in the top k divided by k).

        The denominator is always ``k``, even when fewer than ``k`` results
        exist. A non-positive ``k`` yields 0.0 instead of dividing by zero.
        """
        if k <= 0:
            return 0.0
        return self._relevant_in_top(k) / k

    def _recall(self, k):
        """Return Recall@k (relevant items in the top k / all relevant items).

        Yields 0.0 when the list has no relevant items or ``k`` is
        non-positive, instead of dividing by zero.
        """
        total_relevant = np.sum(np.asarray(self.result))
        if k <= 0 or total_relevant == 0:
            return 0.0
        return self._relevant_in_top(k) / total_relevant

    def F_beta_score(self, k, beta):
        """Return the F-beta score at ``k``.

        ``beta`` weights recall against precision (``beta < 1`` favours
        precision). Returns 0.0 when the denominator
        ``beta**2 * precision + recall`` is zero.
        """
        precision = self._precision(k)
        recall = self._recall(k)
        denominator = (beta ** 2 * precision) + recall
        # Guard the real denominator so 0/0 can never reach the division.
        if denominator == 0:
            return 0.0
        return ((1 + beta ** 2) * precision * recall) / denominator

    def reciprocal_rank(self):
        """Return 1 / rank of the first item labelled 1, or 0.0 if there is none."""
        hits = np.flatnonzero(np.asarray(self.result) == 1)
        if hits.size == 0:
            # No relevant item was retrieved; the reciprocal rank is taken as 0.
            return 0.0
        # Positions are 0-based, ranks are 1-based.
        return 1 / (hits[0] + 1)


### Function that generates search results using one of several random distributions.
- Chi-Squared distribution
- Normal distribution
- Exponential distribution
### Steps:
1) Generate continuous values (float) from a distribution.
2) Binarize them using a threshold at the 75th percentile. (The top 25% will be considered relevant.)

In [146]:
def Generate_search_result(size, distribution="chi2", percentile=75, random_state=None):
    """Simulate binary relevance labels for a ranked list of ``size`` results.

    Continuous scores are drawn from the chosen distribution and binarized:
    a score strictly above the ``percentile``-th percentile of the sample is
    labelled 1 (relevant), everything else 0.

    Parameters
    ----------
    size : int
        Number of search results to generate.
    distribution : {"chi2", "norm", "exponential"}
        Score distribution: chi-squared with 2 degrees of freedom, standard
        normal, or exponential with scale 1.
    percentile : float, default 75
        Percentile of the sampled scores used as the relevance threshold.
        The default marks roughly the top 25% of results as relevant.
    random_state : optional
        Forwarded to scipy's ``rvs`` for reproducible draws. ``None`` keeps
        the default (unseeded) behaviour.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 relevance labels in the order they were drawn.

    Raises
    ------
    ValueError
        If ``distribution`` is not one of the supported names.
    """
    # One sampler per supported name; only the chosen one is ever called.
    samplers = {
        "chi2": lambda: chi2.rvs(df=2, size=size, random_state=random_state),
        "norm": lambda: norm.rvs(loc=0, scale=1.0, size=size, random_state=random_state),
        "exponential": lambda: expon.rvs(scale=1.0, size=size, random_state=random_state),
    }
    if distribution not in samplers:
        # Fail loudly instead of silently returning None for a misspelled name.
        raise ValueError(
            f"Unknown distribution {distribution!r}; expected one of {sorted(samplers)}"
        )

    scores = samplers[distribution]()
    threshold = np.percentile(scores, percentile)
    return (scores > threshold).astype(int)


### Evaluate ranking metrics for search results generated from 3 different distributions.

In [169]:
# Draw one simulated result list per distribution (30 results each), in the
# order chi2 -> norm -> exponential.
chi_distribution, norm_distribution, expon_distribution = (
    Generate_search_result(30, distribution=dist_name)
    for dist_name in ("chi2", "norm", "exponential")
)

# Wrap each result list so the ranking metrics can be computed on it.
chi_ranking, norm_ranking, expon_ranking = map(
    Ranking, (chi_distribution, norm_distribution, expon_distribution)
)

# Row label printed in front of every metric value, paired with its ranking.
labelled_rankings = (
    ("Chi2 distribution search result:", chi_ranking),
    ("Normal distriburion search result:", norm_ranking),
    ("Exponential distribution search result:", expon_ranking),
)

# Section header paired with the metric evaluated for every ranking.
metric_sections = (
    (
        "Normalized_discounted_cumulative_gain at 10 for 3 different distribution:",
        lambda ranking: ranking.normalized_discounted_cumulative_gain(10),
    ),
    (
        "F_beta score at 10 for 3 different distribution:",
        lambda ranking: ranking.F_beta_score(10, 0.5),
    ),
    (
        "Reciprocal Rank for 3 different distribution:",
        lambda ranking: ranking.reciprocal_rank(),
    ),
)

for section_index, (header, metric) in enumerate(metric_sections):
    if section_index:
        # Blank line between consecutive metric sections.
        print()
    print(header)
    for label, ranking in labelled_rankings:
        print(label, metric(ranking))

Normalized_discounted_cumulative_gain at 10 for 3 different distribution:
Chi2 distribution search result: 0.14926017916129347
Normal distriburion search result: 0.15291137700635812
Exponential distribution search result: 0.24753108135216337

F_beta score at 10 for 3 different distribution:
Chi2 distribution search result: 0.20833333333333334
Normal distriburion search result: 0.20833333333333334
Exponential distribution search result: 0.3125

Reciprocal Rank for 3 different distribution:
Chi2 distribution search result: 0.1111111111111111
Normal distriburion search result: 0.125
Exponential distribution search result: 0.16666666666666666
