<a href="https://colab.research.google.com/github/annadymanus/IR-project/blob/main/model_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from sklearn.preprocessing import MinMaxScaler
import torch
import numpy

def normalize_scores(model_predictions):
    for key, item in model_predictions.items():
        docs = [i[0] for i in item]
        scores = [torch.sigmoid(torch.tensor(i[1])) for i in item] #Normalize between 0 and 1 (like it was trained on)
        modified_item = [(docs[i], scores[i].item()) for i in range(len(item))]
        model_predictions[key] = modified_item 

In [2]:
!pip install pickle5
import pickle5 as pickle
import pandas as pd 


gold_standard_path = '/content/drive/Shareddrives/IRProject/validation/2019qrels-docs.txt'
model_predictions_path = '/content/drive/Shareddrives/IRProject/model_predictions/tf_idf_pointwise_preds.pickle'

gold_standard = pd.read_csv(
    gold_standard_path, 
    sep=' ', 
    names=[
        'queryid', 
        'Q0', 
        'docid', 
        'rating',
    ],
)

with open(model_predictions_path, 'rb') as file:
    model_predictions = pickle.load(file)

normalize_scores(model_predictions)

Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[K     |████████████████████████████████| 256 kB 5.3 MB/s 
[?25hInstalling collected packages: pickle5
Successfully installed pickle5-0.0.12


In [3]:
def rank_k_documents(query_results, k=None):
    """
    Rank the results of a query based on its descending model score, 
    remove duplicates, and (optionally) cut at max length k.

    Args:
        query_results (list): List of tuples (docid, score) for a 
            certain queryid.
        k (int): optional cutpoint of the results, where metrics should 
            be evaluated.
    
    Returns:
        list[str]: ranked list of docids, with max length k (if defined).
    """
    ranked_results = sorted(query_results, key=lambda tup: tup[1], reverse=True)
    ranked_docids = [result[0] for result in ranked_results]
    ranked_docids = list(dict.fromkeys(ranked_docids))  # remove duplicates

    if isinstance(k, int):
        if len(ranked_docids) > k:
            ranked_docids = ranked_docids[0:k-1]

    return ranked_docids

# Example: top 10 documents for query 156493
rank_k_documents(model_predictions['156493'], 10)

['D3356946',
 'D685712',
 'D841097',
 'D2186075',
 'D3523176',
 'D1870970',
 'D1146968',
 'D683584',
 'D1407122']

In [4]:
def get_rating_queryid_docid(queryid, docid: str, gold_standard):
    """
    Search for the gold standard rating given for a specific pair of 
    `queryid` and `docid`.

    Args:
        queryid: ID of the query, in either string or integer format.
        docid (str): string with the ID of the document.
        gold_standard (pandas.DataFrame): DataFrame with true relevant docids
            for each query.
    
    Returns:
        int: Pair's rating (0-3).
    """
    rating = gold_standard[
        (gold_standard['queryid']==int(queryid))
        & (gold_standard['docid']==str(docid))
    ]['rating'].values[0]

    return rating if rating > 0 else 0

# Example: Rating of (queryid=156493, docid=D685712)
get_rating_queryid_docid(156493, 'D685712', gold_standard)

1

In [5]:
def get_list_of_relevances(queryid, model_predictions, gold_standard, k=None):
    """
    Rank the results of a `queryid` and replace `docids` by their relevance 
    rating for the query.

    Args:
        queryid: ID of the query, in either string or integer format.
        model_predictions (dict): Dict of queryids and their lists of 
            documents retrieved by the model. Each key should by the queryid 
            in string format, and its value should be a list of tuples 
            (docid, score), not necessarily ordered.
        gold_standard (pandas.DataFrame): DataFrame with true relevant docids
            for each query.
        k (int): optional cutpoint of the results, where metrics should 
            be evaluated.
    
    Returns:
        list[int]: list of documents' ratings for the query, ordered by 
            the relevance score given by the model.
    """
    list_of_relevances = []
    for docid in rank_k_documents(model_predictions[str(queryid)], k):
        list_of_relevances.append(
            get_rating_queryid_docid(queryid, docid, gold_standard)
        )

    return list_of_relevances

# Example: actual relevance of each of the first 10 results for queryid 156493
get_list_of_relevances(156493, model_predictions, gold_standard, 10)

[0, 1, 0, 1, 0, 1, 0, 2, 0]

In [6]:
def get_reciprocal_rank(list_of_relevances, relevance_threshold=1):
    """
    Get inverse of the position (reciprocal rank) of the first relevant 
    document in the `list_of_relevances`, based on a relevance threshold.

    Args:
        list_of_relevances (list[int]): list of documents' ratings for 
            the query, ordered by the relevance score given by the model.
        relevance_threshold (int): Miminum rating considered relevant.
    
    Returns:
        float: Reciprocal rank of the list.
    """
    get_reciprocal_rank = 0.0
    for position, relevance in enumerate(list_of_relevances):
        if relevance >= relevance_threshold:
            reciprocal_rank = 1/(position+1.0)
            break

    return reciprocal_rank

# Example 1: RR for queryid 156493, with default relevance threshold:
print(get_reciprocal_rank(
    get_list_of_relevances(156493, model_predictions, gold_standard, 10)
))
# Example 1: RR for the same query, but with relevance threshold = 2:
print(get_reciprocal_rank(
    get_list_of_relevances(156493, model_predictions, gold_standard, 10), 
    2
))

0.5
0.125


In [7]:
import math

def calculate_dcg(list_of_relevances:list):
    """
    Calculate Discounted Cumulative Gain for a given list of relevances.

    Args:
        list_of_relevances (list[int]): list of documents' ratings for 
            the query, ordered by the relevance score given by the model.
    
    Returns:
        float: Discounted Cumulative Gain of the list.
    """
    if isinstance(list_of_relevances, list):
        if len(list_of_relevances)==0:
            return 0
        
        else:
            dcg = []
            for position, relevance in enumerate(list_of_relevances):
                dcg.append(relevance / math.log2(position+2))
        return sum(dcg)

def get_ideal_dcg(queryid, gold_standard):
    """
    Calculate Ideal (Maximal) Discounted Cumulative Gain for a `queryid`, given 
    the gold standard ratings.

    Args:
        queryid: ID of the query, in either string or integer format.
        gold_standard (pandas.DataFrame): DataFrame with true relevant docids
            for each query.
    
    Returns:
        float: Discounted Cumulative Gain of the list.
    """
    ideal_list_of_relevances = gold_standard[
        gold_standard['queryid']==int(queryid)
    ].sort_values(
        by='rating', 
        ascending=False,
    )['rating'].tolist()

    return calculate_dcg(ideal_list_of_relevances)

# Example: DCG of an imperfect list of ratings divided by ideal DCG of same list
calculate_dcg([0, 2, 4, 0, 1]) / calculate_dcg([4, 2, 1, 0, 0])

0.6332525654008375

In [8]:
def get_query_dcg_rr_at_k(
        queryid, 
        model_predictions, 
        gold_standard, 
        k=100,
        relevance_threshold=1
    ):
    """
    Calculate Reciprocal Rank at k, DCG and nDCG for a certain queryid, 
    by comparing its results with gold_standard.

    Args:
        queryid: Query ID to be evaluated.
        model_predictions (dict): Dict of queryids and their lists of 
            documents retrieved by the model. Each key should by the queryid 
            in string format, and its value should be a list of tuples 
            (docid, score), not necessarily ordered.
        gold_standard (pandas.DataFrame): DataFrame with true relevant docids
            for each query.
        k (int): cutpoint of the results, where Reciprocal Rank should be 
            evaluated. Defaults to 100.
        relevance_threshold (int): Miminum rating considered relevant for 
            Reciprocal Rank. Defaults to 1.
    
    Returns:
        tuple[float, float, float]: Tuple (reciprocal rank at k, 
            DCG, nDCG) for the given queryid.
    """
    
    # RR
    list_of_k_relevances = get_list_of_relevances(
        queryid, 
        model_predictions, 
        gold_standard, 
        k
    )
    rr = get_reciprocal_rank(
        list_of_k_relevances, 
        relevance_threshold
    )

    # nDCG
    list_of_relevances = get_list_of_relevances(
        queryid, 
        model_predictions, 
        gold_standard, 
    )
    dcg = calculate_dcg(list_of_relevances)
    idcg = get_ideal_dcg(queryid, gold_standard)
    ndcg = dcg / idcg if idcg > 0 else 0

    return rr, dcg, ndcg

# Example: metrics for query 156493, with default k and relevance threshold
get_query_dcg_rr_at_k(156493, model_predictions, gold_standard)

(0.5, 30.248643661123698, 0.7263431232004279)

In [9]:
def get_model_metrics_per_query_at_k(
        model_predictions, 
        gold_standard, 
        k=100,
        relevance_threshold=1
    ):
    """
    Calculate Mean Reciprocal Rank at k, DCG and nDCG for all queryids
    in dict model_predictions.

    Args:
        model_predictions (dict): Dict of queryids and their lists of 
            documents retrieved by the model. Each key should by the queryid 
            in string format, and its value should be a list of tuples 
            (docid, score). Lists don't need to be ordered.
        k (int): cutpoint of the results, where Reciprocal Rank should be 
            evaluated. Defaults to 100.
        relevance_threshold (int): Miminum rating considered relevant for 
            Mean Reciprocal Rank. Defaults to 1.
    
    Returns:
        list[dict]: List of records (queryid, MRR at k, DCG, nDCG) for 
            all queryids in model_predictions.
    """
    query_metrics = []
    for queryid in model_predictions.keys():
        rr, dcg, ndcg = get_query_dcg_rr_at_k(
            queryid, 
            model_predictions, 
            gold_standard, 
            k, 
            relevance_threshold
        )
        query_metrics.append({
            'queryid': queryid,
            f'MRR_at_{k}': rr,
            'DCG': dcg,
            'nDCG': ndcg,
        })
    
    return query_metrics


model_query_metrics = pd.DataFrame(
    get_model_metrics_per_query_at_k(model_predictions, gold_standard)
)

model_query_metrics

Unnamed: 0,queryid,MRR_at_100,DCG,nDCG
0,156493,0.5,30.248644,0.726343
1,1110199,1.0,14.667529,0.70599
2,1063750,0.2,69.527123,0.757474
3,130510,1.0,14.809537,0.694416
4,489204,1.0,59.081328,0.825832
5,573724,1.0,13.979401,0.764868
6,1133167,0.25,50.89287,0.788757
7,527433,0.5,13.937007,0.599917
8,1037798,0.02381,7.344312,0.477287
9,915593,0.142857,35.677808,0.681891


In [10]:
model_query_metrics.drop(columns=['queryid']).describe()

Unnamed: 0,MRR_at_100,DCG,nDCG
count,43.0,43.0,43.0
mean,0.489935,31.800888,0.667206
std,0.367672,29.130604,0.139239
min,0.02381,1.38914,0.223683
25%,0.2,9.848909,0.601788
50%,0.333333,21.898915,0.70599
75%,1.0,39.632215,0.761669
max,1.0,111.936622,0.848162
