<a href="https://colab.research.google.com/github/annadymanus/IR-project/blob/main/model_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluation of Models

Metrics:
- Mean Reciprocal Rank at k
- (normalized) Discounted Cumulative Gain
- Pairwise Accuracy

## 1. Import libs and read files

In [1]:
# Mount Google Drive (Colab only): the data paths used below all live under
# /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pickle5
import pickle5 as pickle
import pandas as pd 


# Relevance judgements ("gold standard") used as ground truth by every metric
# computed in this notebook.
gold_standard_path = '/content/drive/Shareddrives/IRProject/validation/2019qrels-docs.txt'

# The file is parsed as space-separated values; because `names` is given and
# no header is specified, the first line is treated as data, not as a header.
# NOTE(review): 'Q0' looks like the constant filler column of TREC-style
# qrels files -- confirm against the file.
gold_standard = pd.read_csv(
    gold_standard_path, 
    sep=' ', 
    names=[
        'queryid', 
        'Q0', 
        'docid', 
        'rating',
    ],
)

# Directory holding one pickle of predictions per model.
models_base_path = '/content/drive/Shareddrives/IRProject/model_predictions'

def format_model_predictions(raw_model_predictions):
    """
    Flatten raw nested predictions into per-query lists of (docid, score).

    Args:
        raw_model_predictions (dict): Mapping queryid -> {docid: value}, 
            where the score is read from position [0][0] of each value.

    Returns:
        dict: Mapping queryid -> list of tuples (docid, score), following 
            the iteration order of the input dicts.
    """
    return {
        queryid: [
            (docid, nested_score[0][0])
            for docid, nested_score in docs_of_query.items()
        ]
        for queryid, docs_of_query in raw_model_predictions.items()
    }

# Load the stored predictions of every model, one pickle file per model.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so these files must come from a trusted source.
#
# Only the baseline is passed through format_model_predictions; every other
# pickle is used exactly as loaded. Presumably those are already stored as
# {queryid: [(docid, score), ...]} -- confirm against the code that wrote them.

# Baseline
with open(f'{models_base_path}/cosine_similarity.pickle', 'rb') as file:
    model_cosine_similarity_raw = pickle.load(file)
    model_cosine_similarity = format_model_predictions(model_cosine_similarity_raw)

# Pointwise
with open(f'{models_base_path}/tf_idf_pointwise_preds.pickle', 'rb') as file:
    model_tf_idf = pickle.load(file)

with open(f'{models_base_path}/tf_idf_pointwise_scoring_preds.pickle', 'rb') as file:
    model_tf_idf_scoring = pickle.load(file)

with open(f'{models_base_path}/bart_tokenized_pointwise_preds.pickle', 'rb') as file:
    model_bart_tokenized = pickle.load(file)

with open(f'{models_base_path}/bart_tokenized_word_pointwise_preds.pickle', 'rb') as file:
    model_bart_tokenized_word = pickle.load(file)

with open(f'{models_base_path}/bart_tokenized_pointwise_scoring_preds.pickle', 'rb') as file:
    model_bart_tokenized_scoring = pickle.load(file)

# Note the file name puts "pointwise" before "word" here, unlike the
# non-scoring bart_tokenized_word file above.
with open(f'{models_base_path}/bart_tokenized_pointwise_word_scoring_preds.pickle', 'rb') as file:
    model_bart_tokenized_word_scoring = pickle.load(file)

with open(f'{models_base_path}/non_cont_word_emb_pointwise_preds.pickle', 'rb') as file:
    model_non_cont_word_emb = pickle.load(file)

with open(f'{models_base_path}/non_cont_word_emb_pointwise_scoring_preds.pickle', 'rb') as file:
    model_non_cont_word_emb_scoring = pickle.load(file)

# Pairwise
with open(f'{models_base_path}/tf_idf_pairwise_preds.pickle', 'rb') as file:
    model_tf_idf_pairwise = pickle.load(file)

with open(f'{models_base_path}/tf_idf_pairwise_scoring_preds.pickle', 'rb') as file:
    model_tf_idf_pairwise_scoring = pickle.load(file)

with open(f'{models_base_path}/bart_tokenized_pairwise_preds.pickle', 'rb') as file:
    model_bart_tokenized_pairwise = pickle.load(file)

with open(f'{models_base_path}/non_cont_word_emb_pairwise_preds.pickle', 'rb') as file:
    model_non_cont_word_emb_pairwise = pickle.load(file)

with open(f'{models_base_path}/non_cont_word_emb_pairwise_scoring_preds.pickle', 'rb') as file:
    model_non_cont_word_emb_pairwise_scoring = pickle.load(file)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## 2. Evaluation functions

In [3]:
def rank_k_documents(query_results, k=None):
    """
    Order a query's results by descending model score, keep only the first 
    (best-scored) occurrence of each docid, and optionally truncate to k.

    Args:
        query_results (list): List of tuples (docid, score) for a single 
            queryid, in any order.
        k (int): optional cutoff. Only applied when it is an int; any other 
            value (e.g. None) leaves the ranking untruncated.

    Returns:
        list[str]: docids from best to worst score, without duplicates, 
            with at most k entries when k is given.
    """
    by_score_desc = sorted(
        query_results,
        key=lambda entry: entry[1],
        reverse=True,
    )

    # Walk the ranking once, keeping a docid only the first time it shows up
    # (i.e. at its highest score).
    unique_docids = []
    already_seen = set()
    for entry in by_score_desc:
        docid = entry[0]
        if docid in already_seen:
            continue
        already_seen.add(docid)
        unique_docids.append(docid)

    if isinstance(k, int) and len(unique_docids) > k:
        unique_docids = unique_docids[:k]

    return unique_docids

# Example: the 10 best-scored documents for query 156493 according to the
# pointwise tf-idf predictions
rank_k_documents(model_tf_idf['156493'], 10)

['D3356946',
 'D685712',
 'D841097',
 'D2186075',
 'D3523176',
 'D1870970',
 'D1146968',
 'D683584',
 'D1407122',
 'D1604072']

In [4]:
def get_rating_queryid_docid(queryid, docid: str, gold_standard):
    """
    Search for the gold standard rating given for a specific pair of 
    `queryid` and `docid`.

    Pairs that do not appear in `gold_standard` (unjudged documents) are 
    treated as non-relevant and rated 0 instead of raising an error, and 
    negative ratings are clamped to 0.

    Args:
        queryid: ID of the query, in either string or integer format.
        docid (str): string with the ID of the document.
        gold_standard (pandas.DataFrame): DataFrame with columns 
            'queryid', 'docid' and 'rating' holding the judged pairs.
    
    Returns:
        int: Pair's rating, never below 0. If the pair is listed more than 
            once, the first listed rating is used.
    """
    matching_ratings = gold_standard.loc[
        (gold_standard['queryid'] == int(queryid))
        & (gold_standard['docid'] == str(docid)),
        'rating',
    ]

    # No judgement for this pair: count it as non-relevant rather than
    # failing with an IndexError on `.values[0]`.
    if matching_ratings.empty:
        return 0

    rating = matching_ratings.values[0]

    return rating if rating > 0 else 0

# Example: gold standard rating of the pair (queryid=156493, docid=D685712)
get_rating_queryid_docid(156493, 'D685712', gold_standard)

1

In [5]:
def get_list_of_relevances(queryid, model_predictions, gold_standard, k=None):
    """
    Turn the model's ranking for `queryid` into the sequence of gold 
    standard ratings of the ranked documents.

    Args:
        queryid: ID of the query, in either string or integer format. It is 
            converted to a string to look up `model_predictions`.
        model_predictions (dict): Dict of queryids and their lists of 
            documents retrieved by the model. Each key should be the queryid 
            in string format, and its value should be a list of tuples 
            (docid, score), not necessarily ordered.
        gold_standard (pandas.DataFrame): DataFrame with true relevant docids
            for each query.
        k (int): optional cutpoint of the results, where metrics should 
            be evaluated.
    
    Returns:
        list[int]: ratings of the ranked documents, best model score first.
    """
    ranked_docids = rank_k_documents(model_predictions[str(queryid)], k)

    return [
        get_rating_queryid_docid(queryid, ranked_docid, gold_standard)
        for ranked_docid in ranked_docids
    ]

# Example: gold standard rating of each of the 10 best-scored results for
# queryid 156493 (pointwise tf-idf predictions)
get_list_of_relevances(156493, model_tf_idf, gold_standard, 10)

[0, 1, 0, 1, 0, 1, 0, 2, 0, 0]

In [6]:
def get_reciprocal_rank(list_of_relevances, relevance_threshold=1):
    """
    Compute the reciprocal rank of a ranked list: 1 divided by the 1-based 
    position of the first document whose rating reaches the threshold.

    Args:
        list_of_relevances (list[int]): list of documents' ratings for 
            the query, ordered by the relevance score given by the model.
        relevance_threshold (int): Minimum rating considered relevant.
    
    Returns:
        float: Reciprocal rank of the list, or 0.0 when no document in the 
            list is relevant.
    """
    first_relevant_rank = next(
        (
            rank
            for rank, rating in enumerate(list_of_relevances, start=1)
            if rating >= relevance_threshold
        ),
        None,
    )

    if first_relevant_rank is None:
        return 0.0

    return 1 / float(first_relevant_rank)

# Example 1: RR for queryid 156493, with relevance threshold = 1:
print(get_reciprocal_rank(
    get_list_of_relevances(
        156493, 
        model_tf_idf, 
        gold_standard, 
        10,
    ),
    1,
))
# Example 2: RR for the same query, but with relevance threshold = 2:
print(get_reciprocal_rank(
    get_list_of_relevances(
        156493, 
        model_tf_idf, 
        gold_standard, 
        10,
    ), 
    2,
))

0.5
0.125


In [7]:
import math

def calculate_dcg(list_of_relevances:list):
    """
    Calculate Discounted Cumulative Gain for a given list of relevances.

    The document at 0-based position i contributes 
    relevance / log2(i + 2), so the first document is not discounted.

    Any iterable of numbers is accepted (list, tuple, array, Series). 
    Previously anything that was not a `list` silently produced None.

    Args:
        list_of_relevances (list[int]): list of documents' ratings for 
            the query, ordered by the relevance score given by the model.
    
    Returns:
        float: Discounted Cumulative Gain of the list (0 for an empty list).
    """
    return sum(
        relevance / math.log2(position + 2)
        for position, relevance in enumerate(list_of_relevances)
    )

def get_ideal_dcg(queryid, gold_standard):
    """
    Calculate Ideal (Maximal) Discounted Cumulative Gain for a `queryid`, given 
    the gold standard ratings.

    The ideal ranking is every judged document of the query ordered by 
    descending rating. Negative ratings are clamped to 0 first, which is 
    the same gain `get_rating_queryid_docid` assigns when a model ranking 
    is scored; without the clamp the ideal value could end up lower than 
    an achievable DCG.

    Args:
        queryid: ID of the query, in either string or integer format.
        gold_standard (pandas.DataFrame): DataFrame with true relevant docids
            for each query.
    
    Returns:
        float: Ideal Discounted Cumulative Gain for the query (0 when the 
            query has no judged documents).
    """
    query_ratings = gold_standard.loc[
        gold_standard['queryid'] == int(queryid),
        'rating',
    ]

    ideal_list_of_relevances = (
        query_ratings
        .clip(lower=0)
        .sort_values(ascending=False)
        .tolist()
    )

    return calculate_dcg(ideal_list_of_relevances)

# Example: nDCG computed by hand -- DCG of an imperfectly ordered list of
# ratings divided by the DCG of the same ratings sorted in descending order
calculate_dcg([0, 2, 4, 0, 1]) / calculate_dcg([4, 2, 1, 0, 0])

0.6332525654008375

In [8]:
def get_pairwise_accuracy(list_of_relevances:list):
    """
    Calculate pairwise accuracy of a given list of relevances.

    Every pair of positions (i, j) with i < j is inspected. The pair is a 
    hit when the higher-ranked document has the strictly higher rating, 
    and a miss when it has the strictly lower rating. Ties are ignored.

    Args:
        list_of_relevances (list[int]): list of documents' ratings for 
            the query, ordered by the relevance score given by the model.
    
    Returns:
        float: hits / (hits + misses). When no pair can be compared (empty 
            list, single document, or all ratings equal) 0.0 is returned 
            instead of raising ZeroDivisionError.
    """
    overall_hits = 0
    overall_miss = 0
    for position, relevance in enumerate(list_of_relevances):
        # Only compare with documents ranked below the current one.
        for lower_ranked in list_of_relevances[position + 1:]:
            if relevance > lower_ranked:
                overall_hits += 1
            elif relevance < lower_ranked:
                overall_miss += 1

    comparable_pairs = overall_hits + overall_miss
    if comparable_pairs == 0:
        return 0.0

    return overall_hits / comparable_pairs

# Example: pairwise accuracy of an imperfectly ordered list of ratings
get_pairwise_accuracy([0, 2, 4, 0, 1])

0.4444444444444444

In [9]:
def get_query_dcg_rr_at_k(
        queryid, 
        model_predictions, 
        gold_standard, 
        k=10,
        relevance_threshold=2
    ):
    """
    Evaluate a single query: Reciprocal Rank at k, DCG, nDCG and pairwise 
    accuracy of the model's ranking against the gold standard.

    Only the reciprocal rank is restricted to the top k results; DCG, nDCG 
    and pairwise accuracy are computed over the query's full ranking.

    Args:
        queryid: Query ID to be evaluated.
        model_predictions (dict): Dict of queryids and their lists of 
            documents retrieved by the model. Each key should be the queryid 
            in string format, and its value should be a list of tuples 
            (docid, score), not necessarily ordered.
        gold_standard (pandas.DataFrame): DataFrame with true relevant docids
            for each query.
        k (int): cutpoint of the results, where Reciprocal Rank should be 
            evaluated. Defaults to 10.
        relevance_threshold (int): Minimum rating considered relevant for 
            Reciprocal Rank. Defaults to 2.
    
    Returns:
        tuple[float, float, float, float]: Tuple (reciprocal rank at k, 
            DCG, nDCG, pairwise accuracy) for the given queryid. nDCG is 0 
            when the ideal DCG of the query is not positive.
    """
    # Reciprocal rank: only the top-k documents are considered.
    top_k_relevances = get_list_of_relevances(queryid, model_predictions, gold_standard, k)
    rr = get_reciprocal_rank(top_k_relevances, relevance_threshold)

    # DCG / nDCG: evaluated on the complete ranking of the query.
    full_relevances = get_list_of_relevances(queryid, model_predictions, gold_standard)
    dcg = calculate_dcg(full_relevances)
    idcg = get_ideal_dcg(queryid, gold_standard)
    if idcg > 0:
        ndcg = dcg / idcg
    else:
        ndcg = 0

    # Pairwise accuracy: also on the complete ranking.
    pairwise_acc = get_pairwise_accuracy(full_relevances)

    return rr, dcg, ndcg, pairwise_acc

# Example: (RR at k, DCG, nDCG, pairwise accuracy) for query 156493, with the
# default k=10 and relevance_threshold=2
get_query_dcg_rr_at_k(156493, model_tf_idf, gold_standard)

(0.125, 30.248643661123698, 0.7263431232004279, 0.5902015487706307)

In [10]:
def get_model_metrics_per_query_at_k(
        model_predictions, 
        gold_standard, 
        k=10,
        relevance_threshold=2
    ):
    """
    Evaluate every query of a model: Reciprocal Rank at k, DCG, nDCG and 
    pairwise accuracy, one record per queryid in `model_predictions`.

    Args:
        model_predictions (dict): Dict of queryids and their lists of 
            documents retrieved by the model. Each key should be the queryid 
            in string format, and its value should be a list of tuples 
            (docid, score). Lists don't need to be ordered.
        gold_standard (pandas.DataFrame): DataFrame with true relevant docids
            for each query.
        k (int): cutpoint of the results, where Reciprocal Rank should be 
            evaluated. Defaults to 10.
        relevance_threshold (int): Minimum rating considered relevant for 
            Reciprocal Rank. Defaults to 2.
    
    Returns:
        list[dict]: One record per queryid with the keys 'queryid', 
            'MRR_at_<k>' (the query's reciprocal rank at k), 'DCG', 'nDCG' 
            and 'pairwise_acc'.
    """
    def _record_for(queryid):
        # Evaluate one query and label its metrics.
        rr, dcg, ndcg, pairwise_acc = get_query_dcg_rr_at_k(
            queryid,
            model_predictions,
            gold_standard,
            k,
            relevance_threshold,
        )
        return {
            'queryid': queryid,
            f'MRR_at_{k}': rr,
            'DCG': dcg,
            'nDCG': ndcg,
            'pairwise_acc': pairwise_acc,
        }

    return [_record_for(queryid) for queryid in model_predictions.keys()]


# Example: per-query metrics of the pointwise tf-idf predictions, shown as a
# DataFrame with one row per query
pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_tf_idf, 
        gold_standard,
    )
)

Unnamed: 0,queryid,MRR_at_10,DCG,nDCG,pairwise_acc
0,156493,0.125,30.248644,0.726343,0.590202
1,1110199,0.25,14.667529,0.70599,0.748724
2,1063750,0.0,69.527123,0.757474,0.502124
3,130510,1.0,14.809537,0.694416,0.55041
4,489204,0.166667,59.081328,0.825832,0.531118
5,573724,1.0,13.979401,0.764868,0.691674
6,1133167,0.25,50.89287,0.788757,0.628537
7,527433,0.1,13.937007,0.599917,0.565527
8,1037798,0.0,7.344312,0.477287,0.4016
9,915593,0.142857,35.677808,0.681891,0.50135


## 3. Evaluate each model

### Baseline

In [11]:
# Per-query metrics of the cosine similarity baseline (default k=10 and
# relevance_threshold=2).
metrics_model_cosine_similarity = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_cosine_similarity, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_cosine_similarity.drop(columns=['queryid']).mean()

MRR_at_10        0.253387
DCG             31.337178
nDCG             0.649092
pairwise_acc     0.581657
dtype: float64

### Pointwise

In [12]:
# Per-query metrics of the pointwise tf-idf predictions (default k=10 and
# relevance_threshold=2).
metrics_model_tf_idf = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_tf_idf, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_tf_idf.drop(columns=['queryid']).mean()

MRR_at_10        0.251800
DCG             31.800888
nDCG             0.667206
pairwise_acc     0.592718
dtype: float64

In [13]:
# Per-query metrics of the pointwise tf-idf "scoring" predictions (default
# k=10 and relevance_threshold=2).
metrics_model_tf_idf_scoring = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_tf_idf_scoring, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_tf_idf_scoring.drop(columns=['queryid']).mean()

MRR_at_10        0.258869
DCG             31.722231
nDCG             0.655401
pairwise_acc     0.587863
dtype: float64

In [14]:
# Per-query metrics of the pointwise bart_tokenized predictions (default k=10
# and relevance_threshold=2).
metrics_model_bart_tokenized = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_bart_tokenized, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_bart_tokenized.drop(columns=['queryid']).mean()

MRR_at_10        0.210084
DCG             33.681145
nDCG             0.650810
pairwise_acc     0.549932
dtype: float64

In [15]:
# Per-query metrics of the pointwise bart_tokenized "scoring" predictions
# (default k=10 and relevance_threshold=2).
metrics_model_bart_tokenized_scoring = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_bart_tokenized_scoring, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_bart_tokenized_scoring.drop(columns=['queryid']).mean()

MRR_at_10        0.231349
DCG             36.038576
nDCG             0.664488
pairwise_acc     0.574078
dtype: float64

In [16]:
# Per-query metrics of the pointwise bart_tokenized_word predictions (default
# k=10 and relevance_threshold=2).
metrics_model_bart_tokenized_word = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_bart_tokenized_word, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_bart_tokenized_word.drop(columns=['queryid']).mean()

MRR_at_10        0.191528
DCG             31.208557
nDCG             0.633229
pairwise_acc     0.556830
dtype: float64

In [17]:
# Per-query metrics of the pointwise bart_tokenized_word "scoring" predictions
# (default k=10 and relevance_threshold=2).
metrics_model_bart_tokenized_word_scoring = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_bart_tokenized_word_scoring, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_bart_tokenized_word_scoring.drop(columns=['queryid']).mean()

MRR_at_10        0.280131
DCG             31.333809
nDCG             0.643832
pairwise_acc     0.567777
dtype: float64

In [18]:
# Per-query metrics of the pointwise non_cont_word_emb predictions (default
# k=10 and relevance_threshold=2).
metrics_model_non_cont_word_emb = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_non_cont_word_emb, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_non_cont_word_emb.drop(columns=['queryid']).mean()

MRR_at_10        0.380159
DCG             32.664117
nDCG             0.676987
pairwise_acc     0.607968
dtype: float64

In [19]:
# Per-query metrics of the pointwise non_cont_word_emb "scoring" predictions
# (default k=10 and relevance_threshold=2).
metrics_model_non_cont_word_emb_scoring = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_non_cont_word_emb_scoring, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_non_cont_word_emb_scoring.drop(columns=['queryid']).mean()

MRR_at_10        0.303027
DCG             32.126986
nDCG             0.663315
pairwise_acc     0.594484
dtype: float64

### Pairwise

In [20]:
# Per-query metrics of the pairwise tf-idf predictions (default k=10 and
# relevance_threshold=2).
metrics_model_tf_idf_pairwise = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_tf_idf_pairwise, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_tf_idf_pairwise.drop(columns=['queryid']).mean()

MRR_at_10        0.252473
DCG             31.706860
nDCG             0.666408
pairwise_acc     0.583277
dtype: float64

In [21]:
# Per-query metrics of the pairwise tf-idf "scoring" predictions (default k=10
# and relevance_threshold=2).
metrics_model_tf_idf_pairwise_scoring = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_tf_idf_pairwise_scoring, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_tf_idf_pairwise_scoring.drop(columns=['queryid']).mean()

MRR_at_10        0.237495
DCG             31.595256
nDCG             0.651772
pairwise_acc     0.583725
dtype: float64

In [22]:
# Per-query metrics of the pairwise bart_tokenized predictions (default k=10
# and relevance_threshold=2).
metrics_model_bart_tokenized_pairwise = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_bart_tokenized_pairwise, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_bart_tokenized_pairwise.drop(columns=['queryid']).mean()

MRR_at_10        0.203092
DCG             31.146306
nDCG             0.646021
pairwise_acc     0.555442
dtype: float64

In [23]:
# Per-query metrics of the pairwise non_cont_word_emb predictions (default
# k=10 and relevance_threshold=2).
metrics_model_non_cont_word_emb_pairwise = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_non_cont_word_emb_pairwise, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_non_cont_word_emb_pairwise.drop(columns=['queryid']).mean()

MRR_at_10        0.318217
DCG             32.100947
nDCG             0.665090
pairwise_acc     0.587289
dtype: float64

In [24]:
# Per-query metrics of the pairwise non_cont_word_emb "scoring" predictions
# (default k=10 and relevance_threshold=2).
metrics_model_non_cont_word_emb_pairwise_scoring = pd.DataFrame(
    get_model_metrics_per_query_at_k(
        model_non_cont_word_emb_pairwise_scoring, 
        gold_standard,
    )
)

# Displayed: mean of each metric over all queries.
metrics_model_non_cont_word_emb_pairwise_scoring.drop(columns=['queryid']).mean()

MRR_at_10        0.331294
DCG             32.295990
nDCG             0.668845
pairwise_acc     0.595777
dtype: float64

## 4. Compare all models

In [25]:
# Comparison table: one row per model holding the mean of each metric over
# all queries, best nDCG first.
#
# Each model name is paired with its metrics frame in a single mapping, so a
# row label can never drift away from its data when a model is added, removed
# or reordered. The previous version labelled rows by integer position and
# passed `names=` to pd.concat, which has no effect without `keys=`.
metrics_by_model = {
    # Baseline
    'cosine_similarity': metrics_model_cosine_similarity,
    # Pointwise
    'tf_idf': metrics_model_tf_idf,
    'tf_idf_scoring': metrics_model_tf_idf_scoring,
    'bart_tokenized': metrics_model_bart_tokenized,
    'bart_tokenized_scoring': metrics_model_bart_tokenized_scoring,
    'bart_tokenized_word': metrics_model_bart_tokenized_word,
    'bart_tokenized_word_scoring': metrics_model_bart_tokenized_word_scoring,
    'non_cont_word_emb': metrics_model_non_cont_word_emb,
    'non_cont_word_emb_scoring': metrics_model_non_cont_word_emb_scoring,
    # Pairwise
    'tf_idf_pairwise': metrics_model_tf_idf_pairwise,
    'tf_idf_pairwise_scoring': metrics_model_tf_idf_pairwise_scoring,
    'bart_tokenized_pairwise': metrics_model_bart_tokenized_pairwise,
    'non_cont_word_emb_pairwise': metrics_model_non_cont_word_emb_pairwise,
    'non_cont_word_emb_pairwise_scoring': metrics_model_non_cont_word_emb_pairwise_scoring,
}

pd.concat(
    [
        per_query_metrics.drop(columns=['queryid']).mean()
        for per_query_metrics in metrics_by_model.values()
    ],
    axis=1,
    keys=list(metrics_by_model),  # column labels; become row labels after .T
).T.sort_values(by='nDCG', ascending=False)

Unnamed: 0,MRR_at_10,DCG,nDCG,pairwise_acc
non_cont_word_emb,0.380159,32.664117,0.676987,0.607968
non_cont_word_emb_pairwise_scoring,0.331294,32.29599,0.668845,0.595777
tf_idf,0.2518,31.800888,0.667206,0.592718
tf_idf_pairwise,0.252473,31.70686,0.666408,0.583277
non_cont_word_emb_pairwise,0.318217,32.100947,0.66509,0.587289
bart_tokenized_scoring,0.231349,36.038576,0.664488,0.574078
non_cont_word_emb_scoring,0.303027,32.126986,0.663315,0.594484
tf_idf_scoring,0.258869,31.722231,0.655401,0.587863
tf_idf_pairwise_scoring,0.237495,31.595256,0.651772,0.583725
bart_tokenized,0.210084,33.681145,0.65081,0.549932


## 5. Investigate easy and hard queries

### Hard queries, according to baseline

In [26]:
# Load the query texts so queryids can be mapped to readable queries. The file
# is parsed as tab-separated and the column names are assigned here.
queries_df = pd.read_csv(
    '/content/drive/Shareddrives/IRProject/msmarco-test2019-queries.tsv', 
    sep='\t',
    names=[
        'queryid', 
        'query', 
    ],
)

queries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   queryid  200 non-null    int64 
 1   query    200 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.2+ KB


In [27]:
# The three queries with the lowest nDCG for the cosine similarity baseline
metrics_model_cosine_similarity.sort_values(by='nDCG').head(3)

Unnamed: 0,queryid,MRR_at_10,DCG,nDCG,pairwise_acc
34,855410,0.0,1.206046,0.1942,0.504802
28,287683,0.0,1.238491,0.260086,0.81383
17,1115776,0.0,3.360276,0.357213,0.659495


In [28]:
# The three queries with the lowest nDCG for the pointwise non_cont_word_emb
# predictions, for comparison with the baseline
metrics_model_non_cont_word_emb.sort_values(by='nDCG').head(3)

Unnamed: 0,queryid,MRR_at_10,DCG,nDCG,pairwise_acc
34,855410,0.0,1.590312,0.256076,0.746098
28,287683,0.0,1.246615,0.261792,0.810284
17,1115776,0.0,3.483412,0.370303,0.689429


In [29]:
# queryids copied by hand from the lowest-nDCG rows displayed above
hard_queries = [855410, 287683, 1115776]

# Look up the text of those queries
queries_df[queries_df['queryid'].isin(hard_queries)]

Unnamed: 0,queryid,query
88,1115776,what is an aml surveillance analyst
130,287683,how many liberty ships were built in brunswick
155,855410,what is theraderm used for


### Easy queries, according to baseline

In [30]:
# The three queries with the highest nDCG for the cosine similarity baseline
# (ascending sort, so the best query is the last row)
metrics_model_cosine_similarity.sort_values(by='nDCG').tail(3)

Unnamed: 0,queryid,MRR_at_10,DCG,nDCG,pairwise_acc
37,47923,1.0,109.153624,0.81724,0.564757
4,489204,0.5,58.470244,0.81729,0.532042
24,1114819,1.0,111.263923,0.843065,0.582652


In [31]:
# Text of the baseline's three highest-nDCG queries (queryids copied by hand
# from the rows displayed above)
queries_df[queries_df['queryid'].isin([47923, 489204, 1114819])]

Unnamed: 0,queryid,query
42,489204,right pelvic pain causes
115,1114819,what is durable medical equipment consist of
160,47923,axon terminals or synaptic knob definition


In [32]:
# The three queries with the highest nDCG for the pointwise non_cont_word_emb
# predictions (ascending sort, so the best query is the last row)
metrics_model_non_cont_word_emb.sort_values(by='nDCG').tail(3)

Unnamed: 0,queryid,MRR_at_10,DCG,nDCG,pairwise_acc
25,183378,0.5,114.099019,0.859406,0.582218
26,1106007,0.5,44.45944,0.867105,0.685044
24,1114819,1.0,118.229078,0.895841,0.618744


In [33]:
# Text of the three highest-nDCG queries of non_cont_word_emb (queryids copied
# by hand from the rows displayed above)
queries_df[queries_df['queryid'].isin([183378, 1106007, 1114819])]

Unnamed: 0,queryid,query
115,1114819,what is durable medical equipment consist of
120,183378,exons definition biology
125,1106007,define visceral?
