# **evaluation**

**Mean Average Precision(MAP)**

In [2]:
def calculate_map(sorted_documents, releation):
    total_precision = 0
    num_queries = 0

    for query_id, documents in sorted_documents.items():
        # Get the list of positive documents
        positive_list = releation.loc[releation['query_id'] == query_id, 'doc_id'].tolist()

        precision_sum = 0
        num_positives = len(positive_list)

        for i, doc_id in enumerate(documents):
            if doc_id in positive_list:
                precision_sum += precision_at_k(documents[:i+1], positive_list, i+1)

        if num_positives > 0:
            total_precision += precision_sum / num_positives
            num_queries += 1

    return total_precision / num_queries




**precision**

In [3]:
def precision_at_k(ranked_list, positive_list, k):

    if len(ranked_list) > k:
        ranked_list = ranked_list[:k]

    precision = 0
    correct = 0

    for i, p in enumerate(ranked_list):
        if p in positive_list and p not in ranked_list[:i]:
            correct += 1
            precision += correct / (i + 1)

    if not ranked_list:
        return 0

    return precision / len(ranked_list)

In [4]:
def calculate_precision_at_k(sorted_documents, releation, k):
    total_precision = 0
    num_queries = 0

    for query_id, documents in sorted_documents.items():
        # Get the list of positive documents
        positive_list = releation.loc[releation['query_id'] == query_id, 'doc_id'].tolist()

        precision = precision_at_k(documents, positive_list, k)

        total_precision += precision
        num_queries += 1

    return total_precision / num_queries



**mean reciprocal rank**

In [5]:
def calculate_mrr(sorted_documents, releation):
    total_reciprocal_rank = 0
    num_queries = 0

    for query_id, documents in sorted_documents.items():
        # Get the list of positive documents
        positive_list = releation.loc[releation['query_id'] == query_id, 'doc_id'].tolist()

        reciprocal_rank = 0

        for i, doc_id in enumerate(documents):
            if doc_id in positive_list:
                reciprocal_rank = 1 / (i + 1)
                break

        if reciprocal_rank > 0:
            total_reciprocal_rank += reciprocal_rank
            num_queries += 1

    return total_reciprocal_rank / num_queries

# **Retrieving documents using neural semantic vectors**

In [6]:
import pandas as pd


documents = pd.read_csv("hw1_docs.csv")

queries = pd.read_csv("hw1_queries.csv")

releation = pd.read_csv("hw1_qrels.csv")


display(documents)
display(queries)
display(releation)

Unnamed: 0,doc_id,document
0,2p7qrgx0,"Since 2007, many cases of fever, thrombocytope..."
1,25dcnext,BACKGROUND: Respiratory viral (RV) outbreaks i...
2,2jq626ye,A novel coronavirus (2019-nCoV) originating in...
3,270msv5l,• Several studies suggested Baricitinib as a p...
4,14x4uqq7,Evidence from the 2003 SARS epidemic and 2009 ...
...,...,...
745,1ebkagvv,OBJECTIVE: To retrospectively analyze the ches...
746,80dfqjql,Summary The novel human coronavirus SARS-CoV-2...
747,0fzwwluc,Objectives: Patients with novel coronavirus di...
748,105q161g,"A number of virological, epidemiological and e..."


Unnamed: 0,query_id,query
0,1,what is the origin of COVID-19
1,2,how does the coronavirus respond to changes in...
2,3,will SARS-CoV2 infected people develop immunit...
3,4,what causes death from Covid-19?
4,5,what drugs have been active against SARS-CoV o...
5,6,what types of rapid testing for Covid-19 have ...
6,7,are there serological tests that detect antibo...
7,8,how has lack of testing availability led to un...
8,9,how has COVID-19 affected Canada
9,10,has social distancing had an impact on slowing...


Unnamed: 0,query_id,doc_id
0,1,005b2j4b
1,1,0chuwvg6
2,1,0t2a5500
3,1,0y34yxlb
4,1,105q161g
...,...,...
745,50,xhm97wy2
746,50,xieqswct
747,50,y87tq9wu
748,50,ygwdldae


In [7]:
import pandas as pd
import numpy as np
import re
from gensim.utils import simple_preprocess

from gensim.parsing.preprocessing import STOPWORDS

def preprocess(text):
   # Convert text to lowercase
   text = text.lower()
   # Remove special characters
   text = re.sub(r'\W', ' ', text)
   text = re.sub(r'\s+', ' ', text)
   # Tokenize the text
   words = simple_preprocess(text, deacc=True)

   # Remove stop words
   words = [word for word in words if word not in STOPWORDS]


   return words



In [8]:
queries['preprocessed_text'] = queries['query'].apply(preprocess)
documents['preprocessed_text'] = documents['document'].apply(preprocess)


In [9]:
display(documents)
display(queries)
display(releation)

Unnamed: 0,doc_id,document,preprocessed_text
0,2p7qrgx0,"Since 2007, many cases of fever, thrombocytope...","[cases, fever, leukopenia, syndrome, ftls, eme..."
1,25dcnext,BACKGROUND: Respiratory viral (RV) outbreaks i...,"[background, respiratory, viral, rv, outbreaks..."
2,2jq626ye,A novel coronavirus (2019-nCoV) originating in...,"[novel, coronavirus, ncov, originating, wuhan,..."
3,270msv5l,• Several studies suggested Baricitinib as a p...,"[studies, suggested, baricitinib, potential, d..."
4,14x4uqq7,Evidence from the 2003 SARS epidemic and 2009 ...,"[evidence, sars, epidemic, pandemic, shows, fa..."
...,...,...,...
745,1ebkagvv,OBJECTIVE: To retrospectively analyze the ches...,"[objective, retrospectively, analyze, chest, c..."
746,80dfqjql,Summary The novel human coronavirus SARS-CoV-2...,"[summary, novel, human, coronavirus, sars, cov..."
747,0fzwwluc,Objectives: Patients with novel coronavirus di...,"[objectives, patients, novel, coronavirus, dis..."
748,105q161g,"A number of virological, epidemiological and e...","[number, virological, epidemiological, ethnogr..."


Unnamed: 0,query_id,query,preprocessed_text
0,1,what is the origin of COVID-19,"[origin, covid]"
1,2,how does the coronavirus respond to changes in...,"[coronavirus, respond, changes, weather]"
2,3,will SARS-CoV2 infected people develop immunit...,"[sars, cov, infected, people, develop, immunit..."
3,4,what causes death from Covid-19?,"[causes, death, covid]"
4,5,what drugs have been active against SARS-CoV o...,"[drugs, active, sars, cov, sars, cov, animal, ..."
5,6,what types of rapid testing for Covid-19 have ...,"[types, rapid, testing, covid, developed]"
6,7,are there serological tests that detect antibo...,"[serological, tests, detect, antibodies, coron..."
7,8,how has lack of testing availability led to un...,"[lack, testing, availability, led, underreport..."
8,9,how has COVID-19 affected Canada,"[covid, affected, canada]"
9,10,has social distancing had an impact on slowing...,"[social, distancing, impact, slowing, spread, ..."


Unnamed: 0,query_id,doc_id
0,1,005b2j4b
1,1,0chuwvg6
2,1,0t2a5500
3,1,0y34yxlb
4,1,105q161g
...,...,...
745,50,xhm97wy2
746,50,xieqswct
747,50,y87tq9wu
748,50,ygwdldae


# **word embedding**

# make vectors with arithmetic mean

In [10]:
from gensim.models import Word2Vec
from gensim import matutils

# Train Word2Vec model with arithmetic mean
model = Word2Vec(documents['preprocessed_text'],window= 120 , vector_size=200, min_count=1 ,sg=1)

 # Create document vectors with arithmetic mean
doc_vectors = {}
for index, doc in documents['preprocessed_text'].items():
 doc_vector = np.mean([model.wv[word] for word in doc if word in model.wv.key_to_index], axis=0)
 doc_vectors[documents.loc[index, 'doc_id']] = doc_vector # Use document_id as key


# Create query vectors
query_vectors = {}
for index, query in queries['preprocessed_text'].items():
 query_vector = np.mean([model.wv[word] for word in query if word in model.wv.key_to_index], axis=0)
 query_vectors[queries.loc[index, 'query_id']] = query_vector # Use query_id as key



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [11]:
def cosine_similarity(vec1, vec2):
   return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [12]:
related_docs = {}
top_docs ={}
for query, query_vector in query_vectors.items():
 # Calculate cosine similarity between the query vector and each document vector
 similarities = {}
 for doc in doc_vectors.keys():
     # Check if the document vector is not NaN
     if not np.isnan(doc_vectors[doc]).any():
         similarity = cosine_similarity(query_vector, doc_vectors[doc])
         similarities[doc] = similarity

 # Sort the document vectors based on their cosine similarity to the query vector
 sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

 # top 10 most similar docs

 related_docs[query] = sorted_similarities[:10]
top_docs = {query: [doc_tuple[0] for doc_tuple in doc_list] for query, doc_list in related_docs.items()}

In [13]:
print(sorted_similarities)
print(related_docs)

[('1v0f2dtx', 0.91180503), ('q77da2y3', 0.8632516), ('1ir19s25', 0.863077), ('1q71gjwt', 0.8509954), ('2jeb1vcs', 0.84994215), ('2nruf2g7', 0.8481092), ('oiu80002', 0.84278446), ('1c1k0p93', 0.8368475), ('2uvibr2j', 0.8364763), ('4ywrzyse', 0.8358617), ('ygwdldae', 0.83410907), ('1mmqfp7g', 0.832257), ('041cf99j', 0.83225685), ('xeq0dq6u', 0.8297192), ('xieqswct', 0.8278594), ('02q9y011', 0.82785815), ('18fbtlfg', 0.82699937), ('v0m90h3n', 0.82581025), ('0a8sz7zb', 0.8196899), ('2kwfcgz9', 0.8195567), ('2um1w0g2', 0.8154698), ('0o05oskr', 0.8150851), ('0mructd7', 0.8141559), ('0slywdik', 0.8119192), ('0fitbwuv', 0.81051606), ('00z7x46i', 0.8089236), ('094lgjnn', 0.80745465), ('273ppceg', 0.8048625), ('13tc3loo', 0.80440956), ('10qpje4d', 0.80440897), ('19h2i631', 0.8037465), ('ykzsoafe', 0.8016766), ('076qek8o', 0.8015322), ('18yzwa2g', 0.8009664), ('1iio41wn', 0.799008), ('1no1zaty', 0.7976393), ('0i5dcbzz', 0.7964604), ('12o2r9zx', 0.79566383), ('0gier0lu', 0.79490715), ('1cc9ig04', 

In [14]:
print(top_docs)

{1: ['cccqcfgq', '0m5mc320', 'vk8s1f23', '0xkz36bj', '0cq5ee1i', '22fc1qly', 'lj8t52yl', '2y452utz', '0v1appqr', '0t2a5500'], 2: ['1llox90t', '04rbtmmi', '1bxt21za', '0vlzwksu', '03s9spbi', '1k3d3o2q', '0fbmelx0', '28sgnyh1', '0oma7hdu', '0rdq6g0b'], 3: ['2d04geu3', '01q4pu9k', '01mo6yo9', '4e2j89uy', '0mructd7', 'xeq0dq6u', 'kf7yz3oz', 'u35rryzi', '2jeb1vcs', '3zmq7nd5'], 4: ['1box6noa', '4ishl6bj', '4bio5jp5', '1k3j6f79', '0a624nez', '223v2obv', '0376d6vf', '9nbj3ckb', '01es0zv4', '0o92zije'], 5: ['1cc9ig04', '1mmqfp7g', '0wh7x410', '1kzy7rts', '043w3zgy', '1ge2dydz', '02n30zc5', '1mjaycee', '1no1zaty', '2cvvkrx9'], 6: ['0oak9ggm', '0w7tq79d', '0uvzy48c', '05zmldvj', '1a6d8urj', '04pp0o74', '1cew6vn5', '73b2rcn1', '1dbeh8q7', '59prqbb3'], 7: ['1dbeh8q7', '0w7tq79d', '0k5j5h7p', '0beno5o5', '07qsm5pv', '1s0exznp', '17q4g88y', '1huoe4dp', '0tdfvlqd', '0hie9nw7'], 8: ['223v2obv', '0l4pec0z', '0k6r5q1t', '17oac3bg', '0gikppdh', '1b3pigtl', '1jf2zz5q', '0jv5mnnl', '07v9qign', '1109fcvc'],

## evaluate word2vec (arithmetic mean)

**print 10 best documents for a random query (arithmetic mean word2vec model)**

In [15]:
import random
# Select a random query
random_query_id = random.choice(list(top_docs.keys()))


original_query = queries.loc[queries['query_id'] == random_query_id, 'query'].values[0]

# Get the top 10 documents
top_10_documents = top_docs[random_query_id][:10]

print(f"For the query '{original_query}', the top 10 documents are: \n")
for doc_id in top_10_documents:
    document = documents.loc[documents['doc_id'] == doc_id]
    print(f"Document: {document['document'].values[0]}")


For the query 'Does SARS-CoV-2 have any subtypes, and if so what are they?', the top 10 documents are: 

Document: The World Health Organization characterized the COVID-19 as a pandemic in March 2020, the second pandemic of the 21st century. Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) is a positive-stranded RNA betacoronavirus of the family Coronaviridae. Expanding virus populations, as that of SARS-CoV-2, accumulate a number of narrowly shared polymorphisms imposing a confounding effect on traditional clustering methods. In this context, approaches that reduce the complexity of the sequence space occupied by the SARS-CoV-2 population are necessary for a robust clustering. Here, we proposed the subdivision of the global SARS-CoV-2 population into sixteen well-defined subtypes by focusing on the widely shared polymorphisms in nonstructural (nsp3, nsp4, nsp6, nsp12, nsp13 and nsp14) cistrons, structural (spike and nucleocapsid) and accessory (ORF8) genes. Six virus subty

In [16]:
p_at_5_average_mean = calculate_precision_at_k(top_docs, releation, 5)

p_at_10_average_mean  = calculate_precision_at_k(top_docs, releation, 10)

map_average_mean = calculate_map(top_docs, releation)

mrr_average_mean = calculate_mrr(top_docs, releation)


print(f"P@5 with average mean word2vec model: {p_at_5_average_mean}")
print(f"P@10 with average mean word2vec model: {p_at_10_average_mean}")
print(f"Mean Average Precision (MAP) with average mean word2vec model: {map_average_mean}")
print(f"Mean Reciprocal Rank (MRR) with average mean word2vec model: {mrr_average_mean}")


P@5 with average mean word2vec model: 0.5288
P@10 with average mean word2vec model: 0.4111174603174603
Mean Average Precision (MAP) with average mean word2vec model: 0.246592427143697
Mean Reciprocal Rank (MRR) with average mean word2vec model: 0.8258928571428571


# **make vectors with weighted mean**




## Calculate Term Frequency (documents)

In [35]:
def calculate_term_frequncy(term, words):

    term_count = words.count(term)
 #   term_count = term_count + 1
    return term_count


In [36]:
import math

term_frequency = {}

# Iterate over the preprocessed documents
for i, row in documents.iterrows():
    # Get the preprocessed text for the current document
    preprocessed_words = row['preprocessed_text']
    # Calculate the term frequency for each word
    for word in preprocessed_words:
        if word not in term_frequency:
            term_frequency[word] = {}
        term_frequency[word][i] = calculate_term_frequncy(word, preprocessed_words)



In [37]:
print(term_frequency)
print(len(term_frequency))

9489


## Calculate IDF (documents)

In [38]:


def compute_idf( documents , term_frequency ):
    N = len(documents)
    idf = {}
    for word , related_doc in term_frequency.items() :
      idf[word] = math.log10 (N / len(related_doc) )


    return idf


In [39]:
# Compute IDF
idf = compute_idf(documents , term_frequency)
print(idf)

# Sort words by frequency
sorted_idf = sorted(idf.items(), key=lambda x: x[1], reverse=True)


top_words = [word for word, freq in sorted_idf[:9489]]
print(top_words)



## calculate tf-idf for documents

In [40]:
def calculate_tfidf(term_frequency, idf, documents, terms):
   # Create a DataFrame with the word names as the column names
   tfidf = pd.DataFrame(np.zeros((len(documents), len(terms))), columns=terms)

   for word, frequency in term_frequency.items():
       if word in tfidf.columns:
           for doc, freq in frequency.items():
               tfidf.loc[doc, word] = freq * idf[word]

   return tfidf


In [41]:
documents_list = documents['document'].tolist()

tfidf = calculate_tfidf(term_frequency, idf, documents_list, top_words)


In [42]:
display(tfidf)

Unnamed: 0,leukopenia,ftls,henan,tick,bites,granulocytic,anaplasmosis,hga,blastx,deduced,...,respiratory,pandemic,results,infection,patients,cov,disease,sars,coronavirus,covid
0,2.875061,17.250368,8.625184,5.750123,2.875061,2.875061,2.875061,2.875061,2.875061,2.875061,...,0.000000,0.000000,0.846550,0.825327,0.801690,0.000000,0.00000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.920176,0.000000,0.423275,0.412663,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.920176,0.431016,0.000000,1.237990,1.603380,0.000000,0.00000,0.000000,0.282885,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,1.237990,0.801690,0.000000,0.00000,0.000000,0.000000,0.516824
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.431016,0.423275,0.000000,0.000000,0.000000,0.00000,0.296422,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.846550,0.000000,6.012675,0.000000,0.30103,0.000000,0.282885,0.904442
746,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.380264,0.000000,0.000000,0.000000,0.000000,0.308035,0.00000,0.592844,1.131538,0.000000
747,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.423275,0.000000,3.607605,0.308035,1.50515,0.296422,0.282885,0.646030
748,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.616070,0.00000,0.296422,0.000000,0.129206


## Calculate the weighted average of the word vectors (documents)

In [43]:
weighted_word_vectors = []
weight_sum = 0
doc_vectors_weighted = {}
for index, document in documents['preprocessed_text'].items():
 for word in document:
   if word in model.wv.key_to_index:
     weight = tfidf[word][index]
     weighted_word_vectors.append(model.wv[word] * weight)
     weight_sum += weight

 doc_vector = np.sum(weighted_word_vectors, axis=0) / weight_sum
 doc_vectors_weighted[documents.loc[index, 'doc_id']] = doc_vector # Use document_id as key

 # Reset for next doc
 weighted_word_vectors = []
 weight_sum = 0


  doc_vector = np.sum(weighted_word_vectors, axis=0) / weight_sum


In [44]:
print(doc_vectors)

{'2p7qrgx0': array([ 0.11841339,  0.01669252, -0.09465142,  0.17064585,  0.08429345,
       -0.22034793, -0.07071178,  0.13609165, -0.03515284, -0.01695184,
        0.01846288, -0.05304182,  0.1946722 ,  0.1117339 ,  0.04944272,
        0.03732943,  0.10177168, -0.00217721, -0.12141193,  0.01405607,
        0.23537448, -0.06648062,  0.16895652,  0.06783279, -0.01862947,
        0.10342696,  0.10911967, -0.2767583 , -0.12151047, -0.12410448,
       -0.03416025, -0.01117583,  0.13745782,  0.01333167,  0.22850232,
        0.15118149,  0.02547139,  0.14691967,  0.00340421, -0.05280145,
        0.13166675,  0.02989614,  0.11691423,  0.17732073,  0.25101954,
        0.14690569,  0.03933202,  0.12270364,  0.5249039 ,  0.06576829,
        0.19632179, -0.00570116, -0.08442872,  0.12502344,  0.07136313,
       -0.31758076,  0.07360259,  0.05854709, -0.07248177,  0.05725393,
       -0.0922849 ,  0.01526879, -0.03579535, -0.03393559,  0.0384716 ,
       -0.17348067,  0.11172986,  0.016649  ,  0.01

## Calculate Term Frequency (queries)

In [45]:

queries_term_frequency = {}

for i, row in   queries.iterrows():

    preprocessed_words = row['preprocessed_text']

    for word in preprocessed_words:

     if word in top_words:

        if word not in queries_term_frequency:
            queries_term_frequency[word] = {}
        queries_term_frequency[word][i] = calculate_term_frequncy(word, preprocessed_words) # calculate term frequency for queries

## calculate idf for queries

In [46]:
queries_idf = compute_idf(queries , queries_term_frequency) # calculate idf for queries

## calculate tf-idf for queries

In [47]:
queries_tfidf = calculate_tfidf(queries_term_frequency, queries_idf ,queries ,top_words )# calculate tf-idf for queries


## Calculate the weighted average of the word vectors (queries)

In [48]:
weighted_word_vectors=[]
weight_sum = 0
query_vectors_weighted = {}
for index, query in queries['preprocessed_text'].items():

  for word in query :
    if word in model.wv.key_to_index :
      weight = queries_tfidf[word][index]
      weight_sum += weight
      weighted_word_vectors.append(model.wv[word] * weight)

      # Calculate the weighted average of the word vectors

      query_vector = np.sum(weighted_word_vectors, axis=0) / weight_sum
      query_vectors_weighted[queries.loc[index, 'query_id']] = query_vector # Use query_id as key

      #reset for next quey
  weighted_word_vectors = []
  weight_sum = 0


In [49]:
print(query_vectors_weighted)

{1: array([-0.01401896, -0.05795791, -0.12757273, -0.02778928,  0.37406313,
       -0.14268655, -0.2318066 ,  0.23912098, -0.10368825,  0.09384166,
       -0.3274969 ,  0.12325451,  0.08066963, -0.02054494,  0.37836203,
        0.00367562,  0.02001205,  0.10357764,  0.04212765,  0.30376616,
        0.44632104, -0.07517045,  0.5450968 ,  0.38100004, -0.10770222,
        0.06809419,  0.02596502,  0.12239723, -0.14163938,  0.03533106,
       -0.11054648,  0.01800734,  0.26481044, -0.11253392,  0.09911695,
        0.3119177 ,  0.06628881,  0.2726013 ,  0.15931576,  0.07490164,
        0.0907158 ,  0.15177354,  0.23202236,  0.45842242,  0.12537529,
       -0.06988762,  0.16968596, -0.09082288,  0.5283908 , -0.08183175,
        0.38942975,  0.33953995,  0.08788013,  0.06131648,  0.31291962,
       -0.19884662, -0.12710409,  0.3935226 , -0.23318958,  0.13516203,
       -0.1746171 , -0.02569274, -0.2079413 ,  0.31292838, -0.2955054 ,
       -0.35774744,  0.3049461 ,  0.06963567, -0.02430684,  

## evaluate word2vec (weghted mean)

In [32]:
related_docs = {}
top_docs_weightd ={}
for query, query_vector in query_vectors_weighted.items():
 # Calculate cosine similarity between the query vector and each document vector
 similarities = {}
 for doc in doc_vectors_weighted.keys():
     # Check if the document vector is not NaN
     if not np.isnan(doc_vectors_weighted[doc]).any():
         similarity = cosine_similarity(query_vector, doc_vectors_weighted[doc])
         similarities[doc] = similarity

 # Sort the document vectors based on their cosine similarity to the query vector
 sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

 # top 10 most similar docs

 related_docs[query] = sorted_similarities[:10]
top_docs_weightd = {query: [doc_tuple[0] for doc_tuple in doc_list] for query, doc_list in related_docs.items()}

**print 10 best documents for a random query (weighted_mean word2vec model)**

In [33]:
# Select a random query
random_query_id = random.choice(list(top_docs_weightd.keys()))


original_query = queries.loc[queries['query_id'] == random_query_id, 'query'].values[0]

# Get the top 10 documents
top_10_documents = top_docs_weightd[random_query_id][:10]

print(f"For the query '{original_query}', the top 10 documents are: \n")
for doc_id in top_10_documents:
    document = documents.loc[documents['doc_id'] == doc_id]
    print(f"Document: {document['document'].values[0]}")


For the query 'what evidence is there for the value of hydroxychloroquine in treating Covid-19?', the top 10 documents are: 

Document: Given the extreme importance of the current pandemic caused by COVID-19, and as scientists agree there is no identified pharmacological treatment, where possible, therapeutic alternatives are raised through drug repositioning. This paper presents a selection of studies involving drugs from different pharmaceutical classes with activity against SARS-CoV-2 and SARS-CoV, with the potential for use in the treatment of COVID-19 disease.
Document: The unexpected pandemic set off by the novel coronavirus 2019 (COVID-19) has caused severe panic among people worldwide COVID-19 has created havoc, and scientists and physicians are urged to test the efficiency and safety of drugs used to treat this disease In such a pandemic situation, various steps have been taken by government to control and prevent the Severe Acute Respiratory Syndrome coronavirus 2 (SARS-CoV-2

In [34]:
p_at_5_weighted_mean = calculate_precision_at_k(top_docs_weightd, releation, 5)

p_at_10_weighted_mean  = calculate_precision_at_k(top_docs_weightd, releation, 10)

map_weighted_mean = calculate_map(top_docs_weightd, releation)

mrr_weighted_mean = calculate_mrr(top_docs_weightd, releation)


print(f"P@5 with weighted mean word2vec model: {p_at_5_weighted_mean}")
print(f"P@10 with unigram model: {p_at_10_weighted_mean}")
print(f"Mean Average Precision (MAP) with weighted mean word2vec model: {map_weighted_mean}")
print(f"Mean Reciprocal Rank (MRR) with weighted mean word2vec model: {mrr_weighted_mean}")


P@5 with weighted mean word2vec model: 0.5726666666666667
P@10 with unigram model: 0.44195952380952386
Mean Average Precision (MAP) with weighted mean word2vec model: 0.2745814493575208
Mean Reciprocal Rank (MRR) with weighted mean word2vec model: 0.8888888888888887
