In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from scipy import sparse
import csv
import pickle
import numpy as np
from gensim.models import Word2Vec
import nltk

In [17]:
# Load the corpus in its processed and original forms; missing text cells are
# replaced by empty strings at read time.
clean_docs = pd.read_csv('clinical_dataset/processed_docs.csv').fillna('')
original_docs = pd.read_csv('clinical_dataset/original_docs.csv').fillna('')

# Queries and relevance judgements are kept exactly as read (no NaN filling).
# NOTE(review): `clean_queries` is read from original_queries.csv — confirm that
# this is the intended (unprocessed) query file.
clean_queries = pd.read_csv("clinical_dataset/original_queries.csv")
orginal_qrles = pd.read_csv("clinical_dataset/qrels.csv")

In [18]:
def add_doc_to_query_link(query_id, doc_id, links):
    """
    Record that ``doc_id`` is linked to ``query_id`` in the ``links`` mapping.

    Args:
        query_id: Key under which the document ID is filed.
        doc_id: Document ID appended to that key's list.
        links: Dict mapping query IDs to lists of doc IDs. It is updated in
            place; passing ``None`` starts a fresh dict instead.

    Returns:
        The mapping that was updated (the same object as ``links`` unless
        ``links`` was ``None``).
    """
    if links is None:
        links = {}

    # setdefault creates the empty list the first time a query ID is seen.
    links.setdefault(query_id, []).append(doc_id)
    return links

In [19]:

# Build {query_id: [doc_id, ...]} from the qrels table, keeping only the
# judgements whose relevance is strictly positive.
links = {}
for row_label, qrel in orginal_qrles.iterrows():
    if qrel.relevance > 0:
        add_doc_to_query_link(qrel.query_id, qrel.doc_id, links)


In [3]:
# Get the document IDs and contents as separate lists
doc_ids = clean_docs['doc_id'].tolist()
doc_titles = clean_docs['title'].tolist()
doc_summaries = clean_docs['summary'].tolist()
doc_detailed_descriptions = clean_docs['detailed_description'].tolist()


# Create a list of dictionaries for tokenized_docs
tokenized_docs = []

for doc_id, doc_title,doc_summary,doc_detailed_description  in zip(doc_ids, doc_titles,doc_summaries, doc_detailed_descriptions):
    tokenized_doc = {
        'doc_id': doc_id,
        'title': nltk.word_tokenize(doc_title),
        'summary' : nltk.word_tokenize(doc_summary),
        'detailed_description' : nltk.word_tokenize(doc_detailed_description),
    }
    tokenized_docs.append(tokenized_doc)


In [4]:
# Cache the tokenized documents on disk so later sessions can reload them
# instead of tokenizing again. The `with` block closes the file on exit, so no
# explicit close() call is needed afterwards.
with open("clinical_dataset/word_embedding/tokenized_docs.pkl", 'wb') as file:
    pickle.dump(tokenized_docs, file)

In [2]:
# Reload the tokenized documents from the pickle written by the dump cell, so
# tokenization does not have to be repeated.
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# this notebook produced itself.
with open('clinical_dataset/word_embedding/tokenized_docs.pkl', 'rb') as file:
    tokenized_docs = pickle.load(file)

In [6]:
# Flatten every document into one token list (title, then summary, then
# detailed description); this is the corpus Word2Vec is trained on.
tokenized_docs_content = [
    [*doc['title'], *doc['summary'], *doc['detailed_description']]
    for doc in tokenized_docs
]

In [7]:
# Cache the flattened per-document token lists. The `with` block already closes
# the file, so the trailing close() call was redundant and has been dropped.
with open("clinical_dataset/word_embedding/tokenized_docs_content.pkl", 'wb') as file:
    pickle.dump(tokenized_docs_content, file)

In [3]:
# Reload the flattened per-document token lists from the pickle written by the
# dump cell.
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# this notebook produced itself.
with open('clinical_dataset/word_embedding/tokenized_docs_content.pkl', 'rb') as file:
    tokenized_docs_content = pickle.load(file)

In [39]:
# Train a skip-gram (sg=1) Word2Vec model on the flattened document tokens:
# 100-dimensional vectors, a 20-token context window, and words seen fewer than
# 2 times dropped from the vocabulary.
model = Word2Vec(tokenized_docs_content, min_count = 2, vector_size = 100, window = 20, sg = 1)

# Persist the trained model under a file name that encodes its hyper-parameters.
# The `with` block closes the file itself, so no explicit close() is needed.
# NOTE(review): the model-loading cell reads 'model.pkl', not this file name —
# confirm how the chosen model is promoted to 'model.pkl'.
with open("clinical_dataset/word_embedding/model_min_count=2,vector_size=100,window=20,sg=1.pkl", 'wb') as file:
    pickle.dump(model, file)

In [48]:
# Load the pickled Word2Vec model that the vectorization and query functions
# read through the global name `model`.
# NOTE(review): this reads 'model.pkl', whereas the training cell writes
# 'model_min_count=2,vector_size=100,window=20,sg=1.pkl' — presumably a chosen
# run was copied/renamed by hand; confirm which model this actually is.
with open('clinical_dataset/word_embedding/model.pkl', 'rb') as file:
    model = pickle.load(file)
    

In [49]:
def preprocess_documents_m2(tokenized_docs, w2v_model=None,
                            output_path="clinical_dataset/word_embedding/documents_vectors_model.npy"):
    """
    Build one embedding per document and save them all as a single ``.npy`` file.

    Each in-vocabulary token contributes its word vector scaled by a field
    weight (title x3, summary x2, detailed description x1). The scaled vectors
    are summed and divided by the number of contributing tokens.

    Row ``i`` of the saved array always corresponds to ``tokenized_docs[i]``.
    A document with no in-vocabulary token gets an all-zero row (and its
    ``doc_id`` is printed) instead of being skipped; skipping it would shift
    every following row and make index-based lookups return the wrong document.

    Args:
        tokenized_docs: Iterable of dicts with keys ``'doc_id'``, ``'title'``,
            ``'summary'`` and ``'detailed_description'`` (the last three being
            token lists).
        w2v_model: Object exposing ``.wv`` with ``get_vector``, ``in`` support
            and ``vector_size``. Defaults to the notebook-global ``model``.
        output_path: Destination of the saved array. Defaults to the path the
            query code loads from.

    Returns:
        None. The result is written to ``output_path``.
    """
    wv = (model if w2v_model is None else w2v_model).wv
    field_weights = (('title', 3), ('summary', 2), ('detailed_description', 1))

    doc_vectors = []
    for doc in tokenized_docs:
        weighted_vectors = [
            wv.get_vector(word) * weight
            for field, weight in field_weights
            for word in doc[field]
            if word in wv
        ]

        if weighted_vectors:
            # Same quantity as before (sum of scaled vectors / token count),
            # summed in NumPy rather than with Python's built-in sum().
            doc_vectors.append(np.sum(weighted_vectors, axis=0) / len(weighted_vectors))
        else:
            # Report the document, but keep its slot so rows stay aligned.
            print(doc['doc_id'])
            doc_vectors.append(np.zeros(wv.vector_size, dtype=np.float32))

    np.save(output_path, np.asarray(doc_vectors, dtype=np.float32))


In [50]:
# Build the document vectors and write them to documents_vectors_model.npy.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# recorded run did not finish — confirm the .npy file on disk is complete.
preprocess_documents_m2(tokenized_docs)

KeyboardInterrupt: 

In [None]:
# Pull the query ID and the four query facets out of the query table.
query_ids = clean_queries['query_id'].tolist()
query_diseases = clean_queries['disease'].tolist()
query_genes = clean_queries['gene'].tolist()
query_demographics = clean_queries['demographic'].tolist()
query_others = clean_queries['other'].tolist()


def _query_text(value):
    """Return ``value`` as a string, mapping missing values (NaN/None) to ''."""
    return '' if pd.isna(value) else str(value)


# One dict per query: the ID plus an NLTK token list for each facet. Every facet
# goes through _query_text, so a missing value in any column (not only 'other')
# is tokenized as empty text instead of reaching word_tokenize as a float NaN.
tokenized_queries = []
for query_id, query_disease, query_gene, query_demographic, query_other in zip(
        query_ids, query_diseases, query_genes, query_demographics, query_others):
    tokenized_queries.append({
        'query_id': query_id,
        'query_disease': nltk.word_tokenize(_query_text(query_disease)),
        'query_gene': nltk.word_tokenize(_query_text(query_gene)),
        'query_demographic': nltk.word_tokenize(_query_text(query_demographic)),
        'query_other': nltk.word_tokenize(_query_text(query_other)),
    })

# Cache the tokenized queries. The `with` block closes the file on exit, so the
# former trailing close() call was redundant.
with open("clinical_dataset/word_embedding/tokenized_queries.pkl", 'wb') as file:
    pickle.dump(tokenized_queries, file)


In [None]:
# Reload the tokenized queries from the pickle written by the query
# tokenization cell.
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# this notebook produced itself.
with open('clinical_dataset/word_embedding/tokenized_queries.pkl', 'rb') as file:
    tokenized_queries = pickle.load(file)


In [None]:
# Sanity check: show the first tokenized query record.
print(tokenized_queries[0])

In [51]:
def runQuery(tokenized_query):
    """
    Return the 10 documents whose stored vectors are most similar to the query.

    The query vector is the plain average of the Word2Vec vectors of every
    in-vocabulary token across the four query facets. Document vectors are
    loaded from ``documents_vectors_model.npy`` and ranked by cosine similarity.

    Relies on the notebook globals ``model`` (Word2Vec model) and
    ``tokenized_docs`` (row ``i`` of the vector file is looked up as
    ``tokenized_docs[i]``).

    Args:
        tokenized_query: Dict with token lists under ``'query_disease'``,
            ``'query_gene'``, ``'query_demographic'`` and ``'query_other'``.

    Returns:
        A list of at most 10 entries of ``tokenized_docs``, best match first.
        An empty list when no query token is in the model vocabulary (this
        used to raise ZeroDivisionError).
    """
    import warnings  # local import keeps this cell self-contained

    query_fields = ('query_disease', 'query_gene', 'query_demographic', 'query_other')
    word_vectors = [
        model.wv.get_vector(word)
        for field in query_fields
        for word in tokenized_query[field]
        if word in model.wv
    ]
    if not word_vectors:
        # Nothing to average: no query token is known to the model.
        return []

    query_vector = sum(word_vectors) / len(word_vectors)

    doc_vectors = np.load("clinical_dataset/word_embedding/documents_vectors_model.npy")
    if len(doc_vectors) != len(tokenized_docs):
        # Row indices are used to index tokenized_docs below; a length mismatch
        # means some rows no longer point at the document they were built from.
        warnings.warn(
            f"documents_vectors_model.npy has {len(doc_vectors)} rows but "
            f"tokenized_docs has {len(tokenized_docs)} entries; "
            "retrieved documents may be misaligned."
        )

    similarity_scores = cosine_similarity([query_vector], doc_vectors)[0]

    # Indices of the 10 highest scores, descending. The stable sort keeps ties
    # in original index order, matching sorted(..., reverse=True).
    top_indices = np.argsort(-similarity_scores, kind="stable")[:10]
    return [tokenized_docs[index] for index in top_indices]

In [52]:
def get_relevant_docs_from_qrels(query_id, qrels_links=None):
    """
    Look up the relevant document IDs recorded for ``query_id``.

    Args:
        query_id: Query identifier, used as-is as the dictionary key.
        qrels_links: Optional mapping of query ID -> list of relevant doc IDs.
            Defaults to the notebook-global ``links`` dictionary.

    Returns:
        The list stored for ``query_id`` (the same list object, not a copy),
        or an empty set — after printing a notice — when the query ID is not
        in the mapping.
    """
    lookup = links if qrels_links is None else qrels_links

    if query_id not in lookup:
        print(f"Query ID {query_id} not found in links.")
        return set()

    return lookup[query_id]


In [53]:
def calculate_mrr(relevant_docs, retrieved_docs):
    """
    Reciprocal rank of the first relevant hit for one query.

    :param relevant_docs: Collection of relevant document IDs (anything that
        supports ``in``).
    :param retrieved_docs: Retrieved document IDs, best-ranked first.
    :return: ``1 / rank`` of the first retrieved ID found in
        ``relevant_docs`` (ranks start at 1), or ``0`` when none is relevant.
    """
    hit_ranks = (
        position
        for position, candidate in enumerate(retrieved_docs, start=1)
        if candidate in relevant_docs
    )
    first_hit = next(hit_ranks, None)
    if first_hit is None:
        return 0
    return 1 / first_hit

In [54]:
def calculate_map(relevant_documents_id, retrieved_documents_id, k):
    """
    Average precision at cutoff ``k`` (AP@k) for a single query.

    Despite the name, this returns the per-query average precision; the mean
    over queries is taken by the caller.

    Args:
        relevant_documents_id: Collection of relevant document IDs.
        retrieved_documents_id: Retrieved document IDs, best-ranked first.
        k: Number of top results to evaluate.

    Returns:
        The sum of precision values at each rank where a relevant document
        appears within the top ``k``, divided by
        ``min(len(relevant_documents_id), k)``. Returns ``0.0`` when either
        input is empty or when ``k`` is not positive (``k == 0`` used to raise
        ZeroDivisionError, and a negative ``k`` produced a negative score).
    """
    if k <= 0 or len(relevant_documents_id) == 0 or len(retrieved_documents_id) == 0:
        return 0.0

    precision_sum = 0.0
    relevant_count = 0
    for rank, doc_id in enumerate(retrieved_documents_id[:k], 1):
        if doc_id in relevant_documents_id:
            relevant_count += 1
            # Precision at this rank = relevant documents seen so far / rank.
            precision_sum += relevant_count / rank

    return precision_sum / min(len(relevant_documents_id), k)

In [55]:
def myRunQueries(tokenized_queries):
    """
    Run every query, print per-query diagnostics, and report MAP and MRR.

    For each query the relevant IDs come from ``get_relevant_docs_from_qrels``
    and the ranking from ``runQuery``; average precision is computed at
    cutoff 10.

    Args:
        tokenized_queries: Sequence of tokenized query dicts, each with a
            ``'query_id'`` key plus the fields ``runQuery`` expects.

    Returns:
        The mean average precision over all queries. MRR is printed but not
        returned. For an empty sequence both metrics are reported as 0.0
        (this used to raise ZeroDivisionError).
    """
    query_count = len(tokenized_queries)
    if query_count == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        print("Mean Average Precision (MAP):", 0.0)
        print("Mean Reciprocal Rank (MRR):", 0.0)
        return 0.0

    ap_at_ak = 0
    total_reciprocal_rank = 0

    for query in tokenized_queries:
        print("Query ID:", query['query_id'])

        relevant = get_relevant_docs_from_qrels(query['query_id'])
        retrieved_ids = [doc['doc_id'] for doc in runQuery(query)]

        print("relevant: ", relevant)
        print("retrieved: ", retrieved_ids)

        map_score = calculate_map(relevant, retrieved_ids, 10)
        print("MAP:", map_score)
        print('-------------------------------------------------------------------------')

        ap_at_ak += map_score
        total_reciprocal_rank += calculate_mrr(relevant, retrieved_ids)

    map_result = ap_at_ak / query_count
    mrr = total_reciprocal_rank / query_count

    print("Mean Average Precision (MAP):", map_result)
    print("Mean Reciprocal Rank (MRR):", mrr)

    return map_result

In [47]:
# Evaluate all queries and keep the returned MAP.
finalResult = myRunQueries(tokenized_queries)
# Run label: model_min_count=2,vector_size=100,window=20,sg=1
# (presumably the Word2Vec configuration loaded for this run — confirm)

Query ID: 1
relevant:  ['NCT00445783', 'NCT01209598', 'NCT01237236', 'NCT01522989', 'NCT01692496', 'NCT02022982', 'NCT02065063', 'NCT02187783', 'NCT02414724', 'NCT02418234', 'NCT02571829', 'NCT02693535', 'NCT02846987', 'NCT02897375', 'NCT02919696', 'NCT03065062', 'NCT03096912']
retrieved:  ['NCT01987518', 'NCT00748709', 'NCT02973763', 'NCT01861197', 'NCT01704703', 'NCT02450136', 'NCT02241720', 'NCT01100827', 'NCT00708214', 'NCT02094261']
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 2
relevant:  ['NCT00006103', 'NCT00079274', 'NCT00134069', 'NCT00265850', 'NCT00405587', 'NCT00418938', 'NCT00444678', 'NCT00551421', 'NCT00598975', 'NCT00637091', 'NCT00640471', 'NCT00655499', 'NCT00660582', 'NCT00755534', 'NCT00778830', 'NCT00813605', 'NCT00819780', 'NCT00826540', 'NCT00827684', 'NCT00842257', 'NCT00853931', 'NCT00856375', 'NCT00879385', 'NCT00880321', 'NCT00888134', 'NCT00897429', 'NCT00940316', 'NCT00942266', 'NCT00954876', 'NCT00959127', '

In [56]:
# Evaluate all queries and keep the returned MAP.
finalResult = myRunQueries(tokenized_queries)
# Run label: "before improving"
# (presumably the baseline run — confirm which model/vector file this used)

Query ID: 1
relevant:  ['NCT00445783', 'NCT01209598', 'NCT01237236', 'NCT01522989', 'NCT01692496', 'NCT02022982', 'NCT02065063', 'NCT02187783', 'NCT02414724', 'NCT02418234', 'NCT02571829', 'NCT02693535', 'NCT02846987', 'NCT02897375', 'NCT02919696', 'NCT03065062', 'NCT03096912']
retrieved:  ['NCT01987518', 'NCT02973763', 'NCT01312337', 'NCT00748709', 'NCT01106781', 'NCT01514864', 'NCT01697072', 'NCT02438007', 'NCT01704703', 'NCT02053636']
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 2
relevant:  ['NCT00006103', 'NCT00079274', 'NCT00134069', 'NCT00265850', 'NCT00405587', 'NCT00418938', 'NCT00444678', 'NCT00551421', 'NCT00598975', 'NCT00637091', 'NCT00640471', 'NCT00655499', 'NCT00660582', 'NCT00755534', 'NCT00778830', 'NCT00813605', 'NCT00819780', 'NCT00826540', 'NCT00827684', 'NCT00842257', 'NCT00853931', 'NCT00856375', 'NCT00879385', 'NCT00880321', 'NCT00888134', 'NCT00897429', 'NCT00940316', 'NCT00942266', 'NCT00954876', 'NCT00959127', '