In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from scipy import sparse
import csv
import pickle
import numpy as np
from gensim.models import Word2Vec
import nltk
from sklearn.preprocessing import normalize


In [2]:
# Document tables: processed and raw text. Missing cells become empty strings
# so downstream string handling never sees NaN.
clean_docs = pd.read_csv('clinical_dataset/processed_docs.csv').fillna('')
original_docs = pd.read_csv('clinical_dataset/original_docs.csv').fillna('')

# Queries and relevance judgements (qrels) are kept exactly as read.
clean_queries = pd.read_csv("clinical_dataset/original_queries.csv")
orginal_qrles = pd.read_csv("clinical_dataset/qrels.csv")

In [3]:
def add_doc_to_query_link(query_id, doc_id, links):
    """
    Record ``doc_id`` under ``query_id`` in the ``links`` mapping.

    Args:
        query_id: Key identifying the query.
        doc_id: Document ID to append to that query's list.
        links: Mapping of query ID -> list of document IDs. ``None`` is
            treated as an empty mapping.

    Returns:
        The mapping that was updated. This is the same object that was passed
        in, or a freshly created dict when ``links`` was ``None``.
    """
    if links is None:
        links = {}

    # setdefault creates the list on first sight of a query ID and returns the
    # existing list otherwise, so a single append covers both cases.
    links.setdefault(query_id, []).append(doc_id)
    return links

In [5]:
# Build the query -> relevant-documents map from the qrels table.
links = {}
for _, qrel in orginal_qrles.iterrows():
    # Only judgements with a strictly positive relevance grade are kept.
    if qrel.relevance > 0:
        add_doc_to_query_link(qrel.query_id, qrel.doc_id, links)


In [6]:
# Load the pre-tokenized documents from disk. Later cells iterate over this
# object and read doc['doc_id'] from each entry.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by this project.
with open('clinical_dataset/word_embedding/tokenized_docs.pkl', 'rb') as file:
    tokenized_docs = pickle.load(file)

In [7]:
# Load the pre-tokenized queries from disk. The evaluation cells iterate over
# this object and read 'query_id', 'query_disease', 'query_gene',
# 'query_demographic' and 'query_other' from each entry.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by this project.
with open('clinical_dataset/word_embedding/tokenized_queries.pkl', 'rb') as file:
    tokenized_queries = pickle.load(file)


In [14]:
# Load the trained embedding model. runQuery uses `model.wv` for vocabulary
# membership tests and `model.wv.get_vector(word)` lookups.
# Presumably a gensim Word2Vec instance (Word2Vec is imported above) - TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by this project.
with open('clinical_dataset/word_embedding/model.pkl', 'rb') as file:
    model = pickle.load(file)

In [9]:
# Precomputed document vectors. The upsert cell pairs rows of this array
# positionally with `ids` (derived from tokenized_docs), so both must be in
# the same document order.
doc_vectors = np.load('clinical_dataset/word_embedding/documents_vectors_model.npy')

In [16]:
import os

from pinecone import Pinecone, ServerlessSpec

# The API key is read from the environment so that it never lands in the
# notebook source or its version history. The key that used to be hardcoded
# in this cell must be treated as leaked and rotated in the Pinecone console.
_pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not _pinecone_api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )

pc = Pinecone(api_key=_pinecone_api_key)

# Handle to the existing index that stores one vector per clinical document.
index = pc.Index("clinical-word-embedding-3-fields")

In [10]:
# Vector IDs for Pinecone: the doc_id of every tokenized document, as a string.
ids = [str(entry['doc_id']) for entry in tokenized_docs]
print(ids[0])

# Same vectors as doc_vectors, converted row by row via .tolist().
embeddings = [row.tolist() for row in doc_vectors]
print(embeddings[0])
print(doc_vectors[0])

NCT00530868
[-0.04870382696390152, 0.1619778275489807, -0.3358715772628784, -0.03301163762807846, -0.03285294398665428, -0.024068225175142288, 0.45147719979286194, 0.9515848159790039, -0.6469638347625732, -0.04030335322022438, 0.12247101217508316, -0.4219755232334137, 0.5009117126464844, 0.25982990860939026, 0.38483184576034546, 0.12761475145816803, 0.21880419552326202, 0.006399335339665413, -0.09838926792144775, -0.9525341391563416, -0.0422588586807251, -0.08502931147813797, 0.23743434250354767, -0.3247246742248535, -0.290843665599823, -0.4273015856742859, -0.14602793753147125, -0.29583939909935, -0.5378186106681824, -0.09345812350511551, 0.7275196313858032, -0.3042389154434204, 0.32696712017059326, -0.6900436878204346, -0.48004940152168274, 0.36626359820365906, 0.3994625210762024, 0.06170641630887985, -0.21783606708049774, -0.11129125952720642, 0.22401674091815948, -0.36760300397872925, -0.21755677461624146, 0.0921248346567154, -0.19327744841575623, -0.15102948248386383, -0.293096899

In [12]:
batch_size = 500

# Ceiling division: a trailing partial batch still counts as one batch.
num_batches = (len(ids) + batch_size - 1) // batch_size

# Walk the ID list in strides of batch_size; `i` is the zero-based batch number.
for i, start_index in enumerate(range(0, len(ids), batch_size)):
    end_index = min(start_index + batch_size, len(ids))

    batch_ids = ids[start_index:end_index]
    batch_vectors = doc_vectors[start_index:end_index]

    # One record per document: its string ID plus its vector as a plain list.
    vectors_to_upsert = [
        dict(id=id_, values=vec.tolist())
        for id_, vec in zip(batch_ids, batch_vectors)
    ]

    index.upsert(vectors=vectors_to_upsert)

    print(f"Batch {i + 1}/{num_batches} upserted successfully.")

Batch 1/483 upserted successfully.
Batch 2/483 upserted successfully.
Batch 3/483 upserted successfully.
Batch 4/483 upserted successfully.
Batch 5/483 upserted successfully.
Batch 6/483 upserted successfully.
Batch 7/483 upserted successfully.
Batch 8/483 upserted successfully.
Batch 9/483 upserted successfully.
Batch 10/483 upserted successfully.
Batch 11/483 upserted successfully.
Batch 12/483 upserted successfully.
Batch 13/483 upserted successfully.
Batch 14/483 upserted successfully.
Batch 15/483 upserted successfully.
Batch 16/483 upserted successfully.
Batch 17/483 upserted successfully.
Batch 18/483 upserted successfully.
Batch 19/483 upserted successfully.
Batch 20/483 upserted successfully.
Batch 21/483 upserted successfully.
Batch 22/483 upserted successfully.
Batch 23/483 upserted successfully.
Batch 24/483 upserted successfully.
Batch 25/483 upserted successfully.
Batch 26/483 upserted successfully.
Batch 27/483 upserted successfully.
Batch 28/483 upserted successfully.
B

In [8]:
def runQuery(tokenized_query, top_k=10):
    """
    Embed a tokenized query and return the IDs of its nearest documents.

    The query embedding is the mean of the word vectors of every in-vocabulary
    token taken from the four query fields (disease, gene, demographic, other,
    in that order), passed through ``normalize`` and sent to the Pinecone
    ``index``.

    :param tokenized_query: Mapping with the keys 'query_disease',
        'query_gene', 'query_demographic' and 'query_other', each holding an
        iterable of tokens.
    :param top_k: Number of matches to request from the index (default 10).
    :return: List of matched document IDs in the order returned by the index.
        An empty list is returned when none of the query tokens is in the
        model vocabulary, since no embedding can be built in that case.
    :raises KeyError: If one of the four query fields is missing.
    """
    query_fields = ('query_disease', 'query_gene', 'query_demographic', 'query_other')

    # Collect vectors for every token the model knows, field by field.
    word_vectors = [
        model.wv.get_vector(word)
        for field in query_fields
        for word in tokenized_query[field]
        if word in model.wv
    ]

    # Without this guard the mean below divides by zero for queries made up
    # entirely of out-of-vocabulary tokens.
    if not word_vectors:
        return []

    # Mean vector of the query tokens.
    query_vector = sum(word_vectors) / len(word_vectors)
    print(len(query_vector))

    # normalize expects a 2-D input, hence the wrap in a list and the [0] unwrap.
    query_vector = normalize([query_vector])[0]

    # The Pinecone client needs plain Python floats, not numpy scalars.
    query_vector = [float(x) for x in query_vector]

    results = index.query(vector=query_vector, top_k=top_k, include_values=True)

    # Keep only the IDs (strings) of the matches.
    return [doc['id'] for doc in results['matches']]

In [9]:
def get_relevant_docs_from_qrels(query_id):
    """
    Look up the relevant document IDs recorded for a query in ``links``.

    :param query_id: Query ID, used as-is as the key into the module-level
        ``links`` mapping (no type conversion is applied).
    :return: The list stored in ``links`` for this query (the same object, not
        a copy). When the query ID is unknown, a message is printed and an
        empty set is returned.
    """
    if query_id not in links:
        print(f"Query ID {query_id} not found in links.")
        return set()  # No judgements for this query.

    return links[query_id]


In [10]:
def calculate_mrr(relevant_docs, retrieved_docs):
    """
    Reciprocal rank of the first relevant hit for one query.

    :param relevant_docs: Container of relevant document IDs (anything that
        supports ``in``).
    :param retrieved_docs: Retrieved document IDs in ranked order, best first.
    :return: ``1 / rank`` (1-based) of the first retrieved ID that is relevant,
        or ``0`` when no retrieved ID is relevant.
    """
    # Lazily yields the 1-based positions of relevant hits, in rank order.
    hit_positions = (
        position
        for position, candidate in enumerate(retrieved_docs, start=1)
        if candidate in relevant_docs
    )

    first_hit = next(hit_positions, None)
    if first_hit is None:
        return 0
    return 1 / first_hit

In [11]:
def calculate_map(relevant_documents_id, retrieved_documents_id, k):
    """
    Average precision at cutoff ``k`` for a single query.

    Despite the name, this returns the per-query average precision; the mean
    over queries is taken by the caller.

    :param relevant_documents_id: Sized container of relevant document IDs.
    :param retrieved_documents_id: Ranked sequence of retrieved document IDs,
        best first.
    :param k: Rank cutoff; only the first ``k`` retrieved IDs are scored.
    :return: Sum of precision values at each relevant hit within the top ``k``,
        divided by ``min(len(relevant_documents_id), k)``. Returns 0.0 when
        either input is empty or when ``k`` is not positive.
    """
    # A non-positive cutoff would make the normaliser below zero or negative.
    if k <= 0:
        return 0.0

    if len(relevant_documents_id) == 0 or len(retrieved_documents_id) == 0:
        return 0.0

    precision_sum = 0.0
    relevant_count = 0

    for rank, doc_id in enumerate(retrieved_documents_id[:k], 1):
        if doc_id in relevant_documents_id:
            relevant_count += 1
            # Precision at this rank: relevant hits so far / results seen so far.
            precision_sum += relevant_count / rank

    # Normalise by the best achievable number of hits within the cutoff.
    return precision_sum / min(len(relevant_documents_id), k)

In [12]:
def myRunQueries(tokenized_queries):
    """
    Run every query against the index and report MAP and MRR.

    For each query the relevant IDs come from ``get_relevant_docs_from_qrels``
    and the retrieved IDs from ``runQuery``. Average precision is computed at
    cutoff 10 via ``calculate_map`` and the reciprocal rank via
    ``calculate_mrr``. Per-query details and the final means are printed.

    :param tokenized_queries: Sized iterable of query mappings; each must
        provide 'query_id' plus whatever ``runQuery`` reads.
    :return: Mean of the per-query average-precision scores. The MRR is
        printed but not returned. Returns 0.0 for an empty query collection.
    """
    total_average_precision = 0
    total_reciprocal_rank = 0

    for query in tokenized_queries:
        print("Query ID:", query['query_id'])

        relevant = get_relevant_docs_from_qrels(query['query_id'])
        retrieved_ids = list(runQuery(query))

        print("relevant: ", relevant)
        print("retrieved: ", retrieved_ids)

        map_score = calculate_map(relevant, retrieved_ids, 10)
        print("MAP:", map_score)
        print('-------------------------------------------------------------------------')

        total_average_precision += map_score
        total_reciprocal_rank += calculate_mrr(relevant, retrieved_ids)

    # With no queries there is nothing to average; report 0.0 instead of
    # dividing by zero.
    num_queries = len(tokenized_queries)
    map_result = total_average_precision / num_queries if num_queries else 0.0
    mrr = total_reciprocal_rank / num_queries if num_queries else 0.0

    print("Mean Average Precision (MAP):", map_result)
    print("Mean Reciprocal Rank (MRR):", mrr)

    return map_result

In [17]:
# Evaluate every tokenized query against the Pinecone index. The returned
# value is the mean of the per-query average-precision scores at cutoff 10;
# the MRR is only printed by myRunQueries.
finalResult = myRunQueries(tokenized_queries)


Query ID: 1
100
relevant:  ['NCT00445783', 'NCT01209598', 'NCT01237236', 'NCT01522989', 'NCT01692496', 'NCT02022982', 'NCT02065063', 'NCT02187783', 'NCT02414724', 'NCT02418234', 'NCT02571829', 'NCT02693535', 'NCT02846987', 'NCT02897375', 'NCT02919696', 'NCT03065062', 'NCT03096912']
retrieved:  ['NCT01987518', 'NCT02973763', 'NCT00748709', 'NCT01312337', 'NCT01106781', 'NCT01514864', 'NCT01697072', 'NCT01704703', 'NCT02438007', 'NCT02404675']
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 2
100
relevant:  ['NCT00006103', 'NCT00079274', 'NCT00134069', 'NCT00265850', 'NCT00405587', 'NCT00418938', 'NCT00444678', 'NCT00551421', 'NCT00598975', 'NCT00637091', 'NCT00640471', 'NCT00655499', 'NCT00660582', 'NCT00755534', 'NCT00778830', 'NCT00813605', 'NCT00819780', 'NCT00826540', 'NCT00827684', 'NCT00842257', 'NCT00853931', 'NCT00856375', 'NCT00879385', 'NCT00880321', 'NCT00888134', 'NCT00897429', 'NCT00940316', 'NCT00942266', 'NCT00954876', 'NCT0095