In [1]:
import pandas as pd
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
import string
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from scipy import sparse
import csv
import pickle

In [2]:
# Document tables: missing cells are replaced with empty strings right after loading.
clean_docs = pd.read_csv('clinical_dataset/processed_docs.csv').fillna('')
original_docs = pd.read_csv('clinical_dataset/original_docs.csv').fillna('')

# NOTE(review): despite its name, `clean_queries` is read from original_queries.csv — confirm this is intended.
clean_queries = pd.read_csv("clinical_dataset/original_queries.csv")

# Relevance judgments; later cells read its query_id, doc_id and relevance columns.
orginal_qrles = pd.read_csv("clinical_dataset/qrels.csv")

In [3]:
def add_doc_to_query_link(query_id, doc_id, links):
    """
    Record `doc_id` under `query_id` in the `links` mapping.

    Args:
        query_id: Key identifying the query.
        doc_id: Document ID to append to that query's list.
        links: Dict mapping query IDs to lists of doc IDs. It is mutated in
            place; when None is passed, a new dict is created instead.

    Returns:
        The dict that now contains the new (query_id, doc_id) association.
    """
    if links is None:
        links = {}

    # setdefault creates the per-query list on first sight of a query ID.
    links.setdefault(query_id, []).append(doc_id)
    return links

In [4]:

# Build {query_id: [doc_id, ...]} from the qrels, keeping only judgments whose
# relevance is strictly positive.
links = {}
# itertuples avoids building a Series per row (as iterrows does) and keeps each
# column's own dtype.
for qrel in orginal_qrles.itertuples(index=False):
    if qrel.relevance > 0:
        add_doc_to_query_link(qrel.query_id, qrel.doc_id, links)


In [5]:

# Spot-check: the doc IDs recorded as relevant for query ID 2.
print(links[2])

['NCT00006103', 'NCT00079274', 'NCT00134069', 'NCT00265850', 'NCT00405587', 'NCT00418938', 'NCT00444678', 'NCT00551421', 'NCT00598975', 'NCT00637091', 'NCT00640471', 'NCT00655499', 'NCT00660582', 'NCT00755534', 'NCT00778830', 'NCT00813605', 'NCT00819780', 'NCT00826540', 'NCT00827684', 'NCT00842257', 'NCT00853931', 'NCT00856375', 'NCT00879385', 'NCT00880321', 'NCT00888134', 'NCT00897429', 'NCT00940316', 'NCT00942266', 'NCT00954876', 'NCT00959127', 'NCT00974389', 'NCT00975897', 'NCT01068132', 'NCT01082757', 'NCT01085331', 'NCT01086267', 'NCT01097018', 'NCT01108107', 'NCT01109615', 'NCT01110785', 'NCT01116271', 'NCT01134666', 'NCT01143753', 'NCT01149434', 'NCT01151007', 'NCT01152437', 'NCT01167725', 'NCT01190462', 'NCT01198535', 'NCT01198743', 'NCT01215539', 'NCT01231568', 'NCT01231594', 'NCT01243372', 'NCT01254617', 'NCT01260415', 'NCT01262963', 'NCT01280643', 'NCT01281761', 'NCT01287130', 'NCT01340833', 'NCT01340846', 'NCT01352273', 'NCT01358812', 'NCT01375816', 'NCT01387880', 'NCT01390

In [6]:
def dataProcessing(dfText):
    """
    Normalize a pandas Series of raw text into cleaned, lemmatized strings.

    Steps, in order: lower-case, replace punctuation with spaces, drop English
    stop words, lemmatize each remaining word (as a verb when its POS tag
    starts with 'v', otherwise as a noun), then strip every character that is
    not an ASCII letter, digit, or whitespace.

    Args:
        dfText: pandas Series of text (the ``.str`` accessor is used on it).

    Returns:
        list of str: one cleaned string per input row, in input order.
    """
    lowered = dfText.str.lower()

    # Map each punctuation character to a space so neighbouring words stay separated.
    trans_table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    no_punct = [str(text).translate(trans_table) for text in lowered]

    # Remove stop words; each document becomes a list of whitespace-split tokens.
    stop_words = set(stopwords.words('english'))
    token_lists = [[tok for tok in text.split() if tok not in stop_words]
                   for text in no_punct]

    # Lemmatize token by token. Each word is tagged on its own (no sentence
    # context), which is the behavior this function has always had.
    lemmatizer = WordNetLemmatizer()
    lemmatized_strings = []
    for tokens in token_lists:
        lemmas = []
        for word in tokens:
            tag = pos_tag([word])[0][1]
            my_pos = wordnet.VERB if tag[0].lower() == 'v' else wordnet.NOUN
            lemmas.append(lemmatizer.lemmatize(word, pos=my_pos))
        lemmatized_strings.append(' '.join(lemmas))

    # Raw string so that \s is a regex class rather than an invalid string escape;
    # compiled once instead of once per document.
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9\s]')
    return [non_alphanumeric.sub('', text) for text in lemmatized_strings]

In [7]:
# TF-IDF vectorizer using scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

In [8]:
# Fit the vectorizer on one concatenated text per document (title, condition,
# summary, detailed_description, summary again, eligibility) and keep the
# resulting document-term matrix.
# NOTE(review): `summary` is concatenated twice, which doubles its term counts —
# confirm this weighting is intentional and not a copy/paste slip.
vectorized_docs = vectorizer.fit_transform(clean_docs['title']+'  ' + clean_docs['condition']+'  '  + '  '+clean_docs['summary']+ '  '+ clean_docs['detailed_description']+'  '+clean_docs['summary']+'  ' + clean_docs['eligibility'])

In [9]:
# Store the vectorizer object
# Persist the fitted vectorizer to disk with pickle.
# NOTE(review): open() does not create directories, so 'clinical_dataset/objects/' must already exist.
with open('clinical_dataset/objects/vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

# Persist the TF-IDF document matrix alongside the vectorizer.
with open('clinical_dataset/objects/vectorized_docs.pkl', 'wb') as file:
    pickle.dump(vectorized_docs, file)

In [10]:
# Load the vectorizer object
# Reload the fitted vectorizer from disk, rebinding the `vectorizer` name.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open('clinical_dataset/objects/vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

# Reload the TF-IDF document matrix, rebinding the `vectorized_docs` name.
with open('clinical_dataset/objects/vectorized_docs.pkl', 'rb') as file:
    vectorized_docs = pickle.load(file)

In [11]:
def runQueryWithAllDocsVecor(query, top_k=10):
    """
    Rank every indexed document against a query and return the best doc IDs.

    Args:
        query: Sequence whose first element is the query text; only
            ``query[0]`` is used.
        top_k: Maximum number of document IDs to return (default 10, the
            cutoff that was previously hard-coded).

    Returns:
        list: Up to ``top_k`` values from ``original_docs['doc_id']``, ordered
        from highest to lowest cosine similarity. Ties keep document order.
    """
    vectorized_query = vectorizer.transform([query[0]])

    # One row of scores: similarity of the query to each row of vectorized_docs.
    similarity_scores = cosine_similarity(vectorized_query, vectorized_docs)

    # (row position, score) pairs, best score first; sorted() is stable, so
    # equal scores stay in their original row order.
    ranked = sorted(enumerate(similarity_scores[0]),
                    key=lambda pair: pair[1], reverse=True)

    # Row positions in the matrix are mapped to IDs by position in original_docs.
    # NOTE(review): assumes original_docs rows are in the same order as the rows
    # used to build vectorized_docs — confirm.
    doc_ids = original_docs['doc_id']
    return [doc_ids.iloc[row] for row, _score in ranked[:top_k]]

In [12]:
def get_relevent_docs_from_qrels(query_id):
    """
    Look up the relevant document IDs recorded for a query in `links`.

    Args:
        query_id: Key to look up in the module-level `links` dict.

    Returns:
        list: The doc IDs stored under `query_id`, or an empty list (after
        printing a notice) when the query has no entry. Both paths return a
        list so callers get one consistent type.
    """
    if query_id not in links:
        print(f"Query ID {query_id} not found in links.")
        return []

    return links[query_id]


In [14]:
# Spot-check: the doc IDs recorded as relevant for query ID 1.
links[1]

['NCT00445783',
 'NCT01209598',
 'NCT01237236',
 'NCT01522989',
 'NCT01692496',
 'NCT02022982',
 'NCT02065063',
 'NCT02187783',
 'NCT02414724',
 'NCT02418234',
 'NCT02571829',
 'NCT02693535',
 'NCT02846987',
 'NCT02897375',
 'NCT02919696',
 'NCT03065062',
 'NCT03096912']

In [13]:
def myRunQueries(queries):
    """
    Run every query through retrieval and report MAP and MRR.

    For each row, row[0] is used as the query ID and row[1] and row[2] are
    joined with a space to form the query text. Per-query details and the
    final means are printed.

    Args:
        queries: Sized collection of rows (``len()`` is called on it), e.g.
            the ``.values`` of the queries DataFrame.

    Returns:
        float: Mean over all queries of the per-query average precision at 10.
        Returns 0.0 for an empty collection instead of dividing by zero.
    """
    num_queries = len(queries)
    total_average_precision = 0.0
    total_reciprocal_rank = 0.0

    for query in queries:
        print("query id: ", query[0])

        relevant = get_relevent_docs_from_qrels((query[0]))
        retrieved = runQueryWithAllDocsVecor([query[1]+" "+query[2]] )

        print("relevant: ", relevant)
        print("retrieved: ", retrieved)

        # Labelled "MAP" in the output, but this is the average precision of
        # this single query; the mean is taken after the loop.
        map_score = calculate_map(relevant, retrieved, 10)
        print("MAP:", map_score)
        print('-------------------------------------------------------------------------')

        total_average_precision += map_score
        total_reciprocal_rank += calculate_mrr(relevant, retrieved)

    # Guard the averages so an empty query set yields 0.0 rather than ZeroDivisionError.
    map_result = total_average_precision / num_queries if num_queries else 0.0
    mrr = total_reciprocal_rank / num_queries if num_queries else 0.0

    print("Mean Average Precision (MAP):", map_result)
    print("Mean Reciprocal Rank (MRR):", mrr)

    return map_result

In [14]:
def calculate_map(relevant_documents_id, retrieved_documents_id, k):
    """
    Compute average precision at k for a single query.

    Despite the name, this returns one query's average precision; the mean
    over queries is taken by the caller.

    Args:
        relevant_documents_id: Collection of relevant doc IDs (hashable values).
        retrieved_documents_id: Ranked list of retrieved doc IDs, best first.
        k: Rank cutoff; only the first k retrieved documents are scored.

    Returns:
        float: Sum of precision values at each rank where a relevant document
        appears, divided by min(number of relevant docs, k). Returns 0.0 when
        either input is empty or when k is not positive.
    """
    # k <= 0 previously caused a ZeroDivisionError (k == 0) or a negative
    # score (k < 0); treat it as "nothing scored".
    if k <= 0 or len(relevant_documents_id) == 0 or len(retrieved_documents_id) == 0:
        return 0.0

    # Set membership keeps each lookup constant-time for long relevance lists.
    relevant_lookup = set(relevant_documents_id)

    precision_sum = 0.0
    hits = 0
    for rank, doc_id in enumerate(retrieved_documents_id[:k], 1):
        if doc_id in relevant_lookup:
            hits += 1
            precision_sum += hits / rank

    return precision_sum / min(len(relevant_documents_id), k)

In [15]:
def calculate_mrr(relevant_docs, retrieved_docs):
    """
    Reciprocal rank of the first relevant hit for one query.

    :param relevant_docs: Collection of relevant document IDs.
    :param retrieved_docs: Ranked list of retrieved document IDs, best first.
    :return: 1 / rank (1-based) of the first retrieved document that is
        relevant, or 0 when none of the retrieved documents is relevant.
    """
    reciprocal_ranks = (
        1 / position
        for position, candidate in enumerate(retrieved_docs, start=1)
        if candidate in relevant_docs
    )
    # The generator is lazy, so only the first match is ever evaluated.
    return next(reciprocal_ranks, 0)

In [16]:
# Evaluate every query row; myRunQueries prints per-query details and the
# MAP/MRR summary, and its returned MAP value is printed once more here.
print(myRunQueries(clean_queries.values))


query id:  1
relevant:  ['NCT00445783', 'NCT01209598', 'NCT01237236', 'NCT01522989', 'NCT01692496', 'NCT02022982', 'NCT02065063', 'NCT02187783', 'NCT02414724', 'NCT02418234', 'NCT02571829', 'NCT02693535', 'NCT02846987', 'NCT02897375', 'NCT02919696', 'NCT03065062', 'NCT03096912']
retrieved:  ['NCT03034252', 'NCT01209598', 'NCT02571829', 'NCT01692496', 'NCT03096912', 'NCT00004180', 'NCT01237236', 'NCT02187783', 'NCT03024489', 'NCT02343172']
MAP: 0.41809523809523813
-------------------------------------------------------------------------
query id:  2
relevant:  ['NCT00006103', 'NCT00079274', 'NCT00134069', 'NCT00265850', 'NCT00405587', 'NCT00418938', 'NCT00444678', 'NCT00551421', 'NCT00598975', 'NCT00637091', 'NCT00640471', 'NCT00655499', 'NCT00660582', 'NCT00755534', 'NCT00778830', 'NCT00813605', 'NCT00819780', 'NCT00826540', 'NCT00827684', 'NCT00842257', 'NCT00853931', 'NCT00856375', 'NCT00879385', 'NCT00880321', 'NCT00888134', 'NCT00897429', 'NCT00940316', 'NCT00942266', 'NCT00954876'

In [96]:
# Number of relevant doc IDs recorded for query ID 5.
print(len(links[5]))

36


In [None]:
# Preview the concatenated title, summary and detailed-description text per document.
# NOTE(review): the previous version had an empty `clean_docs[]` subscript and an
# unbalanced parenthesis, so it could not run; if a row filter was intended, add it here.
print(clean_docs['title']+' '+clean_docs['summary']+' '+clean_docs['detailed_description'])