## TREC 2021 Clinical Trials Track Task

The solution to the this task has the following main steps:
- create an index mapping for the Clinical Trials Track documents


- index the documents from the Clinical Trials Track into this index (use only the documents with qrel)


- index the 75 queries into a different index, but with the same mapping structure


- for each of the 75 queries, extract the top 10k documents for that query and copy all of the documents into a new index. This index is connected to the query and has the name format: "10k_query_" + str(i), where i is the query number/position


- after each batch of 10k documents has been copied to it's respective query index, compute the df value, for each token in the queries. This is accomplished by using get_df_ttf(). These values will be later used for the computation of the BM25 variants.


- after obtaining the df values, the next values to be determined are the tf values. Tf is the term frequency of term t, in document d. This means that in order to get the tf values from the ElasticSearch API, we need to query each document individually. To do so, iteratively:
       1) we take each document from the 10k collection
       2) we create a 'disposable' index into which we index the document
       3) we query the 'disposable' index and get the tf values
       4) we delete the 'disposable' index
  
  
- because the above step takes a lot of time (approx. 3 hours), the process is repeated for one query. However, the entire pipeline can be repeated for each query in the same way.

In [1]:
from datetime import datetime
from elasticsearch import Elasticsearch
from tqdm import tqdm

es = Elasticsearch(HOST="http://localhost", PORT=9200, timeout=30)

In [2]:
# Define the index mapping
index_body = {
    "settings":{
        "index":{
        "number_of_shards":1,
        "number_of_replicas":1
        }
    },
    "mappings":{
        "properties":{
            "Content":{
                "type":"text",
                "fielddata":True,
                "term_vector": "with_positions_offsets_payloads",
                "store" : True,
                "analyzer" : "whitespace"
            }
        }
    }
}

In [3]:
# es.indices.create(index = "clinical_trials_track", body = index_body)

In [4]:
es.indices.exists(index = "clinical_trials_track")

True

### Import the Clinical Trials dataset

In [5]:
import ir_datasets

# load the dataset
dataset = ir_datasets.load("clinicaltrials/2021/trec-ct-2021")

# get the doc_id of the documents that have qrel
docs_with_qrel = []
for doc in dataset.qrels_iter():
    docs_with_qrel.append(doc.doc_id)

In [6]:
# check the number of documents that have qrel
dataset.qrels_count()

35832

### Index the documents

In [6]:
# create a docstore to easily retrieve the date based on "doc_id"
docstore = dataset.docs_store()

# iterate over the docs having qrel
for docID in tqdm(docs_with_qrel):
    # retrieve the document
    doc = docstore.get(docID)
    
    # format the document
    doc_formatted = {
        'Content': doc.detailed_description,
    }
    
    # index it using elastic search
    es.index(index = "clinical_trials_track", id=docID, body=doc_formatted)

100%|██████████| 35832/35832 [46:22<00:00, 12.88it/s]  


### Index the queries

In [7]:
# get the querries
import csv

queries = []

with open("./queries_2021.tsv", encoding="utf8") as file:
    tsv_file = csv.reader(file, delimiter="\n")
     
    for q in tsv_file:
        queries.append(q)
        
queries = [q[0] for q in queries]

In [8]:
# process the querries 
formatted_queries = []

for q in queries:
    q = q.replace("\t", " ")
    formatted_queries.append(q)

In [9]:
# Create new index for queries
es.indices.create(index = "query_index", body = index_body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'query_index'}

In [10]:
for i, query_text in enumerate(formatted_queries):
    # format the document
    query_dict = {
        'Content': query_text,
    }
    
    query_id = 'query' + str(i) 
    
    # index the query using elastic search
    es.index(index = "query_index", id=query_id, body=query_dict)

### Given a query, get the top 10k documents and move them to a separate index 

In [49]:
# create a clone of the original index
es.indices.create(index = "clinical_trials_track_clone", body = index_body)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'clinical_trials_track_clone'}

In [54]:
es.indices.exists(index = "clinical_trials_track_clone")

True

In [55]:
reindex_body = {
    "source": {
        "index": "clinical_trials_track",
    },
    "dest": {
        "index": "clinical_trials_track_clone"
    }
}

es.reindex(body = reindex_body)

{'took': 11269,
 'timed_out': False,
 'total': 26162,
 'updated': 26162,
 'created': 0,
 'deleted': 0,
 'batches': 27,
 'version_conflicts': 0,
 'noops': 0,
 'retries': {'bulk': 0, 'search': 0},
 'throttled_millis': 0,
 'requests_per_second': -1.0,
 'throttled_until_millis': 0,
 'failures': []}

In [13]:
# check to see if all elementes have been reindexed
k1 = es.count(index = "clinical_trials_track")['count']
k2 = es.count(index = "clinical_trials_track_clone")['count']

print(f"Elements in clinical_trials_track index: {k1}")
print(f"Elements in clinical_trials_track_clone index: {k2}")

Elements in clinical_trials_track index: 26162
Elements in clinical_trials_track_clone index: 26162


In [14]:
for i in range(len(formatted_queries)):
    new_index = '10k_query_' + str(i)
    es.indices.delete(index = new_index)

In [18]:
for i, query_text in tqdm(enumerate(formatted_queries)):
    new_index = '10k_query_' + str(i)
    
    # create the index
    es.indices.create(index = new_index, body = index_body)
    
    # extract the top 10k documents and reindex them into the new index
    reindex_body = {
        "max_docs": 10000,
        "source": {
            "index": "clinical_trials_track_clone",
            "query": {
                "bool": {
                    "should": [
                        {"match": 
                             {"Content": query_text}
                        }
                    ] 
                ,"minimum_should_match": 1,
                "boost": 1.0
                }
            }

        },
        "dest": {
            "index": new_index
        }
    }
    
    es.reindex(body = reindex_body)

75it [09:25,  7.55s/it]


The abovee cell has created, for each of the 75 querries, an index containing the top 10k documents retrieved from the main corpus for that specific query.

The name of the indexes is formatted in the following way: '10k_query_' + str(i), where i is the position of the query in the list formatted_queries.

## Compute BM25 Variants

DEFINITIONS:

- N is the number of documents in the collection
- df_t is the number of documents containing term t
- tf_td is the term frequency of term t in document d
- L_d is the number of tokens in document d
- L_avg is the average number of tokens in a document in the collection
- k1, b are free parameters that can be optimized per collection

In [24]:
# get the original data of the documents returned by the es.search() method
def get_original_data(query_position):
    bool_query = {
        "size": 10000,
        "query": {
            "bool": {
                "should": [
                    {"match": 
                         {"Content": formatted_queries[query_position]}
                    }
                ] 
            ,"minimum_should_match": 1,
            "boost": 1.0
            }
        }
    }
    
    index_name = '10k_query_' + str(query_position)
    
    search_returns = es.search(index = '10k_query_1',  body = bool_query)
    
    doc_ids_list = []
    doc_original_scores = []
    doc_contents = []
    
    for j in range(len(search_returns['hits']['hits'])):
        # save the id, original score and content of the to 10k documents
        current_id = search_returns['hits']['hits'][j]['_id']
        doc_ids_list.append(current_id)
        
        current_score = search_returns['hits']['hits'][j]['_score']
        doc_original_scores.append(current_score)
        
        current_doc_content = search_returns['hits']['hits'][j]['_source']
        doc_contents.append(current_doc_content)
        
    return [doc_ids_list, doc_original_scores, doc_contents]

In [72]:
# Compute the number of tokens per document (i.e. per query)
def compute_L_d(index_name, document_ID):
    res_dict = es.termvectors(index = index_name, id = document_ID)
    
    if 'term_vectors' in res_dict:
        L_d = len(res_dict['term_vectors']['Content']['terms'].keys())
    
    else:
        L_d = 0
    
    return L_d

In [46]:
def compute_L_avg(index_name, doc_ids_list):
    L_total = 0
    for i in range(len(doc_ids_list)):
        individual_term_vectors_dict = es.termvectors(index = index_name, id = doc_ids_list[i])
        
        current_length = len(dd['term_vectors']['Content']['terms'].keys())
        
        L_total += current_length
        
    L_avg = L_total/len(doc_ids_list)
    
    return L_avg        

In [38]:
# the number of documents in the collection is always 10k
N = 10000

# use pre-set values for k1 and b
k1 = 0.9
b = 0.4

### Compute ATIRE and Lucene score

In [39]:
import numpy as np
import pandas as pd

In [55]:
def get_df_ttf(query_text, index_name):
    
    # format the doc body for the es.termvectors() method
    doc = {
      "term_statistics" : True,
      "doc" : {
        "Content" : query_text
      }
    }

    result = es.termvectors(index = index_name, body = doc)
    
    terms_dic = result['term_vectors']['Content']['terms']
    
    total_df = result['term_vectors']['Content']['field_statistics']['sum_doc_freq']
    total_ttf = result['term_vectors']['Content']['field_statistics']['sum_ttf']
    # create a dictionary in which to save the df-value per term/token of query
    df_per_query_term = {}
    ttf_per_query_term = {}
    
    for term in terms_dic.keys():
        if 'doc_freq' in terms_dic[term]:
            df = terms_dic[term]['doc_freq'] / total_df
            ttf = terms_dic[term]['ttf'] / total_ttf
            
            df_per_query_term[str(term)] = df
            ttf_per_query_term[str(term)] = ttf
            
    return [df_per_query_term, ttf_per_query_term]

In [67]:
# Compute ATIRE score for each query and store them in a pandas dataframe
results_per_query = {}

## Results for query 1

In [None]:
for i in range(1,2):
    print(f'Starting computations for query: {i}')
    query_text = formatted_queries[i]
    index_name = '10k_query_' + str(i)
    
    b = 0.4
    k1 = 0.9
    N = 10000
    
    # ======== FIRST GET THE DATA THAT APPLIES TO THE ENTIRE CORPUS ========
    # get the original data from the es.search() method
    original_data = get_original_data(i)
    
    doc_ids_list = original_data[0]
    doc_original_scores = original_data[1]
    doc_original_contents = original_data[2]
    
    df_per_query = pd.DataFrame()
    df_per_query['doc_ids'] = doc_ids_list
    df_per_query['original_scores'] = doc_original_scores
    
    # Compute the average length of the documents:
    L_avg = compute_L_avg(index_name, doc_ids_list)
    
    # get the df-values for each term of the query (note: df values apply for the entire corpus of documents)
    [df_per_query_term, ttf_per_query_term] = get_df_ttf(query_text, index_name)
    
    # format the doc body for the es.termvectors() method
    doc_body = {
      "term_statistics" : True,
      "doc" : {
        "Content" : query_text
      }
    }
    
    # ======== ======== ======== ======== ======== ======== ======== ========
    
    # ======== NEXT GET THE DATA THAT IS COMPUTED PER DOCUMENT ========
   
    
    # for each document in the ranking of the query, re-compute the scores
    ATIRE_scores = []
    Lucene_accurate_scores = []
    
    for j in tqdm(range(len(doc_ids_list))):
        ATIRE_score_per_doc = 0
        Lucene_accurate_score_per_doc = 0
        
        docID = doc_ids_list[j]
        doc_content = doc_original_contents[j]
        
        # compute L_d
        L_d = compute_L_d(index_name, docID)
        
        # index the document content into a 'disposable index'
        es.indices.delete(index = 'disposable')
        es.indices.create(index = 'disposable', body = index_body)
        es.index(index = 'disposable', body = doc_content)

        disposable_result = es.termvectors(index = "disposable", body = doc_body)
        disposable_dict = disposable_result['term_vectors']['Content']['terms']
        
        for term in disposable_dict.keys():
            if term in df_per_query_term:
                df = df_per_query_term[term]
                ttf = ttf_per_query_term[term]
                
                tf = disposable_dict[term]['term_freq'] / ttf
    
                # Compute ATIRE score
                x1_ATIRE = np.log(N/df)
                x2_ATIRE = (tf*(k1+1)) / (tf + k1*(1 - b + b*(L_d/L_avg)))
                
                ATIRE_score_per_doc += x1_ATIRE * x2_ATIRE
                
                # Compute Lucene Accurate score
                x1_lucene = np.log(1+ (N - df + 0.5)/(df + 0.5))
                x2_lucene = tf / (tf + k1*(1 - b + b*(L_d/L_avg)))
                Lucene_accurate_score_per_doc += x1_lucene * x2_lucene
                
                
        
        # save the score per document:
        ATIRE_scores.append(ATIRE_score_per_doc) 
        Lucene_accurate_scores.append(Lucene_accurate_score_per_doc)
#         print(f"ATIRE_score_per_doc: {ATIRE_score_per_doc}, Lucene_accurate_score_per_doc: {Lucene_accurate_score_per_doc}")

#         es.indices.delete(index = 'disposable')
        
    df_per_query['ATIRE_scores'] = ATIRE_scores
    df_per_query['Lucene_accurate_scores'] = Lucene_accurate_scores
    
    query_ID = 'query' + str(i)
    results_per_query[query_ID] = df_per_query

In [64]:
df_query1 = results_per_query['query1']

In [66]:
df_query1.to_csv('./query1_results.csv')

### Results for query 2

In [73]:
for i in range(2,3):
    print(f'Starting computations for query: {i}')
    query_text = formatted_queries[i]
    index_name = '10k_query_' + str(i)
    
    b = 0.4
    k1 = 0.9
    N = 10000
    
    # ======== FIRST GET THE DATA THAT APPLIES TO THE ENTIRE CORPUS ========
    # get the original data from the es.search() method
    original_data = get_original_data(i)
    
    doc_ids_list = original_data[0]
    doc_original_scores = original_data[1]
    doc_original_contents = original_data[2]
    
    df_per_query = pd.DataFrame()
    df_per_query['doc_ids'] = doc_ids_list
    df_per_query['original_scores'] = doc_original_scores
    
    # Compute the average length of the documents:
    L_avg = compute_L_avg(index_name, doc_ids_list)
    
    # get the df-values for each term of the query (note: df values apply for the entire corpus of documents)
    [df_per_query_term, ttf_per_query_term] = get_df_ttf(query_text, index_name)
    
    # format the doc body for the es.termvectors() method
    doc_body = {
      "term_statistics" : True,
      "doc" : {
        "Content" : query_text
      }
    }
    
    # ======== ======== ======== ======== ======== ======== ======== ========
    
    # ======== NEXT GET THE DATA THAT IS COMPUTED PER DOCUMENT ========
   
    
    # for each document in the ranking of the query, re-compute the scores
    ATIRE_scores = []
    Lucene_accurate_scores = []
    
    for j in tqdm(range(len(doc_ids_list))):
        ATIRE_score_per_doc = 0
        Lucene_accurate_score_per_doc = 0
        
        docID = doc_ids_list[j]
        doc_content = doc_original_contents[j]
        
        # compute L_d
        L_d = compute_L_d(index_name, docID)
        
        # index the document content into a 'disposable index'
        es.indices.delete(index = 'disposable')
        es.indices.create(index = 'disposable', body = index_body)
        es.index(index = 'disposable', body = doc_content)

        disposable_result = es.termvectors(index = "disposable", body = doc_body)
        disposable_dict = disposable_result['term_vectors']['Content']['terms']
        
        for term in disposable_dict.keys():
            if term in df_per_query_term:
                df = df_per_query_term[term]
                ttf = ttf_per_query_term[term]
                
                tf = disposable_dict[term]['term_freq'] / ttf
    
                # Compute ATIRE score
                x1_ATIRE = np.log(N/df)
                x2_ATIRE = (tf*(k1+1)) / (tf + k1*(1 - b + b*(L_d/L_avg)))
                
                ATIRE_score_per_doc += x1_ATIRE * x2_ATIRE
                
                # Compute Lucene Accurate score
                x1_lucene = np.log(1+ (N - df + 0.5)/(df + 0.5))
                x2_lucene = tf / (tf + k1*(1 - b + b*(L_d/L_avg)))
                Lucene_accurate_score_per_doc += x1_lucene * x2_lucene
                
                
        
        # save the score per document:
        ATIRE_scores.append(ATIRE_score_per_doc) 
        Lucene_accurate_scores.append(Lucene_accurate_score_per_doc)
#         print(f"ATIRE_score_per_doc: {ATIRE_score_per_doc}, Lucene_accurate_score_per_doc: {Lucene_accurate_score_per_doc}")

#         es.indices.delete(index = 'disposable')
        
    df_per_query['ATIRE_scores'] = ATIRE_scores
    df_per_query['Lucene_accurate_scores'] = Lucene_accurate_scores
    
    query_ID = 'query' + str(i)
    results_per_query[query_ID] = df_per_query

Starting computations for query: 2


100%|██████████| 9999/9999 [2:51:48<00:00,  1.03s/it]  


In [77]:
df_query2 = results_per_query['query2']

In [79]:
df_query2.to_csv('./query2_results.csv')