# Read the csv file

In [1]:
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer, util
import pickle
import torch

> This is for initial analysis you can go to "Process stage 1 results and go to stage 2."

In [2]:

csv_file_name = 'elasticsearch_files/test_yn_large_mod_which_scispacy.csv' 
trials_df = pd.read_csv(csv_file_name, 
                 dtype={'#ncd_id':str, 
                        'title':str, 
                        'has_us_facility':bool, 
                        'conditions': str, 
                        'eligibility_criteria': str, 
                        'gender':str, 
                        'minimum_age':str, 
                        'maximum_age':str},
                 )

trials_df = trials_df.rename(columns={'#ncd_id':'nct_id'})
trials_df.index = trials_df.iloc[:,0] # index by nct_icds

In [3]:
print(f'Total clinical trials: {len(trials_df)}')

Total clinical trials: 451538


In [4]:
print(trials_df.columns)

Index(['nct_id', 'title', 'has_us_facility', 'conditions',
       'eligibility_criteria', 'gender', 'minimum_age', 'maximum_age', 'no',
       'yes'],
      dtype='object')


# Stage 1 Results (used to get stub results for stage 2)

In [9]:
disorders_cui = [{
    'condition': 'rheumatoid arthritis', # topic 26-30
    'condition_cui': 'C0003873'}]

def compile_queries_for_stage_1_ranking():
    """return trials if disorder present in both the 'condition' column and disorder CUI in 'yes'  
    i.e. (negated concept in exclusion + concept in inclusion)
    
    returns: Tuple(condition, query)
    """

    for item in disorders_cui:
        yield (item['condition'], f"yes.str.contains('{item['condition_cui']}') &  \
            conditions.str.lower().str.contains('{item['condition']}')")


In [10]:
stage1_query_tuples = list(compile_queries_for_stage_1_ranking())

def search_queries(df,query_tuples):
    for query in query_tuples:
        cond,q = query
        yield {'condition':cond, 'relevant_nct_ids': df.query(q).iloc[:,0].tolist()}
    

In [11]:
# we only have 1 query for now,

results = list(search_queries(trials_df,stage1_query_tuples))
    

In [12]:
def itemize(criteria: str):
    """Split the inclusion/exclusion criteria into individual items"""
    if criteria is None:
        return None
    else:
        # Regex spliting rules:
        # {1 or more blank space}{'-' character}{1 or 2 blank space}
        # OR
        # {1 or more blank space}{1 or 2 digit/letter}{'.' character}{one or more blank space}
        items = re.split(r'\s+-\s{1,2}|\s+\w{1,2}\.\s+', criteria)[1:]
        # strip leading and trailing space and remove extra spaces for each item.
        clean_items = [re.sub('\s+', ' ', item.strip()) for item in items]
        return clean_items


def itemize_criteria(eligibility: str):
    """Important: We assume for a trial both inclusion and exclusion are given, 
    If any trail does not include both inclusion and exclusion both it is returned as NONE.

    The total number of criteria w/o inclusion and exclusion is considered to be minimal and 
    should not effect the outcome much.
    """

    regex = 'inclusion criteria([.\w\W]*)exclusion criteria([.\w\W]*)'
    match = re.search(regex, eligibility, flags=re.IGNORECASE)
    inc, exc = match.groups() if match else (None, None)

    return {'inclusion': itemize(inc), 'exclusion': itemize(exc)}
    

In [13]:
def get_relevant_trials_inclusion_exclusion(trials_df,stage1_results):
    for output in stage1_results:
        
        relevant_ids = output['relevant_nct_ids']
        relavant_trial = trials_df.loc[relevant_ids]
        relevant_trials = relavant_trial.apply(lambda x: {'nct_id':x.nct_id}|itemize_criteria(x.eligibility_criteria),axis=1)
        
        return relevant_trials.tolist()
    

In [14]:
expected_output_stage1 = get_relevant_trials_inclusion_exclusion(trials_df,results)
expected_output_stage1 = list(filter(lambda x: x['inclusion'] is not None and len(x['inclusion']) > 0 ,expected_output_stage1))


Expected output format from stage 1 of retrieval for each topic.

In [18]:
expected_output_stage1 # list of dicts

[{'nct_id': 'NCT04227535',
  'inclusion': ['Age ≥18 years',
   'RA diagnosis according to ACR (classification of rheumatoid arthritis) 1987 and/or ACR/EULAR 2010 criteria, validated by a senior rheumatologist',
   'ILD diagnosis based on typical High-Resolution Computed-Tomography (HCRT) patterns and pulmonary function tests (PFT), validated by a senior pulmonologist Exclusion Criteria:',
   'Lack of signed informed consent SELECTION OF CONTROLS: Inclusion Criteria:',
   'Age ≥18 years',
   'RA diagnosis according to ACR 1987 and/or ACR/EULAR 2010 criteria, validated by a senior rheumatologist'],
  'exclusion': ['Lack of signed informed consent']},
 {'nct_id': 'NCT04226131',
  'inclusion': ['Rheumatoid arthritis participants will all be seropositive (positive rheumatoid factor or anti-citrullinated protein antibody) or with erosions typical of RA on radiographs.',
   'Meet 2010 American College of Rheumatology/European League Against Rheumatism (ACR/EULAR) Classification Criteria for',

In [20]:
expected_output_stage1[0].keys()

dict_keys(['nct_id', 'inclusion', 'exclusion'])

# 1. Process the results and prepare for stage 2

In [2]:
# read from pickle file

file_name = 'final_results.p'

with open(file_name, 'rb') as file_name:
    final_results = pickle.load(file_name)
    
stage1_results = pd.DataFrame(final_results)
stage1_results = stage1_results.groupby('patient_id')

 These are the inclusion/ Exclusion criteria for the 'rheumatoid arthritis', which is shown by topic 26-30

In [3]:
# read the topic sentences.
sentified_topics = pd.read_csv('clean_topic_sentences.csv')
sentified_topics.sentences = sentified_topics.sentences.apply(eval) # reading the str as python lists

def get_topic_sentences_from_disorder(topic_sentence_df,disorder):
    return topic_sentence_df[(topic_sentence_df.Template == disorder)]

# since our sample stage1 results are for rheumatoid arthritis
# relevant_patient_topic = get_topic_sentences_from_disorder(sentified_topics,results[0]['condition'])

In [4]:
sentified_topics.loc[:,'patient_id'] = sentified_topics.loc[:,'Topic Number']
sentified_topics.index = sentified_topics.patient_id


In [5]:
def weighted_relevance_scoring(inclusion_matrix, exclusion_matrix=None):
    """Exclusion weighted more than inclusion criteria, takes the maximum value in exclusion matrix
    and subtracts from average cosine similarities in inclusion matrix
    Negative score indicate topic sentences matched exclusion more than inclusion."""

    # inclusion.shape: topic_sentences x inclusion_sentences
    # exclusion.shape: topic_sentences x exclusion_sentences
    exclusion_score = 0
    inclusion_score = 0
    # Average inclusion similarity
    if inclusion_matrix is not None and inclusion_matrix.shape[1] > 0:
        inclusion_score = inclusion_matrix.mean()

    if exclusion_matrix is not None and exclusion_matrix.shape[1] > 0:
        # Maximum exclusion similarity
        exclusion_score = exclusion_matrix.max()
    combined_score = inclusion_score - exclusion_score
    return combined_score


def balanced_relevance_scoring(inclusion_matrix, exclusion_matrix):
    """Negative score indicate topic sentences matched exclusion more than inclusion."""
    
    
    # inclusion.shape: topic_sentences x inclusion_sentences
    # exclusion.shape: topic_sentences x exclusion_sentences
    exclusion_score = 0
    inclusion_score = 0
    # Average inclusion similarity
    if inclusion_matrix is not None and inclusion_matrix.shape[1] > 0:
        inclusion_score = inclusion_matrix.mean()

    if exclusion_matrix is not None and exclusion_matrix.shape[1] > 0:
        # mean exclusion similarity
        exclusion_score = exclusion_matrix.mean()
    combined_score = inclusion_score - exclusion_score
    return combined_score

def naive_high_precision_scoring(inclusion_matrix, exclusion_matrix=None, threshold=0.5):
    """Score is computed as follows:
    1. For each inclusion criteria, if it is satisfied by one or more topic sentence then it is considered as satisfied.
    2. Each satisfied inclusion criteria is given a score of 1.
    3. Final score is the normalized inclusion score (0-1) + exclusion score (if any exclusion criteria is satisfied then -1 else 0).
    # Negative score indicates that the trial is excluded because of one or more exclusion criteria.
    """

    # inclusion.shape: topic_sentences x inclusion_sentences
    # exclusion.shape: topic_sentences x exclusion_sentences

    exclusion_score = 0
    inclusion_score = 0
    if exclusion_matrix is not None and exclusion_matrix.shape[1] > 0:
        if (exclusion_matrix > threshold).any():  # any_exclusion_matched
            exclusion_score = -1
    # satisfy all inclusion criteria atleast once
    if inclusion_matrix is not None and inclusion_matrix.shape[1] > 0:
        inclusion_satisfied_atleast_once = (inclusion_matrix > threshold).any(1)
        inclusion_score = np.array(inclusion_satisfied_atleast_once).mean()

    final_score = inclusion_score + exclusion_score

    return torch.tensor([final_score])


def compute_score_matrix(model, topic_sent, inclusion_sent, exclusion_sent):

    topic_encoding = model.encode(topic_sent)
    
    cosine_matrix_t_e = None
    cosine_matrix_t_i = None
    
    if inclusion_sent:
        inc_encoding = model.encode(inclusion_sent)
        cosine_matrix_t_i = util.cos_sim(topic_encoding, inc_encoding)
        
    
    if exclusion_sent:  # len = 0
        exc_encoding = model.encode(exclusion_sent)
        cosine_matrix_t_e = util.cos_sim(topic_encoding, exc_encoding)
    

    return {'inclusion_matrix': cosine_matrix_t_i, 'exclusion_matrix': cosine_matrix_t_e}


In [6]:
### sorting does not work  fix later
def rank_trials(model, score_functions, topic_sent, stage1_output):
    output = {}
    trial_scores = {func.__name__: [] for func in score_functions}
    for item in tqdm(stage1_output):
        # compute a score for each NCT_ID
        nct_id, inc_sents, exc_sents,p_id =  item['ncd_id'],item['inclusion'] ,item['exclusion'],item['patient_id']
        score_matrix = compute_score_matrix(model,topic_sent,inc_sents, exc_sents)
        
        scores = {func.__name__ :func(**score_matrix).item() for func in score_functions}
        
        
    
    # trial_scores.append({'nct_id': nct_id, 'topic_no':p_id}|scores)
    for func in score_functions:
        # trial_scores[func.__name__].append({'nct_id': nct_id, 'topic_no':p_id, f'{func.__name__}':})
        trial_scores.sort(key=lambda x: x[func.__name__], reverse=True)
        output[func.__name__] = trial_scores
        
    return output
   



In [8]:
zero_idx = pd.DataFrame([{'inc_len': len(i['inclusion']),'exc_len':len(i['exclusion'])} for i in final_results ]).query('inc_len==0').index

In [58]:
zero_idx.values

array([14035, 16150, 17988, 22091, 39979, 41088])

In [7]:
score_functions_list = [weighted_relevance_scoring, balanced_relevance_scoring, naive_high_precision_scoring]
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
output = {func.__name__:[] for func in score_functions_list}

for p_id in range(1,41):        
    candiate_trials_stage_1 = stage1_results.get_group(p_id).to_dict('records')
    topic_sent = sentified_topics.sentences.loc[p_id]
    ranked_trials =  rank_trials(model= model,
                                        score_functions= score_functions_list,
                                        topic_sent=topic_sent, 
                                        stage1_output=candiate_trials_stage_1)
    
    
    # ranked_trials by different scoring functions
    for func in score_functions_list:
        ranked_trials_by_score = ranked_trials[func.__name__] # lists
        # normalization
        for i in ranked_trials_by_score:
            i[func.__name__]  = (i[func.__name__] + 1)/2

        output[func.__name__] += ranked_trials_by_score
    


100%|██████████| 447/447 [00:08<00:00, 53.01it/s]
100%|██████████| 414/414 [00:06<00:00, 62.11it/s]
100%|██████████| 438/438 [00:06<00:00, 62.62it/s]
100%|██████████| 409/409 [00:06<00:00, 62.96it/s]
100%|██████████| 425/425 [00:06<00:00, 62.82it/s]
100%|██████████| 754/754 [00:12<00:00, 60.76it/s]
100%|██████████| 772/772 [00:12<00:00, 61.10it/s]
100%|██████████| 603/603 [00:09<00:00, 61.38it/s]
100%|██████████| 732/732 [00:11<00:00, 61.47it/s]
100%|██████████| 610/610 [00:09<00:00, 61.46it/s]
100%|██████████| 1705/1705 [00:27<00:00, 61.94it/s]
100%|██████████| 1000/1000 [00:16<00:00, 59.14it/s]
100%|██████████| 1742/1742 [00:28<00:00, 61.95it/s]
100%|██████████| 1757/1757 [00:28<00:00, 62.23it/s]
100%|██████████| 1000/1000 [00:16<00:00, 59.35it/s]
100%|██████████| 2050/2050 [00:36<00:00, 56.53it/s]
100%|██████████| 2113/2113 [00:35<00:00, 59.46it/s]
100%|██████████| 1954/1954 [00:33<00:00, 57.95it/s]
100%|██████████| 2139/2139 [00:36<00:00, 59.05it/s]
100%|██████████| 2002/2002 [00:3

# Trec Submission format


The format for run submissions follows the standard trec_eval format. Each line of the submission file should follow the form:

TOPIC_NO Q0 ID RANK SCORE RUN_NAME



where TOPIC_NO is the topic number (1–30), 0 is a required but ignored constant, ID is the identifier of the retrieved document (PMID or NCT ID), RANK is the rank (1–1000) of the retrieved document, SCORE is a floating point value representing the confidence score of the document, and RUN_NAME is an identifier for the run. The RUN_NAME is limited to 12 alphanumeric characters (no punctuation).

The file is assumed to be sorted numerically by TOPIC_NO, and SCORE is assumed to be greater for documents that should be retrieved first. For example, the following would be a valid line of a run submission file:

1 Q0 NCT00760162 1 0.9999 my-run
The above line indicates that the run named "my-run" retrieves for topic number 1 document NCT00760162 at rank 1 with a score of 0.9999.

In [38]:
output_df = pd.DataFrame(output['naive_high_precision_scoring'])
output_df = output_df.groupby('topic_no')
result_df = []
for i in range(1,41):
    result_df.append(output_df.get_group(i).sort_values(by='naive_high_precision_scoring',ascending=False))

In [28]:
def format_results_for_trec(outputs,score_key,run_name):
    ranked_df = outputs.groupby('topic_no')
    with open(f'{run_name}.txt','a') as f:
        for p_id in range(1,41):
            first1000 = ranked_df.get_group(p_id).iloc[:1000]
            for i, row in enumerate(first1000.iterrows()):
                row = row[1]
                line = f"{row['topic_no']} Q0 {row['nct_id']} {i+1} {row[score_key]:f>.3} {run_name}"
                # f.write(line)
                print(line,file=f)



In [68]:
pd.DataFrame(output['naive_high_precision_scoring']).sort_values(
    by=['topic_no','naive_high_precision_scoring'],
    ascending=[True,False])

Unnamed: 0,nct_id,topic_no,weighted_relevance_scoring,balanced_relevance_scoring,naive_high_precision_scoring
0,NCT03903588,1,0.636710,0.636710,0.928571
1,NCT00141882,1,0.680533,0.680533,0.785714
2,NCT00168350,1,0.680533,0.680533,0.785714
3,NCT04864834,1,0.663815,0.663815,0.785714
4,NCT00071773,1,0.639267,0.639267,0.785714
...,...,...,...,...,...
51902,NCT04151836,40,0.247894,0.447423,0.000000
51903,NCT04445181,40,0.326646,0.445102,0.000000
51904,NCT04029298,40,0.242694,0.439459,0.000000
51905,NCT01250639,40,0.218922,0.434591,0.000000


In [39]:
pd.concat(result_df)

Unnamed: 0,nct_id,topic_no,weighted_relevance_scoring,balanced_relevance_scoring,naive_high_precision_scoring
0,NCT03903588,1,0.636710,0.636710,0.928571
2,NCT00168350,1,0.680533,0.680533,0.785714
3,NCT04864834,1,0.663815,0.663815,0.785714
4,NCT00071773,1,0.639267,0.639267,0.785714
5,NCT00069056,1,0.625819,0.625819,0.785714
...,...,...,...,...,...
51876,NCT04702490,40,0.301562,0.497636,0.000000
51877,NCT01819597,40,0.293521,0.493993,0.000000
51878,NCT04215536,40,0.236693,0.493031,0.000000
51879,NCT00422357,40,0.221717,0.492814,0.000000


In [40]:
format_results_for_trec(pd.concat(result_df),'naive_high_precision_scoring',run_name = 'nr-ema3')

In [58]:
sentified_topics

Unnamed: 0_level_0,Topic Number,Template,sentences,patient_id
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,glaucoma,[Definitive diagnosis: The patient has primary...,1
2,2,glaucoma,[Definitive diagnosis: The patient has pigment...,2
3,3,glaucoma,[Definitive diagnosis: The patient has uveitic...,3
4,4,glaucoma,[Definitive diagnosis: The patient has primary...,4
5,5,glaucoma,[Definitive diagnosis: The patient has neovasc...,5
6,6,anxiety,[Definitive diagnosis: The patient has a defin...,6
7,7,anxiety,[Definitive diagnosis: The patient has a defin...,7
8,8,anxiety,[Definitive diagnosis: The patient does not ha...,8
9,9,anxiety,[Definitive diagnosis: The patient has a defin...,9
10,10,anxiety,[Definitive diagnosis: The patient has a defin...,10


In [59]:
sentified_topics.sentences.loc[10]

['Definitive diagnosis: The patient has a definitive diagnosis of anxiety.',
 'Age: The patient is 58 years old.',
 'Proficient languages: The patient is proficient in English and Turkish.',
 "SSASI: The patient's SSASI score was not specified.",
 'HAM-A: The patient scored 12 on the HAM-A.',
 "PHQ-9: The patient's PHQ-9 score was not specified.",
 'HAM-D: The patient scored 19 on the HAM-D.',
 'GAD-7: The patient scored 9 on the GAD-7.',
 'Beck Depression Inventory: The patient scored 8 on the Beck Depression Inventory.',
 'Suicidal ideation: The patient does not report experiencing suicidal ideation.',
 'Dementia: The patient has dementia']

In [65]:
list(filter(lambda x: x['ncd_id'] == 'NCT05274620',final_results))[0]

{'inclusion': ['Patient who have been identified as depressed or anxious by a care provider in one of the participating healthcare organizations.'],
 'exclusion': ['None '],
 'ncd_id': 'NCT05274620',
 'patient_id': 6}