# Matching ClinicalTrials.gov trial criteria by query terms

### Query Terms
- 以法國五病例為查詢關鍵字來源
- 檔案為 query_tokens.txt

### Criteria Content
- 以"COVID-19"搜尋ClinicalTrial的Search Results為資料來源
- 以 extract_label.ipynb 產生criteria資料夾下的檔案
- 以BM25計算各文件與query terms的相似性
- 依照分數排序取出前5名

In [1]:
import sys
import os
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import re
import STree
from tqdm import tqdm
from collections import Counter
from gensim import corpora
from gensim.summarization import bm25
from nltk.stem.porter import PorterStemmer

import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import Phrases

In [2]:
def load_query_tokens():
    """Return the non-blank lines of ``query_tokens.txt``, lower-cased.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped. File order is preserved.
    """
    with open('query_tokens.txt') as handle:
        stripped = (raw.strip() for raw in handle)
        # Build the list while the file is still open: ``stripped`` is lazy.
        return [entry.lower() for entry in stripped if entry]

def load_query_texts():
    """Return the content of ``query_tokens.txt`` as one string.

    Every line is stripped of surrounding whitespace and followed by a
    single space, so blank lines contribute one space and a non-empty file
    yields a string that ends with a space.
    """
    with open('query_tokens.txt') as handle:
        return ''.join(raw.strip() + ' ' for raw in handle)

def load_all_ids():
    """Return the trial ids listed in ``process_results/all_ids.txt``.

    The file is read one id per line. Surrounding whitespace is stripped
    and blank lines are skipped, so a stray empty line does not turn into an
    empty id (which ``get_criteria_by_id`` would resolve to the non-existent
    files ``./criteria/.inc`` and ``./criteria/.exc``).

    Returns:
        list of str: the ids in file order.
    """
    with open('process_results/all_ids.txt') as f:
        stripped = (line.strip() for line in f)
        # Build the list while the file is still open: ``stripped`` is lazy.
        return [nct_id for nct_id in stripped if nct_id]

def load_file(file):
    """Return the entire content of the text file at path ``file``."""
    with open(file) as source:
        return source.read()

In [3]:
def get_criteria_by_id(nct_id):
    """Load the inclusion and exclusion criteria text of one trial.

    Reads ``<nct_id>.inc`` and then ``<nct_id>.exc`` from ``./criteria`` and
    replaces every newline in each with a single space.

    Returns:
        tuple: ``(inclusion_text, exclusion_text)``.
    """
    dataset_path = './criteria'
    inclusion, exclusion = (
        load_file(os.path.join(dataset_path, nct_id + suffix)).replace('\n', ' ')
        for suffix in ('.inc', '.exc')
    )
    return (inclusion, exclusion)

def get_all_criteria():
    """Return one combined criteria string per trial in ``id_list``.

    For every id in the module-level ``id_list`` the inclusion and exclusion
    texts from ``get_criteria_by_id`` are joined with a single space. The
    result keeps the order of ``id_list``.
    """
    return [
        inclusion + " " + exclusion
        for inclusion, exclusion in map(get_criteria_by_id, id_list)
    ]

In [4]:
def preprocess(text):
    """Tokenize ``text`` and return the lemmatized, stopword-free tokens.

    Tokens come from ``gensim.utils.simple_preprocess``. Tokens in gensim's
    ``STOPWORDS`` set or with a length of 1 or less are discarded. Each
    remaining token is lemmatized as a verb and then the result is
    lemmatized as a noun.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    lemmas = []
    for word in gensim.utils.simple_preprocess(text):
        if word in stopwords or len(word) <= 1:
            continue
        as_verb = wordnet_lemmatizer.lemmatize(word, pos='v')
        lemmas.append(wordnet_lemmatizer.lemmatize(as_verb, pos='n'))
    return lemmas

### 讀取預處理好的檔案

In [5]:
# Trial ids are read from process_results/all_ids.txt; the query text is the
# content of query_tokens.txt joined into a single string.
id_list = load_all_ids()
query_texts = load_query_texts()

### query terms的處理

In [6]:
# Run the query text through the same preprocess() that is applied to the
# trial criteria, so query and documents are tokenized identically.
processed_query = preprocess(query_texts)
print(processed_query)

['fever', 'cough', 'conjunctivitis', 'diarrhoea', 'shortness', 'breath', 'white', 'blood', 'cell', 'neutrophil', 'lymphocyte', 'haemoglobin', 'platelet', 'prothrombin', 'time', 'albumin', 'creatinine', 'kinase', 'alanine', 'aspartate', 'bilirubin', 'sodium', 'potassium', 'urea', 'creatinine', 'reactive', 'protein', 'lactate', 'chest', 'ray']


### 取出所有 trials criteria

In [7]:
# One combined inclusion + exclusion text per trial, in id_list order.
criteria_list = get_all_criteria()
# Report how many trials were loaded.
print('共',len(criteria_list),'篇試驗\n')

共 181 篇試驗



In [8]:
# Pair every trial id with its combined criteria text, one row per trial.
documents = pd.DataFrame(list(zip(id_list,criteria_list)),columns=['id','criteria'])
# Keep the row label as an ordinary column as well.
documents['index'] = documents.index
documents[:10]  # preview the first 10 rows

Unnamed: 0,id,criteria,index
0,NCT02735707,: 1. Adult patients admitted to an IC...,0
1,NCT03331445,- Written informed consent. ...,1
2,NCT03680274,1. Admitted to the intensive care ...,2
3,NCT03808922,"1. At the time of randomization, r...",3
4,NCT04244591,- Adult - PCR confirm...,4
5,NCT04251871,- Confirmed 2019-nCoV infection c...,5
6,NCT04252274,- The participants were diagnosed...,6
7,NCT04252664,1. Age >=18 years at time of signi...,7
8,NCT04254874,1)2019-nCoV nucleic acid test was po...,8
9,NCT04255017,1. 2019-nCoV nucleic acid test was...,9


In [9]:
# Tokenize / lemmatize every criteria text; the result is a Series holding
# one token list per trial.
processed_docs = documents['criteria'].map(preprocess)
processed_docs[:10]  # preview the first 10 token lists

0    [adult, patient, admit, icu, severe, cap, hour...
1    [write, inform, consent, previously, diagnose,...
2    [admit, intensive, care, unit, prove, suspect,...
3    [time, randomization, require, supplemental, o...
4    [adult, pcr, confirm, noval, coronavirus, infe...
5    [confirm, ncov, infection, case, term, laborat...
6    [participant, diagnose, pneumonia, cause, ncov...
7    [age, year, time, sign, inform, consent, form,...
8    [ncov, nucleic, acid, test, positive, ct, lung...
9    [ncov, nucleic, acid, test, positive, ct, lung...
Name: criteria, dtype: object

### 加入 bigram terms

In [10]:
# Train a phrase (bigram) detector on the tokenized trial criteria.
# NOTE(review): min_count=3 presumably ignores pairs seen fewer than three
# times -- confirm against the gensim Phrases documentation.
bigram = Phrases(processed_docs, min_count=3)

# Extend the query with every detected phrase. Phrases are recognised by the
# '_' that joins their two words; the unigrams stay in the query as well.
for token in bigram[processed_query]:
    if '_' not in token:
        continue
    print(token)
    processed_query.append(token)

print(processed_query)

# Do the same for each trial: append the detected phrases to its token list
# in place, so processed_docs keeps both unigrams and phrases.
for doc_tokens in processed_docs:
    doc_tokens.extend([token for token in bigram[doc_tokens] if '_' in token])

fever_cough
shortness_breath
chest_ray
['fever', 'cough', 'conjunctivitis', 'diarrhoea', 'shortness', 'breath', 'white', 'blood', 'cell', 'neutrophil', 'lymphocyte', 'haemoglobin', 'platelet', 'prothrombin', 'time', 'albumin', 'creatinine', 'kinase', 'alanine', 'aspartate', 'bilirubin', 'sodium', 'potassium', 'urea', 'creatinine', 'reactive', 'protein', 'lactate', 'chest', 'ray', 'fever_cough', 'shortness_breath', 'chest_ray']


### bm25模型

In [11]:
# Build the BM25 index over the tokenized (unigram + phrase) trial criteria.
bm25Model = bm25.BM25(processed_docs)

# Mean IDF over every term known to the model. It is only printed here; no
# later cell visible in this notebook reads average_idf.
average_idf = sum(float(value) for value in bm25Model.idf.values()) / len(bm25Model.idf)
print(average_idf)

3.88417682378272


### 分別計算每個 trial 的分數後，依分數排序取前5名

In [12]:
# Score every trial against the processed query; the printed length shows
# how many scores came back.
scores = bm25Model.get_scores(processed_query)
print(len(scores))
#print('scores :',scores)

# Attach the scores to the trial table as a new 'score' column and preview
# the first 10 rows.
documents['score'] = scores
print(documents[:10])

181
            id                                           criteria  index  \
0  NCT02735707  :          1. Adult patients admitted to an IC...      0   
1  NCT03331445              -  Written informed consent.      ...      1   
2  NCT03680274              1. Admitted to the intensive care ...      2   
3  NCT03808922              1. At the time of randomization, r...      3   
4  NCT04244591              -  Adult            -  PCR confirm...      4   
5  NCT04251871              -  Confirmed 2019-nCoV infection c...      5   
6  NCT04252274              -  The participants were diagnosed...      6   
7  NCT04252664              1. Age >=18 years at time of signi...      7   
8  NCT04254874            1)2019-nCoV nucleic acid test was po...      8   
9  NCT04255017              1. 2019-nCoV nucleic acid test was...      9   

       score  
0   0.000000  
1  12.990113  
2   0.000000  
3   6.489960  
4   0.000000  
5  15.768209  
6   0.000000  
7   4.688161  
8   0.000000  
9   0.000

### 顯示最佳配對結果

In [13]:
# Rank the trials from highest to lowest BM25 score and keep the best five.
sorted_documents = documents.sort_values(by='score', ascending=False)
top5_documents = sorted_documents.head(5)

# Show only the ids of the five best-matching trials.
top5_id = top5_documents['id']
print(top5_id)

134    NCT04331665
180    NCT04337359
66     NCT04313023
149    NCT04333472
5      NCT04251871
Name: id, dtype: object
