# ClinicalTrial criteria matching by query terms

### Query Terms
- 以法國五病例為查詢關鍵字來源
- 檔案為 query_tokens.txt

### Criteria Labels
- 以"COVID-19"搜尋ClinicalTrial的Search Results為資料來源
- 以 extract_label.ipynb 產生criteria和labels資料夾下的檔案
- 該程式主要部分以 COVID-19-proc.ipynb 改編

In [1]:
import sys
import os
import xml.etree.ElementTree as ET
import re
import STree
from tqdm import tqdm
from collections import Counter

In [2]:
def load_query_tokens():
    """Read the query terms from ``query_tokens.txt``.

    Every line is stripped of surrounding whitespace and lower-cased;
    lines that are empty after stripping are skipped.

    Returns:
        list of str: the normalized query tokens, in file order.
    """
    with open('query_tokens.txt') as token_file:
        stripped = (raw.strip() for raw in token_file)
        return [entry.lower() for entry in stripped if entry]

In [3]:
def load_all_ids():
    """Read every line of ``process_results/all_ids.txt``.

    Returns:
        list of str: the raw lines of the file, each still carrying its
        trailing newline (callers strip them as needed).
    """
    with open('process_results/all_ids.txt') as id_file:
        return list(id_file)

In [4]:
def get_label_from_file(file):
    """Read a label file and return its lines.

    Args:
        file: path of the label file to read.

    Returns:
        list of str: the lines of the file (newlines kept). If ``file`` is
        not an existing regular file, a message is printed and an empty
        list is returned.
    """
    if not os.path.isfile(file):
        print(file + ' does not exist.')
        # Return an empty list instead of falling through to an implicit
        # None: the label collections are iterated by the matching code,
        # and iterating None raises TypeError.
        return []

    with open(file, 'r') as f:
        return f.readlines()

In [5]:
def get_all_labels():
    """Load the label lines for every trial listed in the global ``all_ids``.

    Each ID is stripped of surrounding whitespace and mapped to the file
    ``./labels/<id>.txt``, which is read with ``get_label_from_file``.

    Returns:
        list of tuple: ``(trial_id, labels)`` pairs in the order of
        ``all_ids``, where ``labels`` is whatever ``get_label_from_file``
        returned for that trial.
    """
    label_dir = './labels'
    collected = []
    for raw_id in all_ids:
        trial_id = raw_id.strip()
        label_path = os.path.join(label_dir, trial_id + '.txt')
        collected.append((trial_id, get_label_from_file(label_path)))
    return collected

In [6]:
def check_token_exist(token):
    """Find the trials whose labels mention ``token``.

    Scans the global ``label_list`` of ``(trial_id, labels)`` pairs and
    keeps a trial as soon as one of its label lines contains ``token`` as
    a substring.

    Args:
        token: the query string to look for.

    Returns:
        list: the IDs of the matching trials, each at most once, in
        ``label_list`` order.
    """
    return [
        entry[0]
        for entry in label_list
        if any(token in line for line in entry[1])
    ]

In [7]:
def check_token_in_trial(token,labels):
    """Tell whether ``token`` occurs in any label line of one trial.

    Args:
        token: the query string to look for.
        labels: iterable of label strings belonging to a single trial.

    Returns:
        bool: True if at least one label contains ``token`` as a
        substring, otherwise False (including for an empty ``labels``).
    """
    return any(token in line for line in labels)

In [8]:
def load_file(file):
    """Return the entire content of the text file at ``file`` as one string."""
    with open(file) as handle:
        return handle.read()

In [9]:
def get_criteria_by_id(nct_id):
    """Load the two criteria texts stored for one trial.

    Reads ``./criteria/<nct_id>.inc`` first and ``./criteria/<nct_id>.exc``
    second, both through ``load_file``.

    Args:
        nct_id: trial identifier used as the file stem.

    Returns:
        tuple: ``(inc_text, exc_text)``, the contents of the ``.inc`` and
        ``.exc`` files respectively.
    """
    criteria_dir = './criteria'
    inc_text, exc_text = (
        load_file(os.path.join(criteria_dir, nct_id + suffix))
        for suffix in ('.inc', '.exc')
    )
    return (inc_text, exc_text)

In [10]:
def get_query_tokens_by_feature_array(arr):
    """Render the query tokens flagged in a 0/1 feature array as text.

    Args:
        arr: sequence whose i-th value is 1 when the i-th entry of the
            global ``query_tokens`` matched.

    Returns:
        str: the flagged tokens in index order, each followed by ``', '``
        (so a non-empty result ends with a trailing separator); an empty
        string when nothing is flagged.
    """
    flagged = (query_tokens[pos] for pos, flag in enumerate(arr) if flag == 1)
    return ''.join(name + ', ' for name in flagged)

In [11]:
def get_labels_by_id(nct_id):
    """Return the raw text of ``./labels/<nct_id>.txt`` for the given trial ID."""
    return load_file('./labels/' + nct_id + '.txt')

### 讀取預處理好的檔案

In [12]:
# Trial IDs read from process_results/all_ids.txt (lines keep their newline).
all_ids = load_all_ids()
# (trial_id, label_lines) pairs; must run after all_ids is set, because
# get_all_labels reads that global.
label_list = get_all_labels()
# Lower-cased query terms read from query_tokens.txt.
query_tokens = load_query_tokens()

### 分別計算 query token 出現在幾個 trial 中

In [13]:
# For every query token, collect the IDs of the trials whose labels mention it.
query_results = [check_token_exist(t) for t in query_tokens]

# Report "<index> : <token>  =>  <number of matching trials>" per token.
for i, (t, match_list) in enumerate(zip(query_tokens, query_results)):
    print(i,":",t," => ", len(match_list))

0 : fever  =>  7
1 : cough  =>  5
2 : conjunctivitis  =>  0
3 : diarrhoea  =>  0
4 : shortness of breath  =>  0
5 : white blood cell  =>  0
6 : neutrophil  =>  0
7 : lymphocyte  =>  8
8 : haemoglobin  =>  0
9 : platelet  =>  0
10 : prothrombin time  =>  0
11 : albumin  =>  0
12 : creatinine kinase  =>  0
13 : alanine aminotransferase  =>  14
14 : aspartate aminotransferase  =>  16
15 : total bilirubin  =>  12
16 : sodium  =>  0
17 : potassium  =>  0
18 : urea  =>  0
19 : creatinine  =>  20
20 : c-reactive protein  =>  0
21 : lactate  =>  0
22 : chest x-ray  =>  11
23 : bilateral pneumonia  =>  1


### 分別計算每個 trial 的分數後，依分數排序取前5名

In [14]:
score_result = []

# Score every trial: build one 0/1 flag per query token (1 when the token
# occurs in the trial's labels); the score is the number of flags set.
for tp in label_list:
    the_id = tp[0]
    labels = tp[1]

    feature_array = []
    for token in query_tokens:
        # check_token_in_trial already returns a bool, so test it directly
        # instead of comparing it with `== True`.
        feature_array.append(1 if check_token_in_trial(token,labels) else 0)

    # Each entry is (trial_id, score, feature_array).
    score_result.append((the_id,sum(feature_array),feature_array))

In [15]:
# Rank the trials by score, highest first.
score_result.sort(key = lambda s: s[1], reverse = True)

# Show the five best trials. Slicing (rather than indexing over range(5))
# avoids an IndexError when fewer than five trials were scored. The
# criteria / token / label lookups that used to run here were never used
# by the output, so that per-iteration file I/O has been dropped.
for r in score_result[:5]:
    print(r[0]," => ",r[1])
    print(r[2],"\n")

NCT04335305  =>  5
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0] 

NCT04260594  =>  4
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0] 

NCT04331665  =>  4
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0] 

NCT04334460  =>  4
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0] 

NCT03808922  =>  3
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0] 



### 檢視單一試驗的內容

In [16]:
# Trial to inspect, with its 0/1 feature array (one flag per query token).
case_id = 'NCT04335305'
feature_array = [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0]

c = get_criteria_by_id(case_id)                       # (.inc text, .exc text)
t = get_query_tokens_by_feature_array(feature_array)  # matched query tokens
l = get_labels_by_id(case_id)                         # raw label file content

# Matched tokens, labels, then the .inc and .exc criteria texts, separated
# by '---' lines and followed by blank lines.
print(t, '---', l, '---', c[0], '---', c[1], '\n\n', sep='\n')

lymphocyte, alanine aminotransferase, aspartate aminotransferase, creatinine, chest x-ray, 
---
[7, 'informed consent form', 'compliant = yes']
[33, 'prior to', 'time = before']
[42, 'participati', 'compliant = yes']
[99, 'pregnant', 'feature = pregnant']
[115, 'patients', 'connector']
[124, '>= 18 years', 'age >= 18 years old']
[127, '18 years', 'value = 18 years old']
[139, 'the time of', 'feature = time']
[163, 'laboratory confirm', 'value = yes | modifier = laboratory']
[174, 'confirmed covid-19', 'covid19 = yes']
[184, 'covid-19 infection', 'feature = covid19']
[193, 'infection', 'feature = infection']
[216, 'positive', 'value = yes']
[273, 'rt-pcr from any', 'modifier = rtpcr']
[285, 'any specimen', 'value = yes']
[289, 'specimen', 'feature = specimen']
[315, 'of sars-cov-2', 'covid19 = yes']
[375, 'confirmation of', 'value = yes']
[388, 'of pneumonia', 'pneumonia = yes']
[391, 'pneumonia', 'feature = pneumonia']
[411, 'chest x-ray', 'feature = xray']
[474, 'acute respiratory', '