In [None]:
pip install git+https://github.com/noc-lab/simple_sentence_segment.git

We highly recommend our [sentence segment tool](https://github.com/noc-lab/simple_sentence_segment) for detecting sentence boundaries when the text contains arbitrary line breaks, as in the sample text below. To install the package, run
```
pip install git+https://github.com/noc-lab/simple_sentence_segment.git
```
Alternatively, you can use the sentence segmentation tools in NLTK or spaCy. You can also use tokenization tools other than NLTK; this example uses NLTK for illustrative purposes.

## 1. simple_sentence_segment & Library Install

In [11]:
def parse_text(text):
    """Segment ``text`` into sentences and tokenize each sentence.

    Each segment reported by ``sentence_segment`` has its newlines replaced by
    spaces, runs of spaces collapsed to one, and surrounding whitespace
    stripped. Segments that end up empty are skipped.

    Returns:
        A 3-tuple ``(all_sentences, all_spans, normalized_text)`` where
        ``all_sentences`` is a list of token lists (one per sentence),
        ``all_spans`` is a parallel list of ``[start, end]`` character offsets
        for every token, and ``normalized_text`` is the cleaned sentences
        joined by a single newline. The offsets index into
        ``normalized_text``, not into the original ``text``.
    """
    word_tokenizer = nltk.tokenize.TreebankWordTokenizer()
    sentences_tokens = []
    sentences_spans = []
    normalized_lines = []
    # Offset of the current sentence inside the normalized text.
    offset = 0

    for sent_span in sentence_segment(text):
        raw_sentence = text[sent_span[0]:sent_span[1]]
        cleaned = re.sub(r'\ +', ' ', re.sub('\n', ' ', raw_sentence)).strip()
        if len(cleaned) == 0:
            continue

        token_spans = list(word_tokenizer.span_tokenize(cleaned))
        sentences_tokens.append([cleaned[begin:end] for begin, end in token_spans])
        sentences_spans.append([[offset + begin, offset + end] for begin, end in token_spans])

        # +1 accounts for the newline that separates sentences in the output.
        offset += len(cleaned) + 1
        normalized_lines.append(cleaned)

    return sentences_tokens, sentences_spans, '\n'.join(normalized_lines)

In [12]:
def build_display_elements(tokens, annotations, spans):
    """Merge per-token annotations into entity dicts for displacy.

    Args:
        tokens: list of sentences, each a list of tokens (only the sentence
            lengths are used).
        annotations: per-sentence lists of tags, parallel to ``tokens``. A tag
            is either ``'O'`` or a one-character prefix followed by one more
            character and the label (e.g. ``'B-problem'`` — assumed format).
        spans: per-sentence lists of ``[start, end]`` offsets, parallel to
            ``tokens``.

    Returns:
        A list of ``{'start', 'end', 'label'}`` dicts, one per run of
        consecutive tokens that carry the same label. A run never crosses a
        sentence boundary, and a tag whose prefix is not ``'I'`` always starts
        a new run.
    """
    entities = []

    for sent_idx, sent_tokens in enumerate(tokens):
        # State of the run currently being accumulated; 'O' means "no run".
        open_tag = 'O'
        open_start = None
        open_end = None

        for tok_idx in range(len(sent_tokens)):
            tag = annotations[sent_idx][tok_idx]
            tok_span = spans[sent_idx][tok_idx]

            if tag == open_tag:
                # Same tag as the open run: just extend it.
                open_end = tok_span[1]
                continue

            if open_tag != 'O':
                entities.append({'start': open_start,
                                 'end': open_end,
                                 'label': open_tag[2:]})

            # Normalise the prefix to 'I' so a continuation tag compares equal.
            open_tag = 'O' if tag == 'O' else 'I' + tag[1:]
            open_start = tok_span[0]
            open_end = tok_span[1]

        # Flush a run that reaches the end of the sentence.
        if open_tag != 'O':
            entities.append({'start': open_start,
                             'end': open_end,
                             'label': open_tag[2:]})

    return entities

In [27]:
import nltk
import re
import os

from spacy import displacy
from IPython.core.display import display, HTML
from simple_sentence_segment import sentence_segment

from elasticsearch import Elasticsearch
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd

In [14]:
#CCE_ASSETS = '/home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets'
# Switch to the package checkout before importing from it.
# NOTE(review): presumably the import below only resolves from this directory — confirm.
os.chdir('/home2/dalya/clinical_concept_extraction/')

from clinical_concept_extraction import clinical_concept_extraction
# Move into the working sub-directory; relative paths used later in the
# notebook (e.g. 'docs/...') are resolved against it.
os.chdir('/home2/dalya/clinical_concept_extraction/ukyoung')

### 2. Documents-100 Load

In [31]:
### 2-1. TEST Docs
# Directory holding the test documents; `path` ends with '/' because file
# names are appended to it by plain string concatenation later on.
path = "/home2/ukyoung/my-python/trec-2022/12.02/output_2/"
# Every entry of the directory except hidden ones (names starting with '.').
# The order is whatever os.listdir returns; it is not sorted here.
docs = [f for f in os.listdir(path) if not f.startswith('.')] 

In [32]:
docs

['NCT00811187.txt',
 'NCT00813215.txt',
 'NCT00818467.txt',
 'NCT00815750.txt',
 'NCT00819078.txt',
 'NCT00813410.txt',
 'NCT00812682.txt',
 'NCT00811889.txt',
 'NCT00818038.txt',
 'NCT00817713.txt',
 'NCT00812630.txt',
 'NCT00815139.txt',
 'NCT00814268.txt',
 'NCT00814762.txt',
 'NCT00811096.txt',
 'NCT00816387.txt',
 'NCT00818181.txt',
 'NCT00814372.txt',
 'NCT00816751.txt',
 'NCT00816153.txt',
 'NCT00815152.txt',
 'NCT00814359.txt',
 'NCT00816465.txt',
 'NCT00811954.txt',
 'NCT00811681.txt',
 'NCT00819793.txt',
 'NCT00811460.txt',
 'NCT00815386.txt',
 'NCT00810563.txt',
 'NCT00818493.txt',
 'NCT00817492.txt',
 'NCT00814853.txt',
 'NCT00811707.txt',
 'NCT00817011.txt',
 'NCT00818233.txt',
 'NCT00813748.txt',
 'NCT00819273.txt',
 'NCT00814619.txt',
 'NCT00812565.txt',
 'NCT00814307.txt',
 'NCT00817063.txt',
 'NCT00817843.txt',
 'NCT00812097.txt',
 'NCT00814827.txt',
 'NCT00814151.txt',
 'NCT00818090.txt',
 'NCT00816933.txt',
 'NCT00814593.txt',
 'NCT00814944.txt',
 'NCT00810810.txt',


### 3. ClinicalBert Term Extraction

In [42]:
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Text -> Term Lists
def get_clinicalTerms(text: str):
    """Extract problem / test / treatment terms from ``text``.

    The text is segmented and tokenized with ``parse_text``, tagged with
    ``clinical_concept_extraction`` and the tags are merged into entity spans
    with ``build_display_elements``. Each entity's surface string is sliced
    out of the normalized text returned by ``parse_text``.

    Args:
        text: raw input text.

    Returns:
        A 6-element list
        ``[len_problems, problems, len_tests, tests, len_treatments, treatments]``
        where each ``len_*`` is the number of terms found for that label and
        the following element is those terms joined by ``'_'``. Entities with
        any other label are ignored. Empty terms are dropped before counting,
        so the count always agrees with the joined string.
    """
    tokenized_sentences, all_spans, normalized_text = parse_text(text)
    all_annotations = clinical_concept_extraction(tokenized_sentences)
    entities = build_display_elements(tokenized_sentences, all_annotations, all_spans)

    # Insertion order fixes the order of the returned values.
    terms_by_label = {'problem': [], 'test': [], 'treatment': []}

    for entity in entities:
        bucket = terms_by_label.get(entity['label'])
        if bucket is None:
            continue
        # A separate name is used so the `text` parameter is not rebound.
        term = normalized_text[entity['start']:entity['end']]
        if term:
            bucket.append(term)

    result = []
    for label in ('problem', 'test', 'treatment'):
        terms = terms_by_label[label]
        result.append(len(terms))
        result.append('_'.join(terms))
    return result

In [None]:
import pandas as pd
def df_init():
    """Return an empty DataFrame with the clinical-term output columns.

    The columns are ``ncd_id`` followed, for each of the sections ``title``,
    ``summary``, ``incriteria`` and ``excriteria``, by
    ``<section>_len_problems``, ``<section>_problems``, ``<section>_len_tests``,
    ``<section>_tests``, ``<section>_len_treatments`` and
    ``<section>_treatments``.
    """
    # The 'desc' and 'condition' sections are intentionally left out; they
    # were disabled (commented out) in the original column list.
    sections = ('title', 'summary', 'incriteria', 'excriteria')
    fields = ('len_problems', 'problems',
              'len_tests', 'tests',
              'len_treatments', 'treatments')

    columns = ['ncd_id']
    for section in sections:
        for field in fields:
            columns.append(f'{section}_{field}')

    return pd.DataFrame({column: [] for column in columns})
df_output = df_init()

In [None]:
#import tensorflow as tf
#tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.Error)

#from nltk.tokenize import word_tokenize
import xml.etree.ElementTree as ET 
f_path = "/home2/dalya/TREC/CTs-processed-v1"
#f_path = "/home2/ukyoung/my-python/TREC/문서전처리_0712/CTs-processed-v1/"

# Number used in the file name of the next CSV batch.
csv_cnt = 2
# `current_docs` was initialised to a list, which made `current_docs += 1`
# raise TypeError and put '[]' in the file name; it is an integer counter now.
# NOTE(review): starting it from `csv_cnt` is an assumption — confirm the
# intended first batch number.
current_docs = csv_cnt

# NOTE(review): `len_total` and `f_list` are not defined in this cell; they
# must already exist in the kernel. Each `f_list` entry has its first and last
# character stripped before being appended to `f_path` — verify the entry format.
# range: documents up to index 50 were already processed, so resume from 50.
for idx in range(50, len_total + 1):
    
    # Every 100th index: write the rows collected so far and start a new batch.
    if idx % 100 == 0:
        df_output.to_csv(f'docs/clinicalTerms_docs_{current_docs}.csv', header=True, index=False)
        current_docs += 1
        df_output = df_init()
    
    #print(f"------IDX: {idx} Started.. ------")
    doc_path = f_path + f_list[idx][1:-1]
    # Only parsing needs the file handle; extraction runs after it is closed.
    with open(doc_path, 'r', encoding='UTF8') as i_f:
        doc = ET.parse(i_f)
        doc = doc.getroot()
        
    # Document ID
    ncd_id = str(doc.find('nct_id').text)
    
    # Titles
    title = str(doc.find('brief_title').text)
    title_len_problems, title_problems, title_len_tests, title_tests, title_len_treatments, title_treatments = get_clinicalTerms(title)
    
    # Summaries
    summary = str(doc.find('brief_summary').text)
    summary_len_problems, summary_problems, summary_len_tests, summary_tests, summary_len_treatments, summary_treatments = get_clinicalTerms(summary)
    
    # Descriptions
    #description = ""#str(doc.find('detailed_description').text)
    #desc_len_problems, desc_problems, desc_len_tests, desc_tests, desc_len_treatments, desc_treatments = get_clinicalTerms(description)
    
    # Conditions
    #condition = str(doc.find('condition').text)
    #condition_len_problems, condition_problems, condition_len_tests, condition_tests, condition_len_treatments, condition_treatments = get_clinicalTerms(condition)
    
    # Inclusion criteria
    in_criteria = str(doc.find('inclusion_criteria').text)
    incriteria_len_problems, incriteria_problems, incriteria_len_tests, incriteria_tests, incriteria_len_treatments, incriteria_treatments = get_clinicalTerms(in_criteria)
    
    # Exclusion criteria
    ex_criteria = str(doc.find('exclusion_criteria').text)
    excriteria_len_problems, excriteria_problems, excriteria_len_tests, excriteria_tests, excriteria_len_treatments, excriteria_treatments = get_clinicalTerms(ex_criteria)
    
    # Append one row; the value order must match the columns built by df_init().
    df_output.loc[len(df_output)] = [ncd_id, 
                         title_len_problems, title_problems, title_len_tests, title_tests, title_len_treatments, title_treatments,
                         summary_len_problems, summary_problems, summary_len_tests, summary_tests, summary_len_treatments, summary_treatments,
                         #desc_len_problems, desc_problems, desc_len_tests, desc_tests, desc_len_treatments, desc_treatments,
                         #condition_len_problems, condition_problems, condition_len_tests, condition_tests, condition_len_treatments, condition_treatments,
                         incriteria_len_problems, incriteria_problems, incriteria_len_tests, incriteria_tests, incriteria_len_treatments, incriteria_treatments,
                         excriteria_len_problems, excriteria_problems, excriteria_len_tests, excriteria_tests, excriteria_len_treatments, excriteria_treatments
                        ]

# Rows gathered after the last multiple-of-100 flush were previously never
# written; save them too. `df_output` is left intact for inspection.
if len(df_output) > 0:
    df_output.to_csv(f'docs/clinicalTerms_docs_{current_docs}.csv', header=True, index=False)
    

In [43]:
# Parallel result lists: one entry per processed document.
did_list = []
problems_list = []
tests_list = []
treatments_list = []

# Run the extraction pipeline on every file listed in `docs`.
# The body repeats the steps of get_clinicalTerms on whole-file text.
for i in tqdm(range(len(docs))):
#for i in range(1):
    
    with open(path + docs[i], "r") as f:
        text = f.read()
    
    tokenized_sentences, all_spans, normalized_text = parse_text(text)
    all_annotations = clinical_concept_extraction(tokenized_sentences)
    ent = build_display_elements(tokenized_sentences, all_annotations, all_spans)
    
    problems = []
    tests = []
    treatments = []

    # Entity offsets index into `normalized_text`. Note that `text` is rebound
    # here from the file content to the surface string of each entity.
    for e in ent:
        text = normalized_text[e['start']:e['end']]
        if e['label'] == 'problem':
            problems.append(text)
        elif e['label'] == 'test':
            tests.append(text)
        elif e['label'] == 'treatment':
            treatments.append(text)
    
    # Document id = file name without its last four characters ('.txt' for the
    # files listed above); terms of each kind are stored as one '_'-joined string.
    did_list.append(docs[i][:-4])
    problems_list.append('_'.join(problems))
    tests_list.append('_'.join(tests))
    treatments_list.append('_'.join(treatments))

  0%|          | 0/100 [00:00<?, ?it/s]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


  1%|          | 1/100 [00:25<42:37, 25.83s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


  2%|▏         | 2/100 [01:06<56:10, 34.39s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


  3%|▎         | 3/100 [01:38<54:10, 33.51s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


  4%|▍         | 4/100 [02:16<56:27, 35.29s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


  5%|▌         | 5/100 [03:19<1:11:41, 45.28s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


  6%|▌         | 6/100 [03:52<1:04:26, 41.13s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


  7%|▋         | 7/100 [04:24<58:58, 38.05s/it]  

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


  8%|▊         | 8/100 [05:35<1:14:16, 48.44s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


  9%|▉         | 9/100 [06:33<1:18:04, 51.47s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 10%|█         | 10/100 [07:18<1:14:32, 49.69s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 11%|█         | 11/100 [07:52<1:06:35, 44.89s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 12%|█▏        | 12/100 [08:44<1:08:34, 46.75s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 13%|█▎        | 13/100 [09:20<1:03:22, 43.71s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 14%|█▍        | 14/100 [10:14<1:07:09, 46.85s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 15%|█▌        | 15/100 [11:06<1:08:29, 48.35s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 16%|█▌        | 16/100 [11:34<59:13, 42.30s/it]  

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 17%|█▋        | 17/100 [11:59<51:12, 37.02s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 18%|█▊        | 18/100 [12:40<52:22, 38.32s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 19%|█▉        | 19/100 [13:07<46:45, 34.63s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 20%|██        | 20/100 [13:33<42:52, 32.15s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 21%|██        | 21/100 [14:30<52:13, 39.67s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 22%|██▏       | 22/100 [15:10<51:40, 39.75s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 23%|██▎       | 23/100 [15:42<47:52, 37.30s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 24%|██▍       | 24/100 [16:52<59:58, 47.35s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 25%|██▌       | 25/100 [18:01<1:07:01, 53.62s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 26%|██▌       | 26/100 [18:39<1:00:22, 48.96s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 27%|██▋       | 27/100 [19:10<53:13, 43.74s/it]  

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 28%|██▊       | 28/100 [19:47<49:53, 41.57s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 29%|██▉       | 29/100 [20:20<46:20, 39.16s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 30%|███       | 30/100 [21:05<47:28, 40.70s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 31%|███       | 31/100 [21:34<42:57, 37.35s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 32%|███▏      | 32/100 [23:02<59:25, 52.43s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 33%|███▎      | 33/100 [23:32<51:02, 45.71s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 34%|███▍      | 34/100 [24:03<45:37, 41.48s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 35%|███▌      | 35/100 [24:30<40:06, 37.02s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 36%|███▌      | 36/100 [24:58<36:43, 34.43s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 37%|███▋      | 37/100 [25:26<34:04, 32.45s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 38%|███▊      | 38/100 [26:05<35:27, 34.32s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 39%|███▉      | 39/100 [27:00<41:16, 40.60s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 40%|████      | 40/100 [27:35<38:55, 38.92s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 41%|████      | 41/100 [28:09<36:39, 37.28s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 42%|████▏     | 42/100 [29:07<42:15, 43.71s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 43%|████▎     | 43/100 [30:06<45:40, 48.09s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 44%|████▍     | 44/100 [31:21<52:32, 56.29s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 45%|████▌     | 45/100 [31:53<44:50, 48.92s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 46%|████▌     | 46/100 [32:27<40:03, 44.52s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 47%|████▋     | 47/100 [32:57<35:19, 39.99s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 48%|████▊     | 48/100 [33:32<33:36, 38.79s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 49%|████▉     | 49/100 [34:01<30:23, 35.75s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 50%|█████     | 50/100 [34:54<33:56, 40.74s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 51%|█████     | 51/100 [35:20<29:41, 36.36s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 52%|█████▏    | 52/100 [36:02<30:35, 38.24s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 53%|█████▎    | 53/100 [36:35<28:36, 36.52s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 54%|█████▍    | 54/100 [37:15<28:50, 37.63s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 55%|█████▌    | 55/100 [38:27<35:54, 47.88s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 56%|█████▌    | 56/100 [38:56<31:05, 42.40s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 57%|█████▋    | 57/100 [39:28<28:00, 39.08s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 58%|█████▊    | 58/100 [39:59<25:43, 36.76s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 59%|█████▉    | 59/100 [40:57<29:27, 43.12s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 60%|██████    | 60/100 [41:32<27:06, 40.65s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 61%|██████    | 61/100 [41:59<23:48, 36.62s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 62%|██████▏   | 62/100 [42:30<22:05, 34.88s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 63%|██████▎   | 63/100 [43:38<27:37, 44.81s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 64%|██████▍   | 64/100 [44:35<29:08, 48.57s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 65%|██████▌   | 65/100 [45:19<27:28, 47.11s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 66%|██████▌   | 66/100 [45:47<23:26, 41.36s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 67%|██████▋   | 67/100 [47:15<30:29, 55.44s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 68%|██████▊   | 68/100 [48:21<31:12, 58.51s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 69%|██████▉   | 69/100 [48:56<26:35, 51.46s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 70%|███████   | 70/100 [49:37<24:13, 48.46s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 71%|███████   | 71/100 [50:09<20:58, 43.40s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 72%|███████▏  | 72/100 [50:53<20:22, 43.67s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 73%|███████▎  | 73/100 [51:49<21:13, 47.18s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 74%|███████▍  | 74/100 [52:40<21:01, 48.50s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 75%|███████▌  | 75/100 [53:10<17:51, 42.85s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 76%|███████▌  | 76/100 [53:36<15:10, 37.93s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 77%|███████▋  | 77/100 [54:14<14:31, 37.88s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 78%|███████▊  | 78/100 [55:21<17:06, 46.68s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 79%|███████▉  | 79/100 [55:49<14:20, 40.96s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 80%|████████  | 80/100 [56:19<12:36, 37.83s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 81%|████████  | 81/100 [57:05<12:39, 40.00s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 82%|████████▏ | 82/100 [57:55<12:53, 42.99s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 83%|████████▎ | 83/100 [58:55<13:37, 48.11s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 84%|████████▍ | 84/100 [59:35<12:11, 45.70s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 85%|████████▌ | 85/100 [1:00:06<10:19, 41.28s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 86%|████████▌ | 86/100 [1:00:36<08:52, 38.07s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 87%|████████▋ | 87/100 [1:01:33<09:27, 43.67s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 88%|████████▊ | 88/100 [1:02:00<07:45, 38.79s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 89%|████████▉ | 89/100 [1:02:40<07:10, 39.16s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 90%|█████████ | 90/100 [1:03:12<06:08, 36.82s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 91%|█████████ | 91/100 [1:03:47<05:28, 36.50s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 92%|█████████▏| 92/100 [1:04:35<05:18, 39.79s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 93%|█████████▎| 93/100 [1:05:09<04:27, 38.20s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 94%|█████████▍| 94/100 [1:05:46<03:45, 37.59s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 95%|█████████▌| 95/100 [1:06:19<03:02, 36.48s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 96%|█████████▌| 96/100 [1:08:16<04:02, 60.54s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 97%|█████████▋| 97/100 [1:08:52<02:39, 53.18s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 98%|█████████▊| 98/100 [1:09:43<01:44, 52.34s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


 99%|█████████▉| 99/100 [1:10:28<00:50, 50.21s/it]

INFO:tensorflow:Restoring parameters from /home2/dalya/clinical_concept_extraction/clinical_concept_extraction/cce_assets/blstm/model


100%|██████████| 100/100 [1:10:59<00:00, 42.60s/it]


In [1]:
all_annotations

NameError: name 'all_annotations' is not defined

In [45]:
# Original (un-normalized) text of every document, in the same order as `docs`.
ori_texts = []

for i in tqdm(range(len(docs))):
    
    with open(path + docs[i], "r") as f:
        text = f.read()
        ori_texts.append(text)

100%|██████████| 100/100 [00:00<00:00, 17456.63it/s]


In [48]:
os.getcwd()

'/home2/dalya/clinical_concept_extraction/ukyoung'

In [63]:
def _count_terms(joined_terms):
    """Return how many '_'-separated terms ``joined_terms`` holds (0 if empty).

    ``''.split('_')`` yields ``['']``, so counting the split result directly
    reports one term for a document that has none.
    NOTE(review): a term that itself contains '_' is still counted as several.
    """
    return len(joined_terms.split('_')) if joined_terms else 0


# Derived from the collected results instead of a hard-coded 100.
total_length = len(did_list)

# One row per document: id, joined terms and their counts per category, and
# the original document text.
df_output = pd.DataFrame({'Did': did_list,
                          'Problem': problems_list,
                          'ProblemCount': [_count_terms(problems_list[i]) for i in range(total_length)],
                          'Test': tests_list,
                          'TestCount': [_count_terms(tests_list[i]) for i in range(total_length)],
                          'Treatment': treatments_list,
                          'TreatmentCount': [_count_terms(treatments_list[i]) for i in range(total_length)],
                          'Doc': ori_texts})

# Paths are relative to the current working directory.
df_output.to_csv('docs/clinical_100docs_with_origins.csv', header=True, index=False)
df_output.to_excel('docs/clinical_100docs_with_origins.xlsx', header=True, index=False)

In [7]:
# The empty list assigned here is immediately replaced by the next statement.
cbert_term_list = []

# Alias (not a copy) of the problem terms; test and treatment terms are not
# included because the combining loop below is commented out.
cbert_term_list = problems_list
#for i in range(len(topics)):
#    cbert_term_list.append(problems_list[i] + tests_list[i] + treatments_list[i])
#    problems_list

In [8]:
## GET Topics -> ClinicalBert -> Termlist
print(topics[0])
print(cbert_term_list[0])


A 19-year-old male came to clinic with some sexual concern.  He recently engaged in a relationship and is worried about the satisfaction of his girlfriend. He has a "baby face" according to his girlfriend's statement and he is not as muscular as his classmates.  On physical examination, there is some pubic hair and poorly developed secondary sexual characteristics. He is unable to detect coffee smell during the examination, but the visual acuity is normal. Ultrasound reveals the testes volume of 1-2 ml. The hormonal evaluation showed serum testosterone level of 65 ng/dL with low levels of GnRH.

['some pubic hair', 'poorly developed secondary sexual characteristics', 'coffee smell', 'low levels of GnRH']


In [10]:
def get_phrases(topic):
    """Return the phrases marked as ``$(phrase)`` inside ``topic``.

    A phrase may contain ASCII letters, digits, whitespace and hyphens; the
    surrounding ``$(`` and ``)`` are not part of the returned strings.

    The pattern is a raw string, so the backslashes are regex escapes rather
    than invalid string escapes. The letter range is ``A-Za-z``; the previous
    ``A-z`` range also accepted the punctuation that sits between ``Z`` and
    ``a`` in ASCII (``[ \\ ] ^ _ ` ``).

    Args:
        topic: text containing zero or more ``$(...)`` markers.

    Returns:
        List of phrase strings in order of appearance (empty if none).
    """
    # With a single capture group, findall returns the group contents directly.
    return re.findall(r"\$\(([A-Za-z\s\d\-]+)\)", topic)

# Phrase list extracted from each row's 'topics' value, in row order.
# NOTE(review): `df` is not defined in the visible cells, and `df['topics'][i]`
# looks rows up by label, so this presumably expects a default 0..n-1 index — confirm.
biobert_term_list = []

for i in range(len(df)):
    biobert_term_list.append(get_phrases(df['topics'][i]))

In [11]:
# Topic numbers start at 1 and follow the order of `biobert_term_list`; the
# range is sized from the list instead of being fixed at 50 topics.
df_output = pd.DataFrame({'Topic': range(1, len(biobert_term_list) + 1),
                          'Bio': biobert_term_list})

# Written relative to the current working directory.
df_output.to_csv('BioBert_terms_2022.csv', header=True, index=False)

In [31]:
print(topics[43])
print(cbert_term_list[43])
print(biobert_term_list[43])


A 48-year-old man comes to the office complaining of heartburn and acid reflux.  He has taken over-the-counter antacids but sees no relief.  Other medical history is unremarkable.  The patient does not use tobacco, alcohol, or illicit drugs.  Vital signs are within normal limits.  BMI is 31 kg/m2.  Physical examination is  positive for mild tenderness in upper stomach. Chest x-ray shows an air-fluid opacity behind the heart.  A barium swallow study reveals approximately 1/3 of the stomach herniating through the esophageal hiatus. 

['heartburn', 'acid reflux', 'mild tenderness in upper stomach', 'an air-fluid opacity behind the heart', 'the stomach']
['heartburn', 'antacids', 'tobacco', 'illicit drugs', 'BMI', 'stomach', 'x-ray', 'barium', 'stomach']


In [21]:
from elasticsearch import Elasticsearch

# Address of the Elasticsearch node used for term expansion (hard-coded).
urls = ["http://210.117.182.30:9200"]
# Shared client; request_timeout=60 is presumably in seconds — confirm
# against the installed client version.
es_host = Elasticsearch(urls, request_timeout=60)

# The list of fields to search
# (not referenced by the lookup function in the visible cells)
es_fields_for_cui = ["cui"]

# The name of index
es_index_for_cui = "idx-cui-terms"

In [22]:
def get_expanded_by_cui(text):
    """Look up expansion terms for ``text`` in the CUI index.

    Runs a ``match`` query on the ``terms`` field of ``es_index_for_cui`` and
    requests only the ``terms`` field of each hit's source.

    Args:
        text: query string to match.

    Returns:
        The ``terms`` value of the first hit, or a single space (``" "``) when
        the query returns no hits.
    """
    query_body = {
        "_source": ["terms"],
        "query": {"match": {"terms": text}},
    }

    response = es_host.search(index=es_index_for_cui, body=query_body)
    hits = response.get("hits").get("hits")

    # No match: callers receive a blank placeholder rather than None.
    if hits == []:
        return " "

    top_hit = hits[0]
    return top_hit.get('_source').get('terms')

In [23]:
get_expanded_by_cui("urinary retention")
#get_expanded_by_cui("MRI")



' urinary retention retention, urinary retention urinary bladder distention bladder retention of urine bladder urine retention retention of urine retention urine nos urine retention retention urine retention; urine urine; retention bladder retention retention;bladder bladder; retention bladder inability to empty inability to empty bladder unable to empty bladder unable to pass urine retention of urine, unspecified retention of urine unspecified cannot pass urine - retention not passing urine bladder retention of urine -retired- cannot pass urine - retention (& ) retention of urine unspecified (context-dependent category) retention of urine (context-dependent category) retention - symptom rndx urinary retention'