In [1]:
import json
import sys

import pandas as pd
from sklearn.model_selection import train_test_split
from nltk import ngrams

from collections import OrderedDict

In [2]:
import os
os.getcwd()

'/home/anjani/distant-PICO/CandidateGeneration/Ontologies'

In [3]:
import sys
sys.path.append('/home/anjani/distant-PICO/CandidateGeneration/')

In [4]:
outdir = '/mnt/nas2/results/Results/systematicReview/distant_pico/coverage_results/'

In [5]:
from ontologyLoader import *

The module loads all the ontologies and dictionaries that will be used for weak labeling in distant-PICO
The module maps CUIs to TUIs from the selected UMLS subset


In [6]:
#loading the english language small model of spacy
import spacy
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words
additional_stopwords = ['of']
stopwords.update(additional_stopwords)

import string
from collections import Counter

In [7]:
def removePunctNum(term):
    term = term.translate(str.maketrans(' ', ' ', string.punctuation))
    return ''.join([i for i in term if not i.isdigit()])

In [8]:
def readManuallyAnnoted( input_file_path, label_type=None ):

    nct_ids = []
    tokens = []
    labels = []
    pos = []

    with open(input_file_path, 'r', encoding='latin1') as NCT_ids_file:

        for i, eachLine in enumerate(NCT_ids_file):
            annot = json.loads(eachLine)

            for doc_key, document_annotations in annot.items():

                nct_ids.append(doc_key)
                tokens.append(document_annotations[0])
                labels.append(document_annotations[1])
                # TODO: Generate dummy POS items
                pos_i = [0] * len( document_annotations[0] )
                pos.append( pos_i )

    corpus_df = pd.DataFrame(
        {'ids': nct_ids,
        'tokens': tokens,
        'labels': labels,
        'pos': pos
        })

    return corpus_df

In [9]:
# umls_db = '/mnt/nas2/data/systematicReview/UMLS/english_subset/umls_preprocessed/umls_pre.db'
umls_db = '/mnt/nas2/data/systematicReview/UMLS/english_subset/umls_preprocessed/umls_tui_pio4_.db'


print('Retrieving UMLS ontology arm (Preprocessing applied)')
umls_p  = loadUMLSdb(umls_db, 'P')    
umls_i = loadUMLSdb(umls_db, 'I')
umls_o = loadUMLSdb(umls_db, 'O')

Retrieving UMLS ontology arm (Preprocessing applied)
This is the connection:  <sqlite3.Connection object at 0x7feffe43b990>
Dataframe size:  147
This is the connection:  <sqlite3.Connection object at 0x7feffe697c60>
Dataframe size:  147
This is the connection:  <sqlite3.Connection object at 0x7fedb58e8300>
Dataframe size:  147


In [10]:
ebm_nlp_path_p = '/mnt/nas2/data/systematicReview/clinical_trials_gov/Weak_PICO/groundtruth/ebm_nlp/p/sentences.txt'
ebm_nlp_p = readManuallyAnnoted( ebm_nlp_path_p, label_type=None )

In [11]:
ebm_nlp_path_i = '/mnt/nas2/data/systematicReview/clinical_trials_gov/Weak_PICO/groundtruth/ebm_nlp/i/sentences.txt'
ebm_nlp_i = readManuallyAnnoted( ebm_nlp_path_i, label_type=None )

In [12]:
ebm_nlp_path_o = '/mnt/nas2/data/systematicReview/clinical_trials_gov/Weak_PICO/groundtruth/ebm_nlp/o/sentences.txt'
ebm_nlp_O = readManuallyAnnoted( ebm_nlp_path_o, label_type=None )

In [13]:
# Prepare ngrams from training set
total_corpus_terms = 0
clean_train = []
for tokens in ebm_nlp_p.tokens.tolist():
    clean_t = []
    for t in tokens:
        if t not in stopwords: # Remove stopwords
            t = removePunctNum(t) # Remove punctuations
            if len(t) > 1:
                clean_t.append( t.lower() )
                total_corpus_terms = total_corpus_terms + 1
    clean_train.append(clean_t)
    
print('Total number of terms in the corpus: ', total_corpus_terms)

Total number of terms in the corpus:  661125


In [14]:
def document_counter(zipper):
    
    document_counts_dict = dict()
    flattened = []
    
    for eachOne in zipper:
        for one in eachOne:
            flattened.append( one )

    # Document ngram counter
    document_counts = Counter(flattened)
    document_counts_tuples = document_counts.most_common()
    
    for eachCount in document_counts_tuples:
        document_counts_dict[ eachCount[0] ] = eachCount[1]

    return document_counts_dict

In [15]:
clean_train_2grams = [ngrams(tokens, 2) for tokens in clean_train]
clean_train_3grams = [ngrams(tokens, 3) for tokens in clean_train]
clean_train_4grams = [ngrams(tokens, 4) for tokens in clean_train]
clean_train_5grams = [ngrams(tokens, 5) for tokens in clean_train]
clean_train_6grams = [ngrams(tokens, 6) for tokens in clean_train]
clean_train_7grams = [ngrams(tokens, 7) for tokens in clean_train]
clean_train_8grams = [ngrams(tokens, 8) for tokens in clean_train]

In [16]:
counts_1grams = document_counter(clean_train)
counts_2grams = document_counter(clean_train_2grams)
counts_3grams = document_counter(clean_train_3grams)
counts_4grams = document_counter(clean_train_4grams)
counts_5grams = document_counter(clean_train_5grams)
counts_6grams = document_counter(clean_train_6grams)
counts_7grams = document_counter(clean_train_7grams)
counts_8grams = document_counter(clean_train_8grams)

In [17]:
counts_merged = {**counts_1grams, **counts_2grams, **counts_3grams, **counts_4grams, **counts_5grams, **counts_6grams, **counts_7grams, **counts_8grams}

In [18]:
def flattenTuple(zipper):
    flattened = []

    for eachOne in zipper:
        flattened.append( eachOne )
            
    return flattened

In [19]:
def getNgrams(term):
    
    all_grams = []
    
    termLength = len( term.split() )
    term_ = term.split()
    term_ = [t.lower() for t in term_]
    
    if termLength <= 2:
        all_grams.append( tuple(term_) )
        
    if termLength > 2:

        for l in range(2, 9):
            grams = ngrams(term_, l)
            flattened = flattenTuple(grams)
            all_grams.extend( flattened )    
    
    return all_grams

In [20]:
terminology_coverage_p = dict()

for i, (terminologyName, terms) in enumerate(umls_p.items()):
    
    terminology_coverage = []
    
    for term, term_label in terms:
        
        term_counter = 0
        
        term_i = getNgrams(term)
        for t in term_i:
            if t in counts_merged and len( term_label ) == 1:
                term_count = counts_merged[t]
                term_counter = term_counter + term_count
            
            termFrequency = term_counter / total_corpus_terms
            terminology_coverage.append( termFrequency )

    terminology_coverage_p[terminologyName] = sum(terminology_coverage)

In [21]:
terminology_coverage_i = dict()

for i, (terminologyName, terms) in enumerate(umls_i.items()):
    
    terminology_coverage = []
    
    for term, term_label in terms:
        
        term_counter = 0
        
        term_i = getNgrams(term)
        for t in term_i:
            if t in counts_merged and len( term_label ) == 1:
                term_count = counts_merged[t]
                term_counter = term_counter + term_count
            
            termFrequency = term_counter / total_corpus_terms
            terminology_coverage.append( termFrequency )

    terminology_coverage_i[terminologyName] = sum(terminology_coverage)

In [22]:
terminology_coverage_o = dict()

for i, (terminologyName, terms) in enumerate(umls_o.items()):
    
    terminology_coverage = []
    
    for term, term_label in terms:
        
        term_counter = 0
        
        term_i = getNgrams(term)
        for t in term_i:
            if t in counts_merged and len( term_label ) == 1:
                term_count = counts_merged[t]
                term_counter = term_counter + term_count
            
            termFrequency = term_counter / total_corpus_terms 
            terminology_coverage.append( termFrequency )

    terminology_coverage_o[terminologyName] = sum(terminology_coverage)

In [23]:
tc_sorted_p = sorted(terminology_coverage_p.items(), key=lambda x: x[1], reverse=True)
tc_sorted_p = OrderedDict(tc_sorted_p)
print(tc_sorted_p)

OrderedDict([('MEDCIN', 547.4225479340062), ('ICD10PCS', 199.1878268158379), ('SNOMEDCT_US', 102.614003402653), ('LNC', 93.55885649391792), ('CPT', 52.29364038574635), ('NCI', 32.689863490274), ('HCPCS', 25.199620344108297), ('MTH', 22.07663754962556), ('NIC', 18.415484212513732), ('ICD10CM', 14.836342597820956), ('MDR', 9.025346190199668), ('HCPT', 7.206691624120196), ('RCD', 6.899646814135542), ('OMIM', 4.9980018907152335), ('FMA', 4.237773492148084), ('SNMI', 3.953645679715987), ('NCI_CDISC', 3.946926829268506), ('PDQ', 3.844752505199762), ('MTHICD9', 2.993391567405085), ('ICD9CM', 2.8425108716203566), ('NCI_CTRP', 2.2160211760257282), ('ICD10AM', 2.1081686519196063), ('ICPC2ICD10ENG', 2.022519190773542), ('MSH', 1.9068179239931526), ('UWDA', 1.8494883720946205), ('NCI_caDSR', 1.6435999243715655), ('NOC', 1.4671022877671533), ('UMD', 1.3957224428058157), ('CHV', 1.1807177160144637), ('MTHSPL', 1.1694823217999009), ('HPO', 0.9400294951787995), ('CCPSS', 0.8300397050482525), ('PCDS', 

In [24]:
import json
with open(f'{outdir}/participant_UMLS_v4_coverage.json', 'w+') as fp:
    json.dump(tc_sorted_p, fp)

In [25]:
tc_sorted_i = sorted(terminology_coverage_i.items(), key=lambda x: x[1], reverse=True)
tc_sorted_i = OrderedDict(tc_sorted_i)
print(tc_sorted_i)

OrderedDict([('MEDCIN', 531.9849378001024), ('ICD10PCS', 235.23591303401454), ('SNOMEDCT_US', 85.89745812003696), ('CPT', 80.14220155033259), ('HCPCS', 26.06397277367791), ('NCI', 24.20677935335842), ('LNC', 23.4351794289685), ('MTH', 18.659949328781853), ('NIC', 18.435105312910622), ('HCPT', 10.355519757976763), ('RCD', 5.416757799202713), ('MSH', 4.6020147475884805), ('OMIM', 4.3530951030449865), ('NCI_CDISC', 3.9066152391760465), ('SNMI', 3.5221206277196724), ('MMX', 2.5544972584605663), ('MDR', 2.3819232369071406), ('UMD', 2.192170920779517), ('RXNORM', 2.1411956891665027), ('HGNC', 1.988893174514712), ('ICD10CM', 1.7545033087540962), ('GS', 1.6694830780865384), ('NCI_caDSR', 1.5376456797127933), ('NOC', 1.3993148043108985), ('NDDF', 1.3215670259026364), ('PDQ', 1.2613045944412378), ('ICD10AM', 1.0910372471167704), ('CHV', 0.868971828323007), ('PCDS', 0.7521089052751626), ('MMSL', 0.7503089430894926), ('HPO', 0.7165180563433992), ('MTHSPL', 0.6770973719039862), ('MTHICD9', 0.646107

In [26]:
import json
with open(f'{outdir}/intervention_UMLS_v4_coverage.json', 'w+') as fp:
    json.dump(tc_sorted_i, fp)

In [27]:
tc_sorted_o = sorted(terminology_coverage_o.items(), key=lambda x: x[1], reverse=True)
tc_sorted_o = OrderedDict(tc_sorted_o)
print(tc_sorted_o)

OrderedDict([('MEDCIN', 546.2527343578905), ('ICD10PCS', 235.2290429262423), ('SNOMEDCT_US', 111.85807676198993), ('LNC', 95.52642541026417), ('CPT', 54.89698619782062), ('NCI', 22.78003857060615), ('MTH', 17.387137076927615), ('HCPCS', 16.472468897714457), ('ICD10CM', 12.919986386818751), ('HCPT', 7.8344307052354045), ('RCD', 5.677026280957083), ('MDR', 5.445480052938512), ('FMA', 5.312391756458588), ('OMIM', 4.75448969559449), ('SNMI', 4.6293106447336365), ('NCI_CDISC', 4.230929098128245), ('MSH', 2.6888803176419), ('MTHICD9', 2.64857931556127), ('ICD9CM', 2.5306182643223587), ('RXNORM', 2.136046889771533), ('ICD10AM', 2.1139769332582032), ('UWDA', 1.962819436567991), ('ICPC2ICD10ENG', 1.8606314993384272), ('MMX', 1.8455602193231453), ('NCI_caDSR', 1.5667264133108358), ('NOC', 1.4106681792400093), ('GO', 1.2892584609568092), ('PDQ', 1.257387029684195), ('NIC', 1.1915946303648888), ('CHV', 1.12374815655144), ('HPO', 0.8910750614484136), ('SNM', 0.6811782945737199), ('CCPSS', 0.6448750

In [28]:
import json
with open(f'{outdir}/outcome_UMLS_v4_coverage.json', 'w+') as fp:
    json.dump(tc_sorted_o, fp)