In [1]:
import json
import sys

import pandas as pd
from sklearn.model_selection import train_test_split
from nltk import ngrams

from ontologyLoader import *

The module loads all the ontologies and dictionaries that will be used for weak labeling in distant-PICO


In [2]:
#loading the english language small model of spacy
import spacy
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words
additional_stopwords = ['of']
stopwords.update(additional_stopwords)

import string
from collections import Counter

In [3]:
def removePunctNum(term):
    term = term.translate(str.maketrans(' ', ' ', string.punctuation))
    return ''.join([i for i in term if not i.isdigit()])

In [4]:
def readManuallyAnnoted( input_file_path, label_type=None ):

    nct_ids = []
    tokens = []
    labels = []
    pos = []

    with open(input_file_path, 'r', encoding='latin1') as NCT_ids_file:

        for i, eachLine in enumerate(NCT_ids_file):
            annot = json.loads(eachLine)

            for doc_key, document_annotations in annot.items():

                nct_ids.append(doc_key)
                tokens.append(document_annotations[0])
                labels.append(document_annotations[1])
                # TODO: Generate dummy POS items
                pos_i = [0] * len( document_annotations[0] )
                pos.append( pos_i )

    corpus_df = pd.DataFrame(
        {'ids': nct_ids,
        'tokens': tokens,
        'labels': labels,
        'pos': pos
        })

    return corpus_df

In [5]:
umls_p, umls_i, umls_o = loadUMLS()

49
58
70


In [6]:
ebm_nlp_path = '/mnt/nas2/data/systematicReview/clinical_trials_gov/Weak_PICO/groundtruth/ebm_nlp/p/sentences.txt'
ebm_nlp = readManuallyAnnoted( ebm_nlp_path, label_type=None )
train, validation = train_test_split(ebm_nlp, test_size=0.2)

In [9]:
# Prepare ngrams from training set
total_corpus_terms = 0
clean_train = []
for tokens in train.tokens.tolist():
    clean_t = []
    for t in tokens:
        if t not in stopwords:
            t = removePunctNum(t)
            if len(t) > 1:
                clean_t.append( t.lower() )
                total_corpus_terms = total_corpus_terms + 1
    clean_train.append(clean_t)
    
print('Total number of terms in the corpus: ', total_corpus_terms)

Total number of terms in the corpus:  529010


In [10]:
def document_counter(zipper):
    
    document_counts_dict = dict()
    flattened = []
    
    for eachOne in zipper:
        for one in eachOne:
            flattened.append( one )

    # Document ngram counter
    document_counts = Counter(flattened)
    document_counts_tuples = document_counts.most_common()
    
    for eachCount in document_counts_tuples:
        document_counts_dict[ eachCount[0] ] = eachCount[1]

    return document_counts_dict

In [11]:
clean_train_2grams = [ngrams(tokens, 2) for tokens in clean_train]
clean_train_3grams = [ngrams(tokens, 3) for tokens in clean_train]
clean_train_4grams = [ngrams(tokens, 4) for tokens in clean_train]
clean_train_5grams = [ngrams(tokens, 5) for tokens in clean_train]
clean_train_6grams = [ngrams(tokens, 6) for tokens in clean_train]
clean_train_7grams = [ngrams(tokens, 7) for tokens in clean_train]
clean_train_8grams = [ngrams(tokens, 8) for tokens in clean_train]

In [12]:
counts_1grams = document_counter(clean_train)
counts_2grams = document_counter(clean_train_2grams)
counts_3grams = document_counter(clean_train_3grams)
counts_4grams = document_counter(clean_train_4grams)
counts_5grams = document_counter(clean_train_5grams)
counts_6grams = document_counter(clean_train_6grams)
counts_7grams = document_counter(clean_train_7grams)
counts_8grams = document_counter(clean_train_8grams)

In [13]:
counts_merged = {**counts_1grams, **counts_2grams, **counts_3grams, **counts_4grams, **counts_5grams, **counts_6grams, **counts_7grams, **counts_8grams}

In [14]:
def flattenTuple(zipper):
    flattened = []

    for eachOne in zipper:
        flattened.append( eachOne )
            
    return flattened

In [32]:
def getNgrams(term):
    
    all_grams = []
    
    termLength = len( term.split() )
    term_ = term.split()
    term_ = [t.lower() for t in term_]
    
    if termLength <= 2:
        all_grams.append( tuple(term_) )
        
    if termLength > 2:

        for l in range(2, 9):
            grams = ngrams(term_, l)
            flattened = flattenTuple(grams)
            all_grams.extend( flattened )    
    
    return all_grams

In [33]:
terminology_coverage_p = dict()

for i, (terminologyName, terms) in enumerate(umls_p.items()):
    
    terminology_coverage = []
    
    for term in terms:
        
        term_counter = 0
        
        term_i = getNgrams(term)
        for t in term_i:
            if t in counts_merged:
                term_count = counts_merged[t]
                term_counter = term_counter + term_count
            
            termFrequency = term_counter / total_corpus_terms
            terminology_coverage.append( termFrequency )

    terminology_coverage_p[terminologyName] = sum(terminology_coverage)

In [34]:
terminology_coverage_i = dict()

for i, (terminologyName, terms) in enumerate(umls_i.items()):
    
    terminology_coverage = []
    
    for term in terms:
        
        term_counter = 0
        
        term_i = getNgrams(term)
        for t in term_i:
            if t in counts_merged:
                term_count = counts_merged[t]
                term_counter = term_counter + term_count
            
            termFrequency = term_counter / total_corpus_terms
            terminology_coverage.append( termFrequency )

    terminology_coverage_i[terminologyName] = sum(terminology_coverage)

In [35]:
terminology_coverage_o = dict()

for i, (terminologyName, terms) in enumerate(umls_o.items()):
    
    terminology_coverage = []
    
    for term in terms:
        
        term_counter = 0
        
        term_i = getNgrams(term)
        for t in term_i:
            if t in counts_merged:
                term_count = counts_merged[t]
                term_counter = term_counter + term_count
            
            termFrequency = term_counter / total_corpus_terms 
            terminology_coverage.append( termFrequency )

    terminology_coverage_o[terminologyName] = sum(terminology_coverage)

In [36]:
tc_sorted_p = sorted(terminology_coverage_p.items(), key=lambda x: x[1], reverse=True)
print(tc_sorted_p)

[('SNOMEDCT_US', 58.69971078037866), ('MEDCIN', 52.3303661557509), ('ICD10CM', 22.609272036603958), ('NCI', 16.952008468680056), ('RCD', 7.3105877015835325), ('MDR', 7.118389066377173), ('MTH', 6.025695166473324), ('MTHICD9', 5.6792007712572055), ('ICD9CM', 4.999722122461224), ('PDQ', 4.201128523092464), ('ICD10AM', 3.271979735735125), ('ICD10', 3.225339785639087), ('RCDSY', 2.7759097181550296), ('SNMI', 2.309515888172502), ('NCI_CTRP', 1.8345929188489662), ('MSH', 1.6103722046855462), ('OMIM', 1.1358310806981682), ('ICPC2ICD10ENG', 0.7269749154079655), ('CHV', 0.6226177198918335), ('RCDAE', 0.5957789077710693), ('CCPSS', 0.5804030169561744), ('LNC', 0.35895918791703346), ('HPO', 0.3565017674523902), ('ICD10AMAE', 0.29572786903838577), ('ICD10AE', 0.2363962874047746), ('NCI_NCI-GLOSS', 0.23219598873367092), ('NCI_NICHD', 0.22595414075347628), ('RCDSA', 0.22558930833065352), ('SNOMEDCT_VET', 0.20807168106462193), ('SNM', 0.17035027693237467), ('NCI_CELLOSAURUS', 0.1666830494697612), ('N

In [38]:
tc_sorted_i = sorted(terminology_coverage_i.items(), key=lambda x: x[1], reverse=True)
print(tc_sorted_i)

[('ICD10PCS', 304.004466820003), ('MTHSPL', 148.18494923950414), ('CPT', 96.90423243366982), ('RXNORM', 95.29885824324481), ('MEDCIN', 80.04924481501466), ('SNOMEDCT_US', 79.09295287348715), ('MMX', 43.973520349936685), ('NIC', 24.271355929050483), ('HCPCS', 23.915007277756004), ('MTH', 22.925163985597102), ('MMSL', 12.698230657326329), ('NDDF', 12.013219031836156), ('HCPT', 11.08683389729258), ('RCD', 5.435156235239952), ('MSH', 4.624052475387484), ('LNC', 4.585323528863873), ('SNMI', 4.062601841180492), ('NCI', 3.8540897147524915), ('GS', 3.2132568382479225), ('ICD10AM', 1.597971682956563), ('PCDS', 1.5710648191914018), ('PDQ', 1.2614903309957244), ('MTHICD9', 0.7805542428308683), ('RCDAE', 0.5878962590498545), ('NCI_CTRP', 0.5494508610423023), ('DRUGBANK', 0.5270788831968807), ('MDR', 0.40784862289935664), ('CHV', 0.3902459310787843), ('ALT', 0.3897638986030204), ('ICD9CM', 0.36188540859337304), ('NCI_CDISC', 0.3421201867639542), ('MED-RT', 0.29206442222263074), ('VANDF', 0.28286988

In [37]:
tc_sorted_o = sorted(terminology_coverage_o.items(), key=lambda x: x[1], reverse=True)
print(tc_sorted_o)

[('MEDCIN', 640.5954878288453), ('LNC', 134.6945142699858), ('SNOMEDCT_US', 86.00480520145268), ('CPT', 68.11327196082932), ('ICD10PCS', 54.903992363361645), ('HCPCS', 28.498308160547886), ('MTH', 25.34276289677039), ('NCI', 25.181155365721093), ('NIC', 21.20653484814635), ('HCPT', 9.128228199848984), ('RCD', 7.44609931760884), ('ICD10CM', 6.546023704678359), ('OMIM', 4.169803973465156), ('GO', 3.951496191000554), ('NCI_caDSR', 3.351749494340747), ('UMD', 2.864989319675145), ('NCI_CDISC', 2.5719249163546203), ('SNMI', 2.2823576113894286), ('MDR', 2.151802423396248), ('GS', 2.115169845560748), ('NOC', 1.6692727925755138), ('ICD9CM', 1.5548193795965348), ('HPO', 1.3338348991521383), ('ICD10AM', 1.2248123853994415), ('RCDSY', 1.1481767830477192), ('ICD10', 1.1424869095102976), ('MTHICD9', 0.7849454641688404), ('CHV', 0.7397667340881537), ('NCI_CTRP', 0.652613372147914), ('NANDA-I', 0.6364019583750485), ('NDDF', 0.6184552276894429), ('MMSL', 0.5364000680516262), ('MSH', 0.5203171962722485)