In [1]:
import json
import sys

import pandas as pd
from sklearn.model_selection import train_test_split
from nltk import ngrams

from ontologyLoader import *

The module loads all the ontologies and dictionaries that will be used for weak labeling in distant-PICO


In [2]:
#loading the english language small model of spacy
import spacy
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words
additional_stopwords = ['of']
stopwords.update(additional_stopwords)

import string
from collections import Counter

In [3]:
def removePunctNum(term):
    term = term.translate(str.maketrans(' ', ' ', string.punctuation))
    return ''.join([i for i in term if not i.isdigit()])

In [4]:
def readManuallyAnnoted( input_file_path, label_type=None ):

    nct_ids = []
    tokens = []
    labels = []
    pos = []

    with open(input_file_path, 'r', encoding='latin1') as NCT_ids_file:

        for i, eachLine in enumerate(NCT_ids_file):
            annot = json.loads(eachLine)

            for doc_key, document_annotations in annot.items():

                nct_ids.append(doc_key)
                tokens.append(document_annotations[0])
                labels.append(document_annotations[1])
                # TODO: Generate dummy POS items
                pos_i = [0] * len( document_annotations[0] )
                pos.append( pos_i )

    corpus_df = pd.DataFrame(
        {'ids': nct_ids,
        'tokens': tokens,
        'labels': labels,
        'pos': pos
        })

    return corpus_df

In [5]:
umls_p, umls_i, umls_o = loadUMLS()

49
58
70


In [6]:
ebm_nlp_path = '/mnt/nas2/data/systematicReview/clinical_trials_gov/Weak_PICO/groundtruth/ebm_nlp/p/sentences.txt'
ebm_nlp = readManuallyAnnoted( ebm_nlp_path, label_type=None )
train, validation = train_test_split(ebm_nlp, test_size=0.2)

In [9]:
# Prepare ngrams from training set
total_corpus_terms = 0
clean_train = []
for tokens in train.tokens.tolist():
    clean_t = []
    for t in tokens:
        if t not in stopwords:
            t = removePunctNum(t)
            if len(t) > 1:
                clean_t.append( t.lower() )
                total_corpus_terms = total_corpus_terms + 1
    clean_train.append(clean_t)
    
print('Total number of terms in the corpus: ', total_corpus_terms)

Total number of terms in the corpus:  529010


In [10]:
def document_counter(zipper):
    
    document_counts_dict = dict()
    flattened = []
    
    for eachOne in zipper:
        for one in eachOne:
            flattened.append( one )

    # Document ngram counter
    document_counts = Counter(flattened)
    document_counts_tuples = document_counts.most_common()
    
    for eachCount in document_counts_tuples:
        document_counts_dict[ eachCount[0] ] = eachCount[1]

    return document_counts_dict

In [11]:
clean_train_2grams = [ngrams(tokens, 2) for tokens in clean_train]
clean_train_3grams = [ngrams(tokens, 3) for tokens in clean_train]
clean_train_4grams = [ngrams(tokens, 4) for tokens in clean_train]
clean_train_5grams = [ngrams(tokens, 5) for tokens in clean_train]
clean_train_6grams = [ngrams(tokens, 6) for tokens in clean_train]
clean_train_7grams = [ngrams(tokens, 7) for tokens in clean_train]
clean_train_8grams = [ngrams(tokens, 8) for tokens in clean_train]

In [12]:
counts_1grams = document_counter(clean_train)
counts_2grams = document_counter(clean_train_2grams)
counts_3grams = document_counter(clean_train_3grams)
counts_4grams = document_counter(clean_train_4grams)
counts_5grams = document_counter(clean_train_5grams)
counts_6grams = document_counter(clean_train_6grams)
counts_7grams = document_counter(clean_train_7grams)
counts_8grams = document_counter(clean_train_8grams)

In [13]:
counts_merged = {**counts_1grams, **counts_2grams, **counts_3grams, **counts_4grams, **counts_5grams, **counts_6grams, **counts_7grams, **counts_8grams}

In [14]:
def flattenTuple(zipper):
    flattened = []

    for eachOne in zipper:
        flattened.append( eachOne )
            
    return flattened

In [15]:
def getNgrams(term):
    
    all_grams = []
    
    termLength = len( term.split() )
    term_ = term.split()
    term_ = [t.lower() for t in term_]
    
    if termLength <= 2:
        all_grams.append( tuple(term_) )
        
    if termLength > 2:

        for l in range(3, 9):
            grams = ngrams(term_, l)
            flattened = flattenTuple(grams)
            all_grams.extend( flattened )    
    
    return all_grams

In [16]:
terminology_coverage_p = dict()

for i, (terminologyName, terms) in enumerate(umls_p.items()):
    
    terminology_coverage = []
    
    for term in terms:
        
        term_counter = 0
        
        term_i = getNgrams(term)
        for t in term_i:
            if t in counts_merged:
                term_count = counts_merged[t]
                term_counter = term_counter + term_count
            
        termFrequency = term_counter / total_corpus_terms
        terminology_coverage.append( termFrequency )

    terminology_coverage_p[terminologyName] = sum(terminology_coverage)

In [17]:
terminology_coverage_i = dict()

for i, (terminologyName, terms) in enumerate(umls_i.items()):
    
    terminology_coverage = []
    
    for term in terms:
        
        term_counter = 0
        
        term_i = getNgrams(term)
        for t in term_i:
            if t in counts_merged:
                term_count = counts_merged[t]
                term_counter = term_counter + term_count
            
        termFrequency = term_counter / total_corpus_terms
        terminology_coverage.append( termFrequency )

    terminology_coverage_i[terminologyName] = sum(terminology_coverage)

In [None]:
terminology_coverage_o = dict()

for i, (terminologyName, terms) in enumerate(umls_o.items()):
    
    terminology_coverage = []
    
    for term in terms:
        
        term_counter = 0
        
        term_i = getNgrams(term)
        for t in term_i:
            if t in counts_merged:
                term_count = counts_merged[t]
                term_counter = term_counter + term_count
            
        termFrequency = term_counter / total_corpus_terms 
        terminology_coverage.append( termFrequency )

    terminology_coverage_o[terminologyName] = sum(terminology_coverage)

In [None]:
sort_orders = sorted(terminology_coverage_o.items(), key=lambda x: x[1], reverse=True)