# Inspect the results of candidate generation process

In [357]:
#!/usr/bin/env python

import sys, json, os
import logging
import datetime as dt
import time
import random 

from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Search,  Q

import difflib, re
import nltk
from nltk.tokenize import WhitespaceTokenizer, sent_tokenize, word_tokenize

from collections import Counter
from collections import defaultdict
import collections
import numpy as np
import pandas as pd
from itertools import islice
import ast
import json
import more_itertools as mit

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import *



This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/home/anjani/anaconda3/envs/systreviewclassifi/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/anjani/anaconda3/envs/systreviewclassifi/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/anjani/anaconda3/envs/systreviewclassifi/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/anjani/anaconda3/envs/systreviewclassifi/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/anjani/anaconda3/envs/systreviewclassifi/lib/python3.6/site-packages/ipykernel/ker

In [109]:
def getInterventionNames(protocol_section):
    # Check if the protocol section has interventions list
    if 'ArmsInterventionsModule' in protocol_section:
        if 'InterventionList' in protocol_section['ArmsInterventionsModule']:
            if 'Intervention' in protocol_section['ArmsInterventionsModule']['InterventionList']:
                intervention = protocol_section['ArmsInterventionsModule']['InterventionList']['Intervention']
                # print(intervention)
                for eachIntervention in intervention:
                    yield eachIntervention

def getArmsGroups(protocol_section):
    # Check if the protocol section has arm groups list
    if 'ArmsInterventionsModule' in protocol_section:
        if 'ArmGroupList' in protocol_section['ArmsInterventionsModule']:
            if 'ArmGroup' in protocol_section['ArmsInterventionsModule']['ArmGroupList']:
                armGroup = protocol_section['ArmsInterventionsModule']['ArmGroupList']['ArmGroup']
                return armGroup
            
            
################################################################################
# Library imports
################################################################################
import re

################################################################################
# Preprocessing functions
################################################################################
# Lower case source and targets
def lowercaseString(s):
    return s.lower()

def lowercaseDict(s):
        #return dict((k, v.lower()) for k,v in s.items())
        return dict(eval(repr(s).lower()))

# Remove punctuations
def removeHyphenString(s):
    return s.replace('-', ' ')

def removeSpaceTrailsString(s):
    return " ".join(s.split())


def clean_unicide(s):
    s_encode = s.encode("ascii", "ignore")
    return s_encode.decode()

# Normalize symbols like + (plus)

def preprocess_targets(s):

    if isinstance(s, str):
        modified_s = lowercaseString(s)

    modified_s = removeSpaceTrailsString(modified_s)
    modified_s = removeHyphenString(modified_s) 
    modified_s = clean_unicide(modified_s)    

    return modified_s

In [115]:
def partMatchScore(s, interventionName):
    match_score = sum( n for i,j,n in s.get_matching_blocks() ) / float(len(interventionName)) # if this = 1.0, then entire Source text(intervention name) matched with the Target text
    return match_score

def fullMatchScore(s, interventionName, briefTitleTarget):
    # match_score = sum( n for i,j,n in s.get_matching_blocks() ) / float(len(interventionName))
    all_match_scores = []
    # Return the most confident block (most confident block is block / len(interventionName) == 1.0)
    for eachMatchingBlock in s.get_matching_blocks()[:-1]: # Last block is a dummy block
        match_score = eachMatchingBlock[2] / float(len(interventionName))
        score_block = [match_score, eachMatchingBlock]
        all_match_scores.append( score_block )
        # yield score_block
    return all_match_scores

In [113]:
def extract1Annotation(source, target, match):
        
    token = list()
    annot = list()
    
    span_generator = WhitespaceTokenizer().span_tokenize(target)

    annotation_start_position = match[1][0]
    annotation_stop_position = match[1][0] + match[1][2]

    annotation = [0] * len(target)
    for n, i in enumerate(annotation):
        if n >= annotation_start_position and n <= annotation_stop_position:
            annotation[n] = 1

    for span in span_generator:
        token_ = target[span[0]:span[1]]
        annot_ = annotation[span[0]:span[1]]

        if len(set(annot_)) > 1 and len(annot_) <= 2:
            max_element = 0
        else:
            max_element_i = Counter(annot_)
            max_element = max_element_i.most_common(1)[0][0]

        token.append(token_)
        annot.append(max_element)

    # Check if the number of annotations match number of tokens present in the sentence
    assert len(token) == len(annot)
        
    return token, annot


def extract09Annotation(source, target, match):
      
    token = list()
    annot = list()
    
    span_generator = WhitespaceTokenizer().span_tokenize(target)

    annotation_start_position = match[1][0]
    annotation_stop_position = match[1][0] + match[1][2] # example: Match(a=171, b=2, size=23)

    annotation = [0] * len(target)
    for n, i in enumerate(annotation):
        if n >= annotation_start_position and n <= annotation_stop_position:
            annotation[n] = 1

    for span in span_generator:

        token_ = target[span[0]:span[1]]
        annot_ = annotation[span[0]:span[1]]
        
        if len(set(annot_)) > 1 and len(annot_) <= 2:
            max_element = 0
        else:
            max_element_i = Counter(annot_)
            max_element = max_element_i.most_common(1)[0][0]            

        token.append(token_)
        annot.append(max_element)

    # Check if the number of annotations match number of tokens present in the sentence
    assert len(token) == len(annot)
        
    return token, annot

In [110]:
def align_highconf_shorttarget(target, source):
    annot = list() # Get's updated for each Intervention name identified
    token = list()

    if target is not None:
        # Match the source to the target
        s = difflib.SequenceMatcher(None, target, source, autojunk=True)
        matches = fullMatchScore(s, source, target)
        for match in matches:
            if match[0] == 1.0:                    
                token, annot = extract1Annotation(source, target, match)

    assert len(token) == len(annot)
    return token, annot

# Collects annotations for all the sentences in the target and returns them
def align_highconf_longtarget(target, source):

    target_sentences = list()
   
    if target is not None :
        # Sentence tokenization
        target_sentences = sent_tokenize(target)
        collect_annotations = dict()
        
        # Iterate each sentence
        for i, eachSentence in enumerate(target_sentences):

            annot = list() # Get's updated for each Intervention name identified and for each sentence
            token = list()

            s = difflib.SequenceMatcher(None, eachSentence, source, autojunk=True)
            matches = fullMatchScore(s, source, target)
            match_scores = [item[0] for item in matches ]

            if 1.0 in match_scores or True in list(all(i >= 0.9 and i < 1.0 for i in match_scores)):
                for match in matches:
                    if match[0] == 1.0:         
                        token_i, annot_i = extract1Annotation(source, eachSentence, match)
                        annot.extend( annot_i )
                        token.extend( token_i )
                    #if match[0] >= 0.9 and match[0] < 1.0:
                    #    token_i, annot_i = extract09Annotation(source, eachSentence, match)
                    #    annot.extend( annot_i )
                    #    token.extend( token_i )
                
            if annot:
                token_annot = [token, annot, [eachTuple[1]  for eachTuple in nltk.pos_tag_sents([token])[0]]]
                collect_annotations['sentence' + str(i)] = token_annot

    assert len(token) == len(annot)

    return collect_annotations


def align_highconf_longtarget_negSent(target, source):

    target_sentences = list()
   
    if target is not None :
        # Sentence tokenization
        target_sentences = sent_tokenize(target)
        collect_annotations = dict()
        
        # Check if each sentence has the source intervention in it
        for i, eachSentence in enumerate(target_sentences):

            annot = list() # Get's updated for each Intervention name identified and for each sentence
            token = list()

            s = difflib.SequenceMatcher(None, eachSentence, source, autojunk=True)
            matches = fullMatchScore(s, source, target)
            match_scores = [item[0] for item in matches ]

            if 1.0 in match_scores or True in list(all(i >= 0.9 and i < 1.0 for i in match_scores)):
                for match in matches:
                    if match[0] == 1.0:         
                        token_i, annot_i = extract1Annotation(source, eachSentence, match)
                        annot.extend( annot_i )
                        token.extend( token_i )
                    #if match[0] >= 0.9 and match[0] < 1.0:
                    #    token_i, annot_i = extract09Annotation(source, eachSentence, match)
                    #    annot.extend( annot_i )
                    #    token.extend( token_i )

            if annot:
                token_annot = [token, annot, [eachTuple[1]  for eachTuple in nltk.pos_tag_sents([token])[0]]]
                # collect_annotations['sentence' + str(i)] = token_annot
                collect_annotations[str(i)] = token_annot

            if 1.0 not in match_scores and all( list(all(i <= 0.20 for i in match_scores)) ): # very negative sentences (possibility of false negative candidates)
                tokenized_negative_sentence_i = eachSentence.split(' ')
                annotation_negative_sentence_i = [0] * len(tokenized_negative_sentence_i)
                annot.extend( annotation_negative_sentence_i )
                token.extend( tokenized_negative_sentence_i )

            if annot:
                token_annot = [token, annot, [eachTuple[1]  for eachTuple in nltk.pos_tag_sents([token])[0]]]
                # collect_annotations['sentence' + str(i)] = token_annot
                collect_annotations[str(i)] = token_annot

    assert len(token) == len(annot)
    return collect_annotations

In [111]:
################################################################################
# Instantiate ElasticSearch
################################################################################
es = Elasticsearch( [{u'host': u'127.0.0.1', u'port': b'9200'}] )

In [85]:
################################################################################
# Get all the documents from the index
################################################################################
# Scan all of the CTO index
results_gen = helpers.scan(
    es,
    query={"query": {"match_all": {}}},
    index='ctofull-index',
    size=1000,
    scroll="60m",
)

## Confidence 0.9 and above

### Question 1: Number of terms, class and class distribution identified from the sources

In [86]:
intervention_terms = []
intervention_class = []

res = es.search(index="ctofull-index", body={"query": {"match_all": {}}}, size=100)
print('Total number of records retrieved: ', res['hits']['total']['value'])
for hit in results_gen: # XXX: Entire CTO
# for n, hit in enumerate( res['hits']['hits'] ): # XXX: Only a part search results from the CTO
    fullstudy = hit['_source']['FullStudiesResponse']['FullStudies'][0]['Study']
    NCT_id = hit['_source']['FullStudiesResponse']['Expression']
    
    try:

        protocol_section = fullstudy['ProtocolSection']
        derieved_section = fullstudy['DerivedSection']
        
        ################################################################################
        # Get and preprocess sources
        ################################################################################
        # Source 1: Interventions
        interventionSource = getInterventionNames(protocol_section)
        
        # Source 2: Arms Groups
        armGroup = getArmsGroups(protocol_section)
        
        # XXX Each individual intervention term is iterated here
        for eachInterventionSource in interventionSource:
                       
            # Lower case the intervention source dictionary
            eachInterventionSource = lowercaseDict(eachInterventionSource)
            
            if 'interventionname' in eachInterventionSource:
                intervention_terms.append(eachInterventionSource['interventionname'])
                
            if 'interventionothernamelist' in eachInterventionSource:
                names = eachInterventionSource['interventionothernamelist']['interventionothername']
                for eachName in names:
                    intervention_terms.append(eachName)
                    
            intervention_class.append(eachInterventionSource['interventiontype'])

    except:
        logNCTID = 'Caused exception at the NCT ID: ' + NCT_id

print('Total number of intervention terms retrieved: ', len(intervention_terms) )
print('Total number of unique intervention terms retrieved: ', len(set(intervention_terms)) )
print('Total number of classes retrieved: ', len(intervention_class))

class_dist = Counter(intervention_class)
print(class_dist)

Total number of records retrieved:  10000
Total number of intervention terms retrieved:  933922
Total number of unique intervention terms retrieved:  371908
Total number of classes retrieved:  619174
Counter({'drug': 289246, 'other': 92439, 'device': 58889, 'procedure': 52662, 'behavioral': 52117, 'biological': 32389, 'dietary supplement': 18343, 'diagnostic test': 9679, 'radiation': 8877, 'genetic': 2961, 'combination product': 1572})


#####  'Counter({'drug': 289246, 'other': 92439, 'device': 58889, 'procedure': 52662, 'behavioral': 52117, 'biological': 32389, 'dietary supplement': 18343, 'diagnostic test': 9679, 'radiation': 8877, 'genetic': 2961, 'combination product': 1572})'

In [87]:
labels = []
sizes = []

for eachIn in class_dist.most_common():
    percent = (eachIn[1]/ len(intervention_class)) * 100
    print( eachIn[0] , ' : ',  percent)
    labels.append( eachIn[0] )
    sizes.append( percent )

drug  :  46.71481683662428
other  :  14.929405950508256
device  :  9.510896775381395
procedure  :  8.505202091819102
behavioral  :  8.417181600002584
biological  :  5.231001301734246
dietary supplement  :  2.9624951952116856
diagnostic test  :  1.5632116335634247
radiation  :  1.4336842309270093
genetic  :  0.4782177546214796
combination product  :  0.25388662960654035


In [88]:
import pandas as pd
df = pd.DataFrame(list(zip(labels, sizes)), columns =['labels', 'sizes'])

In [89]:
df

Unnamed: 0,labels,sizes
0,drug,46.714817
1,other,14.929406
2,device,9.510897
3,procedure,8.505202
4,behavioral,8.417182
5,biological,5.231001
6,dietary supplement,2.962495
7,diagnostic test,1.563212
8,radiation,1.433684
9,genetic,0.478218


### Question 2: Number of terms, class and class distribution unmapped to targets from the sources

In [7]:
retrieved = '/home/anjani/distant-cto/ResultInspection/candidategeneration/data/retrieved_intervention.csv'
mapped = '/home/anjani/distant-cto/ResultInspection/candidategeneration/data/mapped_interventions.csv'

In [60]:
def getData(input_file):
    intclass_d = dict()

    with open(input_file, 'r') as inpf:
        inpf.__next__()
        for eachLine in inpf:
            if len(eachLine) > 5:
                value = eachLine.split('\t')[1]
                key = eachLine.split('\t')[2]
                key = key.rstrip('\n')
                if 'class' not in key:
                    if key not in intclass_d:
                        intclass_d[key] = [value.replace('-', '')]
                    if key in intclass_d: 
                        intclass_d[key].append(value.replace('-', ''))
    return intclass_d

In [61]:
ret_d = getData(retrieved)
map_d = getData(mapped)

#### Most unmapped

In [80]:
sizes = []
labels = []

for key, value in map_d.items():
    values = map_d[key]
    unmapped_list = np.setdiff1d(ret_d[key],values)
    dist_unmapped = ( len(unmapped_list) / len(ret_d[key]) ) * 100
    sizes.append( dist_unmapped )
    labels.append( key )

In [83]:
df = pd.DataFrame(
    {'percentage': sizes,
     'interventions': labels
    })

df = df.sort_values(by=['percentage'])

In [238]:
import plotly.express as px

fig = px.bar(df.tail(5), x='interventions', y='percentage', title='Distribution of unmapped intervention mentions by class', width = 600,height=500)
fig.update_yaxes(automargin=True)
fig.update_layout(barmode='group', xaxis_tickangle=45)
fig.show()

#### Most mapped

In [85]:
sizes = []
labels = []

for key, value in map_d.items():
    values = map_d[key]
    unmapped_list = np.setdiff1d(ret_d[key],values)
    dist_mapped = ( len(values) / len(ret_d[key]) ) * 100
    sizes.append( dist_mapped )
    labels.append( key )

In [86]:
df_mapped = pd.DataFrame(
    {'percentage': sizes,
     'interventions': labels
    })

df_mapped = df_mapped.sort_values(by=['percentage'])

In [237]:
import plotly.express as px

fig = px.bar(df_mapped.tail(5), x='interventions', y='percentage', title='Distribution of mapped intervention mentions by class', width = 600,height=500)
#fig.update_traces(width=1)
fig.update_yaxes(automargin=True)
fig.update_layout(barmode='group', xaxis_tickangle=45)
fig.show()

### Question 3: length disribution of mapped and unmapped entities Intervention class-wise

In [193]:
mapped_lengths_dict = dict()
unmapped_lengths_dict = dict()
retrieved_lengths_dict = dict()

for key, value in map_d.items():
    values = map_d[key]
    unmapped_list = np.setdiff1d(ret_d[key],values) # Get the unmapped here...
    retrieved_list = ret_d[key]
    
    for eachInt in unmapped_list:
        if 'class' not in key:
            if key not in unmapped_lengths_dict:
                unmapped_lengths_dict[key] = [len(eachInt.split(' '))]
            if key in unmapped_lengths_dict: 
                unmapped_lengths_dict[key].append(len(eachInt.split(' ')))

    for eachInt in values:
        if 'class' not in key:
            if key not in mapped_lengths_dict:
                mapped_lengths_dict[key] = [len(eachInt.split(' '))]
            if key in mapped_lengths_dict: 
                mapped_lengths_dict[key].append(len(eachInt.split(' ')))
                
    for eachInt in retrieved_list:
        if 'class' not in key:
            if key not in retrieved_lengths_dict:
                retrieved_lengths_dict[key] = [len(eachInt.replace('-', ' ').split(' '))]
            if key in retrieved_lengths_dict: 
                retrieved_lengths_dict[key].append(len(eachInt.replace('-', ' ').split(' ')))

In [194]:
for key, value in retrieved_lengths_dict.items():
    lengths = Counter(value)
    most_common = lengths.most_common(1)
    print(key, ' : ', most_common)

drug  :  [(1, 269442)]
other  :  [(2, 45958)]
device  :  [(2, 27283)]
behavioral  :  [(2, 26790)]
dietary supplement  :  [(2, 9581)]
procedure  :  [(2, 24360)]
biological  :  [(1, 23472)]
diagnostic test  :  [(2, 5341)]
radiation  :  [(2, 2913)]
combination product  :  [(3, 617)]
genetic  :  [(2, 1382)]


In [188]:
for key, value in mapped_lengths_dict.items():
    lengths = Counter(value)
    most_common = lengths.most_common(1)
    print(key, ' : ', most_common)

drug  :  [(1, 57354)]
other  :  [(2, 3728)]
device  :  [(1, 3068)]
behavioral  :  [(2, 2404)]
dietary supplement  :  [(1, 1649)]
procedure  :  [(2, 2614)]
biological  :  [(1, 3705)]
diagnostic test  :  [(2, 248)]
radiation  :  [(1, 348)]
combination product  :  [(1, 53)]
genetic  :  [(1, 54)]


In [190]:
for key, value in unmapped_lengths_dict.items():
    lengths = Counter(value)
    most_common = lengths.most_common(1)
    print(key, ' : ', most_common)

drug  :  [(3, 12020)]
other  :  [(2, 6945)]
device  :  [(3, 5629)]
behavioral  :  [(3, 4859)]
dietary supplement  :  [(2, 1498)]
procedure  :  [(3, 3882)]
biological  :  [(1, 2054)]
diagnostic test  :  [(2, 1109)]
radiation  :  [(3, 417)]
combination product  :  [(3, 161)]
genetic  :  [(2, 176)]


### Question 4: length disribution of mapped and unmapped entities

In [123]:
# df = unmapped
# df_mapped = mapped

unmapped_lengths = []
mapped_lengths = []

for key, value in map_d.items():
    values = map_d[key]
    unmapped_list = np.setdiff1d(ret_d[key],values)
    for eachInt in unmapped_list:
        unmapped_lengths.append( len(eachInt.split(' ')) )
    for eachInt in values:
        mapped_lengths.append( len(eachInt.split(' ')) )

In [127]:
unmapped_counter = Counter(unmapped_lengths)
top_unmapped_lengths = unmapped_counter.most_common(5)
print( top_unmapped_lengths )

[(3, 37802), (2, 36293), (4, 26346), (1, 22792), (5, 17932)]


In [202]:
import plotly.graph_objects as go

night_colors = ['rgb(146, 123, 21)', 'rgb(177, 180, 34)', 'rgb(206, 206, 40)',
                'rgb(175, 51, 21)', 'rgb(35, 36, 21)']

labels = []
values = []

for eachTuple in top_unmapped_lengths:
    labels.append( eachTuple[0] )
    values.append( eachTuple[1] )

# pull is given as a fraction of the pie radius
fig = go.Figure( data=[go.Pie(labels=labels, values=values, textinfo='label+percent',
                             insidetextorientation='radial', pull=[0.2, 0, 0.0, 0, 0], marker_colors=night_colors)])

fig.update_layout(title_text='\'\'Intervention\'\' mention length (number of words) distribution of unmapped entities', title_x=0.5 )
fig.update_layout(legend=dict({'traceorder': 'normal'}), legend_title_text='\'\'Intervention\'\' mention length', legend_x=0.8)
fig.show()

In [203]:
mapped_counter = Counter(mapped_lengths)
top_mapped_lengths = mapped_counter.most_common(5)
print( top_mapped_lengths )

[(1, 72991), (2, 23705), (3, 11916), (4, 5894), (5, 2639)]


In [204]:
night_colors = ['rgb(146, 123, 21)', 'rgb(177, 180, 34)', 'rgb(206, 206, 40)',
                'rgb(175, 51, 21)', 'rgb(35, 36, 21)']

labels_ = []
values_ = []

for eachTuple in top_mapped_lengths:
    labels_.append( eachTuple[0] )
    values_.append( eachTuple[1] )

# pull is given as a fraction of the pie radius
fig = go.Figure( data=[go.Pie(labels=labels_, values=values_, textinfo='label+percent',
                             insidetextorientation='radial', pull=[0.2, 0, 0.0, 0, 0], marker_colors=night_colors)])
fig.update_layout(title_text='\'\'Intervention\'\' mention length (number of words) distribution of mapped entities', title_x=0.5 )
fig.update_layout(legend=dict({'traceorder': 'normal'}), legend_title_text='\'\'Intervention\'\' mention length', legend_x=0.8)
fig.show()

In [229]:
# Create subplots: use 'domain' type for Pie subplot
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=labels_, values=values_, name='XYZ', insidetextorientation='radial', textinfo='label+percent', pull=[0.2, 0, 0.0, 0, 0]), 1, 1)
fig.add_trace(go.Pie(labels=labels, values=values, name='ABC', insidetextorientation='radial',  textinfo='label+percent', pull=[0.2, 0, 0.0, 0, 0]), 1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hoverinfo="label+percent+name")
fig.update_traces(textfont_size=15)
fig.update_layout(legend=dict({'traceorder': 'normal'}), legend_title_text='mention length', legend_x=0.45)

fig.update_layout(
    title_text='Distribution of phrase lengths for the Intervention mentions mapped vs. unmapped')

fig.show()

### Question 5: Ratio of seen vs unseen surface forms

In [300]:
#Test sets
ebm_gold_test = '/home/anjani/systematicReviews/data/TA_screening/EBM_NLP/ebmnlpgold_sentence_annotation2POS.txt'
hilfiker_test = '/home/anjani/systematicReviews/data/TA_screening/EBM_NLP/hilfiker_sentence_annotation2POS.txt'

In [301]:
#Training sets
ebm_pico = '/home/anjani/systematicReviews/data/TA_screening/EBM_NLP/sentence_annotation2POS.txt'
distant_pico = '/mnt/nas2/data/systematicReview/clinical_trials_gov/Weak_PICO/intervention_data_preprocessed/conf_09/extraction1_pos_posnegtrail.txt'

In [302]:
def getTestTokens(input_file):
    
    surfacetokens = []
    
    with open(input_file, 'r') as test1f:
        for eachline in test1f:
            annot = json.loads(eachline)
            abstract_identifier = annot.keys()

            for eachKey in abstract_identifier:
                all_sentences = annot[eachKey]

                for eachSentenceKey in all_sentences.keys():

                    assert len(all_sentences[eachSentenceKey][0]) == len(all_sentences[eachSentenceKey][1])

                    tokens = all_sentences[eachSentenceKey][0]
                    annotations = all_sentences[eachSentenceKey][1]

                    for token, annotation in zip(tokens, annotations):
                        if annotation == '1':
                            surfacetokens.append(token.lower())
                            
    return surfacetokens

In [303]:
ebm_gold_surface_tokens = getTestTokens(ebm_gold_test)
ebm_gold_surface_tokens = set(ebm_gold_surface_tokens)
print('Number of unique surface token for annotation from the EBM-PICO gold test set: ', len(ebm_gold_surface_tokens))

hilfiker_surface_tokens = getTestTokens(hilfiker_test)
hilfiker_surface_tokens = set(hilfiker_surface_tokens)
print('Number of unique surface token for annotation from the hilfiker test set: ', len(hilfiker_surface_tokens))

Number of unique surface token for annotation from the EBM-PICO gold test set:  1043
Number of unique surface token for annotation from the hilfiker test set:  740


In [304]:
ebmpico_surface_tokens = getTestTokens(ebm_pico)
ebmpico_surface_tokens = set(ebmpico_surface_tokens)
print('Number of unique surface token for annotation from the EBM-PICO training set: ', len(ebmpico_surface_tokens))

Number of unique surface token for annotation from the EBM-PICO training set:  14839


In [298]:
def getDistantTokens(input_file):
    surfacetokens = []
    counter = 0
    with open(input_file, 'r') as test1f:
        for eachLine in test1f:
            if len(eachLine) > 20:
                annot = json.loads(eachLine)
                id_ = annot['id']

                # Check if aggregate annotations are present in the json
                if 'aggregate_annot' in annot.keys():

                    # Read official title
                    if 'official_title_pos' in annot['aggregate_annot'].keys():
                        assert len(annot['aggregate_annot']['official_title']) == len(annot['aggregate_annot']['official_title_annot'])
                        labels =  annot['aggregate_annot']['official_title_annot']
                        tokens = annot['aggregate_annot']['official_title']

                        for label, token in zip(labels, tokens):
                            if label == 1:
                                surfacetokens.append( token)

                    if 'brief_title' in annot['aggregate_annot'].keys():
                        assert len(annot['aggregate_annot']['brief_title']) == len(annot['aggregate_annot']['brief_title_annot'])
                        labels =  annot['aggregate_annot']['brief_title_annot']
                        tokens = annot['aggregate_annot']['brief_title']

                        for label, token in zip(labels, tokens):
                            if label == 1:
                                surfacetokens.append( token )

                    if 'brief_summary_annot' in annot['aggregate_annot'].keys():
                        # iterate the dictionary
                        for eachKey, eachValue in annot['aggregate_annot']['brief_summary_annot'].items():
                            assert len(eachValue) == 3
                            raw_labels =  eachValue[1]
                            tokens = eachValue[0]
                            labels = raw_labels

                            for label, token in zip(labels, tokens):
                                if label == 1:
                                    surfacetokens.append( token )

                    if 'detailed_description_annot' in annot['aggregate_annot'].keys():
                        # iterate the dictionary
                        for eachKey, eachValue in annot['aggregate_annot']['detailed_description_annot'].items():
                            assert len(eachValue) == 3
                            raw_labels =  eachValue[1]
                            tokens = eachValue[0]
                            labels = raw_labels

                            for label, token in zip(labels, tokens):
                                if label == 1:
                                    surfacetokens.append( token )

                    if 'intervention_description_annot' in annot['aggregate_annot'].keys():
                        # iterate the dictionary
                        for eachKey, eachValue in annot['aggregate_annot']['intervention_description_annot'].items():
                            assert len(eachValue) == 3
                            raw_labels =  eachValue[1]
                            tokens = eachValue[0]
                            labels = raw_labels

                            for label, token in zip(labels, tokens):
                                if label == 1:
                                    surfacetokens.append( token )
                
    return surfacetokens

In [299]:
distant_pico_surface_tokens = getDistantTokens(distant_pico)
distant_pico_surface_tokens = set(distant_pico_surface_tokens)
print('Number of unique surface token for annotation from the DISTANT-CTO training set: ', len(distant_pico_surface_tokens) )

Number of unique surface token for annotation from the DISTANT-CTO training set:  98358


In [339]:
strong_weak_combined = list(distant_pico_surface_tokens) + list(ebmpico_surface_tokens)
strong_weak_combined = set(strong_weak_combined)
strong_weak_combined = list(strong_weak_combined)

### Test 1

In [346]:
# yields the elements in `list_2` that are NOT in `list_1`
unseen_tokens = np.setdiff1d( list(ebm_gold_surface_tokens) , list(ebmpico_surface_tokens) ) # (list2, list1)
print('Number of unseen tokens in the ebm-pico training: ', len(unseen_tokens))
print('Number of total tokens: ', len(ebm_gold_surface_tokens))
print( len(unseen_tokens) / len(ebm_gold_surface_tokens) * 100 )

# yields the elements in `list_2` that are NOT in `list_1`
unseen_tokens = np.setdiff1d( list(ebm_gold_surface_tokens) , list(distant_pico_surface_tokens) ) # (list2, list1)
print('Number of unseen tokens in the distant-pico training: ', len(unseen_tokens))
print('Number of total tokens: ', len(ebm_gold_surface_tokens))
print( len(unseen_tokens) / len(ebm_gold_surface_tokens) * 100 )


# yields the elements in `list_2` that are NOT in `list_1`
unseen_tokens = np.setdiff1d( list(ebm_gold_surface_tokens) , list(strong_weak_combined) ) # (list2, list1)
print('Number of unseen tokens in the combined annotations: ', len(unseen_tokens))
print('Number of total tokens: ', len(ebm_gold_surface_tokens))
print( len(unseen_tokens) / len(ebm_gold_surface_tokens) * 100 )

Number of unseen tokens in the ebm-pico training:  289
Number of total tokens:  1043
27.708533077660597
Number of unseen tokens in the distant-pico training:  223
Number of total tokens:  1043
21.380632790028763
Number of unseen tokens in the combined annotations:  170
Number of total tokens:  1043
16.29913710450623


### Test 2

In [347]:
# yields the elements in `list_2` that are NOT in `list_1`
unseen_tokens = np.setdiff1d( list(hilfiker_surface_tokens) , list(ebmpico_surface_tokens) ) # (list2, list1)
print('Number of unseen tokens in the ebm-pico training: ', len(unseen_tokens))
print('Number of total tokens: ', len(hilfiker_surface_tokens))
print( len(unseen_tokens) / len(hilfiker_surface_tokens) * 100 )

# yields the elements in `list_2` that are NOT in `list_1`
unseen_tokens = np.setdiff1d( list(hilfiker_surface_tokens) , list(distant_pico_surface_tokens) ) # (list2, list1)
print('Number of unseen tokens in the distant-pico training: ', len(unseen_tokens))
print('Number of total tokens: ', len(hilfiker_surface_tokens))
print( len(unseen_tokens) / len(hilfiker_surface_tokens) * 100 )


# yields the elements in `list_2` that are NOT in `list_1`
unseen_tokens = np.setdiff1d( list(hilfiker_surface_tokens) , list(strong_weak_combined) ) # (list2, list1)
print('Number of unseen tokens in the combined annotations: ', len(unseen_tokens))
print('Number of total tokens: ', len(hilfiker_surface_tokens))
print( len(unseen_tokens) / len(hilfiker_surface_tokens) * 100 )

Number of unseen tokens in the ebm-pico training:  202
Number of total tokens:  740
27.297297297297295
Number of unseen tokens in the distant-pico training:  170
Number of total tokens:  740
22.972972972972975
Number of unseen tokens in the combined annotations:  112
Number of total tokens:  740
15.135135135135137


## Question 6: Drug vs. Non-Drug token-level mentions

In [348]:
# Intervention labels for sub-classes of intervention
# 0 = no label, 3 = Drug, rest = 1,2,4,5,6,7 are non-drugs

In [350]:
# Open and list all the files from a directory
from os import listdir
from os.path import isfile, join

int_subclasses_dir = '/mnt/nas2/data/systematicReview/ebm_nlp_2_00/annotations/aggregated/hierarchical_labels/interventions/train'
int_files = [f for f in listdir(int_subclasses_dir) if isfile(join(int_subclasses_dir, f))]

3278754.AGGREGATED.ann


In [503]:
# In the EBM-PICO dataset
drug_mention_counter = 0
nondrug_mention_counter = 0
no_mention_counter = 0

for eachFile in int_files:
    fullPath = int_subclasses_dir + '/' + eachFile
    annotations = open(fullPath, 'r').read().split('\n')
    contiguous_preds = list(mit.run_length.encode(annotations))
    for eachPrediction in contiguous_preds:
        if eachPrediction[0] != '0' and eachPrediction[0] == '3':
            drug_mention_counter = drug_mention_counter + 1
        elif eachPrediction[0] != '0' and eachPrediction[0] != '3':
            nondrug_mention_counter = nondrug_mention_counter + 1
        else:
            no_mention_counter = no_mention_counter + 1
    
print('------------------------------------------')
print(drug_mention_counter)
print(nondrug_mention_counter)
print('combined: ', drug_mention_counter + nondrug_mention_counter)
print(no_mention_counter)

------------------------------------------
18908
13982
combined:  32890
35861


In [504]:
print('Percentage of drug mentions in the ebm-pico dataset: ', (drug_mention_counter/(drug_mention_counter + nondrug_mention_counter))*100)
print('Percentage of drug mentions in the ebm-pico dataset: ', (nondrug_mention_counter/ (drug_mention_counter + nondrug_mention_counter))*100)

Percentage of drug mentions in the ebm-pico dataset:  57.48859835816358
Percentage of drug mentions in the ebm-pico dataset:  42.51140164183643


In [491]:
def fetchAnnot(data):
    
    annotations_all = []
    
    if 'brief_summary_annot' in data.keys():
        # iterate the dictionary
        for eachLine in data['brief_summary_annot']:
            for eacheachLine in eachLine:
                annotations = eachLine[eacheachLine][1]
                annotations_all.extend( annotations )
                
    if 'detailed_description_annot' in data.keys():
        # iterate the dictionary
        for eachLine in data['detailed_description_annot']:
            for eacheachLine in eachLine:
                annotations = eachLine[eacheachLine][1]
                annotations_all.extend( annotations )
                
    if 'interventionDescription_annot' in data.keys():
        # iterate the dictionary
        for eachLine in data['interventionDescription_annot']:
            for eacheachLine in eachLine:
                annotations = eachLine[eacheachLine][1]
                annotations_all.extend( annotations )
                
    if 'official_title_annot' in data.keys():
        annotations = data['official_title_annot']
        annotations_all.extend( annotations )
        
    if 'brief_title_annot' in data.keys():
        annotations = data['brief_title_annot']
        annotations_all.extend( annotations )
        
    return annotations_all

In [496]:
def getAnnot(data, choice):
    
    annotations_all = []

    annotations = fetchAnnot(data)
    annotations_all.extend( annotations )
    
    all_values = [value for key,value in data.items() if key.startswith("syn_")]
    if all_values:
        annotations_syn = fetchAnnot(all_values[0])
        annotations_all.extend( annotations_syn )

    return annotations_all

In [497]:
# In the DISTANT-CTO dataset
counter_stop = 0
drug_distant_counter = 0
nondrug_distant_counter = 0

disntatfile = '/mnt/nas2/data/systematicReview/clinical_trials_gov/Weak_PICO/intervention_data_preprocessed/conf_09/extraction1_pos_posnegtrail.txt'
with open(disntatfile, 'r') as distant_file:
    for eachLine in distant_file:
        counter_stop = counter_stop + 1
        if len(eachLine) > 20:
            annot = json.loads(eachLine)
            id_ = annot['id']
            # Check if aggregate annotations are present in the json
            if 'extraction1' in annot.keys():
                extraction1 = annot['extraction1']
                for eachIntervention in extraction1.items():
                    typeInt = eachIntervention[1]['intervention_type']
                    if 'drug' in typeInt:
                        annotations = getAnnot(eachIntervention[1], 'drug')
                        
                        contiguous_preds = list(mit.run_length.encode(annotations))
                        for eachPrediction in contiguous_preds:
                            if eachPrediction[0] != 0 and eachPrediction[0] == 1:
                                drug_distant_counter = drug_distant_counter + 1
                    else:
                        # other intervention
                        annotations = getAnnot(eachIntervention[1], 'nodrug')
                        
                        contiguous_preds = list(mit.run_length.encode(annotations))
                        for eachPrediction in contiguous_preds:
                            if eachPrediction[0] != 0 and eachPrediction[0] == 1:
                                nondrug_distant_counter = nondrug_distant_counter + 1
            
print( drug_distant_counter )
print( nondrug_distant_counter )
print(drug_distant_counter + nondrug_distant_counter)

748946
501244
1250190


In [500]:
print('Percentage of drug mentions in the distant-cto dataset: ', (drug_distant_counter/(drug_distant_counter + nondrug_distant_counter))*100)
print('Percentage of drug mentions in the distant-cto dataset: ', (nondrug_distant_counter/ (drug_distant_counter + nondrug_distant_counter))*100)

Percentage of drug mentions in the distant-cto dataset:  59.90657420072149
Percentage of drug mentions in the distant-cto dataset:  40.093425799278506
