In [1]:
import pickle
import numpy as np
import re

from data_preparation.EMA_documents import SectionLeaflet, Leaflet
from helpers.entity_recognition import highlight_entities, unique_entities

In [2]:
# load array of objects, where object - class Leaflet
with open("data_preparation/LEAFLET_DATASET_PROCESSED_NER.pickle", "rb") as f:
    package_leaflets_NER = pickle.load(f)

In [3]:
# load array of objects, where object - class Leaflet
with open("data_preparation/LEAFLET_DATASET_PROCESSED_InferICD10CM.pickle", "rb") as f:
    package_leaflets_ICD10CM = pickle.load(f)

In [4]:
# load array of objects, where object - class Leaflet
with open("data_preparation/LEAFLET_DATASET_PROCESSED_InferRxNorm.pickle", "rb") as f:
    package_leaflets_RxNorm = pickle.load(f)

In [5]:
# load array of objects, where object - class Leaflet
with open("data_preparation/LEAFLET_DATASET_PROCESSED_Stanza.pickle", "rb") as f:
    package_leaflets_Stanza = pickle.load(f)

In [6]:
assert len(package_leaflets_NER) == len(package_leaflets_ICD10CM)
assert len(package_leaflets_NER) == len(package_leaflets_RxNorm)
assert len(package_leaflets_NER) == len(package_leaflets_Stanza)

#### Convert Stanza NER to a suitable format

In [7]:
def format_stanza(package_leaflets_Stanza):
    """ Stanza entity not a dict but - <class 'stanza.models.common.doc.Span'> """
    
    for leaflet_index, leaflet in enumerate(package_leaflets_Stanza):
    
        current_leaflet_sections = [leaflet.section1, leaflet.section2, 
                                    leaflet.section3, leaflet.section4, 
                                    leaflet.section5, leaflet.section6]
    
        for section_index, current_section in enumerate(current_leaflet_sections):
        
            if current_section.section_content is None:
                current_section.entity_recognition = []
                continue

            ### set empty section content to None
            if len(current_section.section_content) <= 1:
                current_section.section_content = None
                current_section.entity_recognition = []
                continue
            
            if current_section.entity_recognition is None:
                current_section.entity_recognition = []
                continue
            
            if len(current_section.entity_recognition) < 1:
                current_section.entity_recognition = []
                continue
            
            
            formatted_stanza_NER = []

            for entity in current_section.entity_recognition:

                formatted_entity = {'Text':entity.text, 'Type':entity.type, 'BeginOffset':entity.start_char, 'EndOffset':entity.end_char}
                formatted_stanza_NER.append(formatted_entity)
            
            # update
            current_section.entity_recognition = formatted_stanza_NER

In [8]:
format_stanza(package_leaflets_Stanza)

In [9]:
### Preprocess 2 - replace None values with empty
def replace_None_entities(leaflets_NER):
    
    for leaflet_index, leaflet in enumerate(leaflets_NER):
        
        current_leaflet_sections = [leaflet.section1, leaflet.section2, 
                                    leaflet.section3, leaflet.section4, 
                                    leaflet.section5, leaflet.section6]

        for section_index, current_section in enumerate(current_leaflet_sections):

            if current_section.section_content is None:
                current_section.entity_recognition = []
                continue

            ### set empty section content to None
            if len(current_section.section_content) < 1:
                current_section.section_content = None
                current_section.entity_recognition = []
                continue
            
            if current_section.entity_recognition is None:
                current_section.entity_recognition = []
                continue
            
            if len(current_section.entity_recognition) < 1:
                current_section.entity_recognition = []
                continue

In [10]:
replace_None_entities(package_leaflets_NER)
replace_None_entities(package_leaflets_ICD10CM)
replace_None_entities(package_leaflets_RxNorm)

## Merge multiple NER into one NER

In [11]:
# save merged NER for each section in package_leaflets_final
with open("data_preparation/LEAFLET_DATASET_PROCESSED.pickle", "rb") as f:
    package_leaflets_final = pickle.load(f)

### Strategy
 
Add new entities one-by-one to existing entities (while adding make sure it is indeed a new entity)     
Sort all entities by 'BeginOffset'       
Save merged NER in package_leaflets_final  

In [12]:
def detect_entities_digits(section_content):
    """
    
    Source - https://stackoverflow.com/questions/4289331/how-to-extract-numbers-from-a-string-in-python/4289415#4289415
           - https://stackoverflow.com/questions/4664850/how-to-find-all-occurrences-of-a-substring
    """
    
    # return empty if section_content is None
    if section_content is None:
        return []
    
    # find all digits in seciton_content
    digits = re.findall("[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", section_content)
    
    # save digit as tuple (digit, BeginOffset, EndOffset)
    digits_entities = set()
    
    # simple tokenization (by " ") - better than - wordpunct_tokenize(section_content)
    tokenized_section = section_content.split()
    
    for token in tokenized_section:
        
        # if token is indeed a digit 
        if token in digits:
            
            # find all occurrences of a token in section_content
            all_token_occurrences = [m.start() for m in re.finditer(token, section_content)]
            
            for occurrence in all_token_occurrences:
                # occurrence - index where current token(digit) starts in text
                left_char = occurrence - 1
                right_char = occurrence + len(token)

                # token(digit) has to be surrounded by whitespaces (ideally) or any punctuations from left and right
                try:
                    if not section_content[left_char].isalpha() and not section_content[right_char].isalpha() and not section_content[left_char].isdigit() and not section_content[right_char].isdigit():
                        # that's a digit - save it
                        digits_entities.add((token, occurrence, right_char))
                except:
                    pass
        
        # case when there is a punctuation symbol after the digit '122,' (address the limitation of simple tokenization)
        elif token[:-1] in digits:
            # remove the last punctuation symbol from token(digit) and do same steps as above
            token = token[:-1]
            
            all_token_occurrences = [m.start() for m in re.finditer(token, section_content)]

            for occurrence in all_token_occurrences:
                # occurrence - index where current digit starts in text
                left_char = occurrence - 1
                right_char = occurrence + len(token)

                # basically accept whitespaces + punctuiations
                try:
                    if not section_content[left_char].isalpha() and not section_content[right_char].isalpha() and not section_content[left_char].isdigit() and not section_content[right_char].isdigit():
                        # that's a digit - save it
                        digits_entities.add((token, occurrence, right_char))
                except:
                    pass
    
    return digits_entities

In [13]:
def _sort_key(entity):
    return entity['BeginOffset']

In [14]:
def merge_NERs(NER_original, NER_additional):
    
    # no need to check for None, since in processing replaces with []
    
    # default NER
    NER_final = NER_original.copy()
    
    # lookup table
    entity_positions = dict()
    for entity in NER_final:
        entity_positions[(entity['BeginOffset'], entity['EndOffset'])] = entity
    
    # now add additional NERs
    for new_entity in NER_additional:
        
        new_entity_text = new_entity['Text']
        new_entity_start = new_entity['BeginOffset']
        new_entity_end = new_entity['EndOffset']
        
        # find identical - skip
        if (new_entity_start, new_entity_end) in entity_positions:
            continue
        
        # make sure candidate_entity (new_entity) does not overlap with other entities (alrady existing ones)
        is_overlapping = False
        
        # check for overlapping with existing entities
        for existing_entity in NER_original:
            existing_entity_start = existing_entity['BeginOffset']
            existing_entity_end = existing_entity['EndOffset']
            
            # candidate entity (new entity) is overlapping with existing entities by either 'BeginOffset' or 'EndOffset'
            # do not add candidate entity (new entity) - keep the existing one
            if new_entity_start in range(existing_entity_start, existing_entity_end+1) or new_entity_end in range(existing_entity_start, existing_entity_end+1):
                is_overlapping = True
            
            # favor longer entities
            if new_entity_start == existing_entity_start and new_entity_end > existing_entity_end:
                NER_final.remove(existing_entity)
                NER_final.append(new_entity)
                is_overlapping = True
                continue
            
            if new_entity_end == existing_entity_end and new_entity_start < existing_entity_start:
                NER_final.remove(existing_entity)
                NER_final.append(new_entity)
                is_overlapping = True
                continue
                
            # case when a candidate entity (new_entity) is "wider" than already existing one
            # keep longer NER than shorter (basically - favor Stanza NER over AWS)
            if existing_entity_start in range(new_entity_start, new_entity_end+1) and existing_entity_end in range(new_entity_start, new_entity_end+1):
                NER_final.remove(existing_entity)
                NER_final.append(new_entity)
                is_overlapping = True
                continue
            
            # note: do not care if candidate entity is inside the existing one
    
        # new entity has no overlap with existing entities, just add new separate entity
        if not is_overlapping: NER_final.append(new_entity)
        
    # sort entities by 'BeginOffset'
    NER_final = sorted(NER_final, key=_sort_key)
    
    return NER_final

In [15]:
def merge_digits_NERs(updated_entities, digits_NER):
    
    
    # add digits_NER to the combined NERs
    for digit_entity in digits_NER:
        digit_text = str(digit_entity[0])
        digit_start = digit_entity[1]
        digit_end = digit_entity[2]
        
        entity_digit = {'Text':digit_text, 'Type':'NUMBER', 'BeginOffset':digit_start, 'EndOffset':digit_end}
        
        updated_entities.append(entity_digit)
    
    # sort entities by 'BeginOffset'
    updated_entities = sorted(updated_entities, key=_sort_key)
    
    return updated_entities

In [16]:
def remove_duplicate_entities(updated_entities):
        
    # lookup table for unique entities
    entity_positions = dict()
    
    # remove duplicates
    for entity_ind in range(len(updated_entities)):
        entity_text = updated_entities[entity_ind]['Text']
        entity_start = updated_entities[entity_ind]['BeginOffset']
        entity_end = updated_entities[entity_ind]['EndOffset']
        
        if (entity_start, entity_end) in entity_positions:
            # that's a duplicate - set entity to None
            updated_entities[entity_ind] = None
        else:
            entity_positions[(entity_start, entity_end)] = updated_entities[entity_ind]
    
    # now get rid off 'None'
    updated_entities_final = [entity for entity in updated_entities if entity is not None]
    
    # sort entities by 'BeginOffset'
    updated_entities_final = sorted(updated_entities_final, key=_sort_key)
    
    return updated_entities_final

In [17]:
def remove_overlapping_entities(updated_entities):
    
    # favor "longer" entities when dealing with overlapping     
    
    assert sorted(updated_entities, key=_sort_key) == updated_entities
    
    for curr_ind in range(len(updated_entities)):
        
        if updated_entities[curr_ind] is None: continue
        
        curr_entity_start = updated_entities[curr_ind]['BeginOffset']
        curr_entity_end = updated_entities[curr_ind]['EndOffset']
        
        for next_ind in range(curr_ind+1, len(updated_entities)):
            
            if updated_entities[next_ind] is None: continue
            
            next_entity_start = updated_entities[next_ind]['BeginOffset']
            next_entity_end = updated_entities[next_ind]['EndOffset']
            
            # find overlapping entities
            # entities must be separate
            if curr_entity_start in range(next_entity_start, next_entity_end+1) or curr_entity_end in range(next_entity_start, next_entity_end+1):
                # keep the longer entity in case of overlapping
                range_curr = curr_entity_end - curr_entity_start
                range_next = next_entity_end - next_entity_start
                
                if range_next > range_curr:
                    # todo - remove curr
                    updated_entities[curr_ind] = None
                else:
                    # todo - remove next
                    updated_entities[next_ind] = None
    
    # now get rid off 'None'
    updated_entities_final = [entity for entity in updated_entities if entity is not None]
    
    # sort entities by 'BeginOffset'
    updated_entities_final = sorted(updated_entities_final, key=_sort_key)
    
    return updated_entities_final

### Main code

In [18]:
for leaflet_idx in range(len(package_leaflets_final)):
    
    for section_idx in range(1,7):
        
        if section_idx == 1:
            current_section_content = package_leaflets_NER[leaflet_idx].section1.section_content
            original_NER = package_leaflets_NER[leaflet_idx].section1.entity_recognition.copy()
            icd10cm_NER = package_leaflets_ICD10CM[leaflet_idx].section1.entity_recognition.copy()
            rxnorm_NER = package_leaflets_RxNorm[leaflet_idx].section1.entity_recognition.copy()
            stanza_NER = package_leaflets_Stanza[leaflet_idx].section1.entity_recognition.copy()
        
        elif section_idx == 2:
            current_section_content = package_leaflets_NER[leaflet_idx].section2.section_content
            original_NER = package_leaflets_NER[leaflet_idx].section2.entity_recognition.copy()
            icd10cm_NER = package_leaflets_ICD10CM[leaflet_idx].section2.entity_recognition.copy()
            rxnorm_NER = package_leaflets_RxNorm[leaflet_idx].section2.entity_recognition.copy()
            stanza_NER = package_leaflets_Stanza[leaflet_idx].section2.entity_recognition.copy()
        
        elif section_idx == 3:
            current_section_content = package_leaflets_NER[leaflet_idx].section3.section_content
            original_NER = package_leaflets_NER[leaflet_idx].section3.entity_recognition.copy()
            icd10cm_NER = package_leaflets_ICD10CM[leaflet_idx].section3.entity_recognition.copy()
            rxnorm_NER = package_leaflets_RxNorm[leaflet_idx].section3.entity_recognition.copy()
            stanza_NER = package_leaflets_Stanza[leaflet_idx].section3.entity_recognition.copy()
        
        elif section_idx == 4:
            current_section_content = package_leaflets_NER[leaflet_idx].section4.section_content
            original_NER = package_leaflets_NER[leaflet_idx].section4.entity_recognition.copy()
            icd10cm_NER = package_leaflets_ICD10CM[leaflet_idx].section4.entity_recognition.copy()
            rxnorm_NER = package_leaflets_RxNorm[leaflet_idx].section4.entity_recognition.copy()
            stanza_NER = package_leaflets_Stanza[leaflet_idx].section4.entity_recognition.copy()
        
        elif section_idx == 5:
            current_section_content = package_leaflets_NER[leaflet_idx].section5.section_content
            original_NER = package_leaflets_NER[leaflet_idx].section5.entity_recognition.copy()
            icd10cm_NER = package_leaflets_ICD10CM[leaflet_idx].section5.entity_recognition.copy()
            rxnorm_NER = package_leaflets_RxNorm[leaflet_idx].section5.entity_recognition.copy()
            stanza_NER = package_leaflets_Stanza[leaflet_idx].section5.entity_recognition.copy()
        
        elif section_idx == 6:
            current_section_content = package_leaflets_NER[leaflet_idx].section6.section_content
            original_NER = package_leaflets_NER[leaflet_idx].section6.entity_recognition.copy()
            icd10cm_NER = package_leaflets_ICD10CM[leaflet_idx].section6.entity_recognition.copy()
            rxnorm_NER = package_leaflets_RxNorm[leaflet_idx].section6.entity_recognition.copy()
            stanza_NER = package_leaflets_Stanza[leaflet_idx].section6.entity_recognition.copy()
        
        
        # default NER - original_NER
        # merge other entities to default one-by-one
        updated_entities = merge_NERs(original_NER, icd10cm_NER)
        updated_entities = merge_NERs(updated_entities, rxnorm_NER)
        updated_entities = merge_NERs(updated_entities, stanza_NER)
        
        # detect digits as entities in current section_content
        try:
            digits_NER = detect_entities_digits(current_section_content)
        except:
            digits_NER = []
        
        # merge digits-entities
        updated_entities = merge_digits_NERs(updated_entities, digits_NER)
        
        # remove duplicate entities
        updated_entities_final = remove_duplicate_entities(updated_entities)
        
        # remove overlapping entities
        updated_entities_final = remove_overlapping_entities(updated_entities_final)
        
        # save update entities
        if section_idx == 1: package_leaflets_final[leaflet_idx].section1.entity_recognition = updated_entities_final
        elif section_idx == 2: package_leaflets_final[leaflet_idx].section2.entity_recognition = updated_entities_final
        elif section_idx == 3: package_leaflets_final[leaflet_idx].section3.entity_recognition = updated_entities_final
        elif section_idx == 4: package_leaflets_final[leaflet_idx].section4.entity_recognition = updated_entities_final
        elif section_idx == 5: package_leaflets_final[leaflet_idx].section5.entity_recognition = updated_entities_final
        elif section_idx == 6: package_leaflets_final[leaflet_idx].section6.entity_recognition = updated_entities_final


In [19]:
# save results
with open("datasets/LEAFLET_DATASET_PROCESSED_NER_COMBINED.pickle", "wb") as f:
    pickle.dump(package_leaflets_final, f)

### How many new entities did we add?

In [20]:
COUNT_NER_BEFORE = {'1':0, '2':0, '3':0, '4':0, '5':0, '6':0}
COUNT_NER_AFTER = {'1':0, '2':0, '3':0, '4':0, '5':0, '6':0}

for leaflet_idx in range(len(package_leaflets_NER)):
    
    for section_idx in range(1,7):
        
        if section_idx == 1:
            for entity in package_leaflets_NER[leaflet_idx].section1.entity_recognition:
                COUNT_NER_BEFORE['1'] += 1
            for entity in package_leaflets_final[leaflet_idx].section1.entity_recognition:
                COUNT_NER_AFTER['1'] += 1
            
        elif section_idx == 2:
            for entity in package_leaflets_NER[leaflet_idx].section2.entity_recognition:
                COUNT_NER_BEFORE['2'] += 1
            for entity in package_leaflets_final[leaflet_idx].section2.entity_recognition:
                COUNT_NER_AFTER['2'] += 1
            
        elif section_idx == 3:
            for entity in package_leaflets_NER[leaflet_idx].section3.entity_recognition:
                COUNT_NER_BEFORE['3'] += 1
            for entity in package_leaflets_final[leaflet_idx].section3.entity_recognition:
                COUNT_NER_AFTER['3'] += 1
            
        elif section_idx == 4:
            for entity in package_leaflets_NER[leaflet_idx].section4.entity_recognition:
                COUNT_NER_BEFORE['4'] += 1
            for entity in package_leaflets_final[leaflet_idx].section4.entity_recognition:
                COUNT_NER_AFTER['4'] += 1
            
        elif section_idx == 5:
            for entity in package_leaflets_NER[leaflet_idx].section5.entity_recognition:
                COUNT_NER_BEFORE['5'] += 1
            for entity in package_leaflets_final[leaflet_idx].section5.entity_recognition:
                COUNT_NER_AFTER['5'] += 1
            
        elif section_idx == 6:
            for entity in package_leaflets_NER[leaflet_idx].section6.entity_recognition:
                COUNT_NER_BEFORE['6'] += 1
            for entity in package_leaflets_final[leaflet_idx].section6.entity_recognition:
                COUNT_NER_AFTER['6'] += 1

In [21]:
print(COUNT_NER_BEFORE)
print(COUNT_NER_AFTER)

{'1': 31860, '2': 148517, '3': 35217, '4': 200545, '5': 1007, '6': 22036}
{'1': 37366, '2': 165950, '3': 65039, '4': 175840, '5': 7441, '6': 49206}


Before removing overlapping:       

{'1': 31860, '2': 148517, '3': 35217, '4': 200545, '5': 1007, '6': 22036}    
{'1': 38692, '2': 173954, '3': 67761, '4': 194136, '5': 7561, '6': 51521}    

Before removing duplicate entities   
{'1': 31860, '2': 148517, '3': 35217, '4': 200545, '5': 1007, '6': 22036}   
{'1': 42324, '2': 192028, '3': 71057, '4': 231857, '5': 7643, '6': 52584}    

** Interesting observation**: NER in section4 decreased... let's see why at the end of the notebook   



### Test the order of entities and duplicate entities

In [22]:
def test_order_entities(section_entities):
    
    sorted_entities = sorted(section_entities, key=_sort_key)
    
    if sorted_entities == section_entities:
        return True
    
    return False


def test_duplicate_entities(section_entities):
    
    """
    Duplicate if for multiple entities have same 'BeginOffset' and 'EndOffset'
    """
    
    COUNT_DUPLICATES = 0
    
    # lookup table
    entity_positions = dict()
    
    for entity in section_entities:
        
        entity_text = entity['Text']
        entity_start = entity['BeginOffset']
        entity_end = entity['EndOffset']
        
        if (entity_start, entity_end) in entity_positions:
            COUNT_DUPLICATES += 1
            
            # print(entity_start, entity_end, entity_text)
            # print(entity_positions[(entity_start, entity_end)])
        else:
            entity_positions[(entity_start, entity_end)] = entity
    
    return COUNT_DUPLICATES


def test_overlapping_entities(section_entities):
    
    """
    
    """
    
    assert test_order_entities(section_entities) == True
    
    COUNT_OVERLAPS = 0
    
    for curr_ind in range(len(section_entities)):
        curr_entity_start = section_entities[curr_ind]['BeginOffset']
        curr_entity_end = section_entities[curr_ind]['EndOffset']
        
        for next_ind in range(curr_ind+1, len(section_entities)):
            next_entity_start = section_entities[next_ind]['BeginOffset']
            next_entity_end = section_entities[next_ind]['EndOffset']
            
            # check for overlapping now
            # entities must be separate
            if curr_entity_start in range(next_entity_start, next_entity_end+1) or curr_entity_end in range(next_entity_start, next_entity_end+1):
                COUNT_OVERLAPS += 1
        
    
    return COUNT_OVERLAPS


def test_merging(package_leaflets):
    
    COUNT_NER_TOTAL = 0
    
    COUNT_NER_WRONG_ORDER = 0
    COUNT_NER_DUPLICATE_ENTITIES = 0
    COUNT_NER_OVERLAP_ENTITIES = 0
    
    for leaflet_index, leaflet in enumerate(package_leaflets):
    
        current_leaflet_sections = [leaflet.section1, leaflet.section2, 
                                    leaflet.section3, leaflet.section4, 
                                    leaflet.section5, leaflet.section6]
    
        for section_index, current_section in enumerate(current_leaflet_sections):
            
            if not test_order_entities(current_section.entity_recognition):
                COUNT_NER_WRONG_ORDER += 1
            
            COUNT_NER_DUPLICATE_ENTITIES += test_duplicate_entities(current_section.entity_recognition)
            
            COUNT_NER_OVERLAP_ENTITIES += test_overlapping_entities(current_section.entity_recognition)
            
            COUNT_NER_TOTAL += 1
    
    print("Total number of NER: ", COUNT_NER_TOTAL)
    print("Num. of NER with wrong order: ", COUNT_NER_WRONG_ORDER)
    print("Num. of NER with duplicate entities: ", COUNT_NER_DUPLICATE_ENTITIES)
    print("Num. of NER with overlapping entities: ", COUNT_NER_OVERLAP_ENTITIES)

In [23]:
test_merging(package_leaflets_final)

Total number of NER:  8016
Num. of NER with wrong order:  0
Num. of NER with duplicate entities:  0
Num. of NER with overlapping entities:  0


Before removing overlapping:   

Total number of NER:  8016   
Num. of NER with wrong order:  0    
Num. of NER with duplicate entities:  0   
Num. of NER with overlapping entities:  33230    

Before removing duplicates:   

Total number of NER:  8016   
Num. of NER with wrong order:  0   
Num. of NER with duplicate entities:  63868    
Num. of NER with overlapping entities:  63868     

In [24]:
def test_presence_orig_entities(package_leaflets_NER, package_leaflets_final):
    
    COUNT_ORIG_ENTITY_NOT_INCLUDED = 0

    for leaflet_idx in range(len(package_leaflets_NER)):

        for section_idx in range(1,7):

            if section_idx == 1:
                for entity in package_leaflets_NER[leaflet_idx].section1.entity_recognition:
                    orig_entity_text = entity['Text']
                    
                    is_present = False
                    
                    for entity_2 in package_leaflets_final[leaflet_idx].section1.entity_recognition:
                        entity_text = entity_2['Text']
                        
                        if orig_entity_text in entity_text:
                            is_present = True
                    
                    if not is_present: COUNT_ORIG_ENTITY_NOT_INCLUDED += 1
                            
            elif section_idx == 2:
                for entity in package_leaflets_NER[leaflet_idx].section2.entity_recognition:
                    orig_entity_text = entity['Text']
                    
                    is_present = False
                    
                    for entity_2 in package_leaflets_final[leaflet_idx].section2.entity_recognition:
                        entity_text = entity_2['Text']
                        
                        if orig_entity_text in entity_text:
                            is_present = True
                    
                    if not is_present: COUNT_ORIG_ENTITY_NOT_INCLUDED += 1

            elif section_idx == 3:
                for entity in package_leaflets_NER[leaflet_idx].section3.entity_recognition:
                    orig_entity_text = entity['Text']
                    
                    is_present = False
                    
                    for entity_2 in package_leaflets_final[leaflet_idx].section3.entity_recognition:
                        entity_text = entity_2['Text']
                        
                        if orig_entity_text in entity_text:
                            is_present = True
                    
                    if not is_present: COUNT_ORIG_ENTITY_NOT_INCLUDED += 1

            elif section_idx == 4:
                for entity in package_leaflets_NER[leaflet_idx].section4.entity_recognition:
                    orig_entity_text = entity['Text']
                    
                    is_present = False
                    
                    for entity_2 in package_leaflets_final[leaflet_idx].section4.entity_recognition:
                        entity_text = entity_2['Text']
                        
                        if orig_entity_text in entity_text:
                            is_present = True
                    
                    if not is_present: COUNT_ORIG_ENTITY_NOT_INCLUDED += 1

            elif section_idx == 5:
                for entity in package_leaflets_NER[leaflet_idx].section5.entity_recognition:
                    orig_entity_text = entity['Text']
                    
                    is_present = False
                    
                    for entity_2 in package_leaflets_final[leaflet_idx].section5.entity_recognition:
                        entity_text = entity_2['Text']
                        
                        if orig_entity_text in entity_text:
                            is_present = True
                    
                    if not is_present: COUNT_ORIG_ENTITY_NOT_INCLUDED += 1

            elif section_idx == 6:
                for entity in package_leaflets_NER[leaflet_idx].section6.entity_recognition:
                    orig_entity_text = entity['Text']
                    
                    is_present = False
                    
                    for entity_2 in package_leaflets_final[leaflet_idx].section6.entity_recognition:
                        entity_text = entity_2['Text']
                        
                        if orig_entity_text in entity_text:
                            is_present = True
                    
                    if not is_present: COUNT_ORIG_ENTITY_NOT_INCLUDED += 1
    
    return COUNT_ORIG_ENTITY_NOT_INCLUDED

In [25]:
COUNT_ORIG_ENTITY_NOT_INCLUDED = test_presence_orig_entities(package_leaflets_NER, package_leaflets_final)

print('Num. of original entities not included in the combined NER version: ', COUNT_ORIG_ENTITY_NOT_INCLUDED)

Num. of original entities not included in the combined NER version:  1549


** Observation **: with overlapping - 0, when overlapping removed - 1549. Still very small number compared to total number of entities detected in the whole dataset   

### Test merged NERs

In [26]:
import pandas as pd
import numpy as np
import html
import random
from IPython.core.display import display, HTML

def html_escape(text):
    return html.escape(text)


def highlight_entities(text, entities):
    """
    text - string
    entities - dict
    """
    
    for entity in entities:
        
        entity_text = entity['Text']
        
        if entity_text[0].isdigit():
            continue
        
        # number between 0.0 (fully transparent) and 1.0 (fully opaque)
        weight = 1
        
        text_highlighted = '<span style="background-color:rgba(135,206,250,' + str(weight) + ');">' \
                            + html_escape(entity_text) + '</span>'
        
        text = text.replace(entity_text, text_highlighted)
    
    # display detected entities in section content
    display(HTML(text))

In [42]:
highlight_entities(package_leaflets_NER[1000].section2.section_content, package_leaflets_NER[1000].section2.entity_recognition)
print()
highlight_entities(package_leaflets_final[1000].section2.section_content, package_leaflets_final[1000].section2.entity_recognition)

for entity in sorted(package_leaflets_final[1000].section2.entity_recognition, key=_sort_key):
    print(entity['Text'], ' ------ ', entity['Type'], ' -------- ', entity['BeginOffset'], ' -------- ', entity['EndOffset'])




victoza  ------  BRAND_NAME  --------  11  --------  18
allergic  ------  PROBLEM  --------  30  --------  38
liraglutide  ------  GENERIC_NAME  --------  42  --------  53
this medicine  ------  TREATMENT  --------  89  --------  102
victoza  ------  BRAND_NAME  --------  206  --------  213
a disease of the pancreas  ------  PROBLEM  --------  239  --------  264
this medicine  ------  TREATMENT  --------  266  --------  279
type 1 diabetes  ------  DX_NAME  --------  311  --------  326
1  ------  NUMBER  --------  316  --------  317
body  ------  SYSTEM_ORGAN_SITE  --------  333  --------  337
insulin  ------  GENERIC_NAME  --------  359  --------  366
diabetic ketoacidosis  ------  DX_NAME  --------  371  --------  392
complication of diabetes  ------  DX_NAME  --------  396  --------  420
high blood sugar  ------  DX_NAME  --------  426  --------  442
an insulin  ------  TREATMENT  --------  489  --------  499
insulin  ------  GENERIC_NAME  --------  553  --------  560
victoza  -----

### Addressing 'Interesting Observation'

explanation:   
- Stanza NER kinda favors detecting phrases as entities. E.g original - 'vomiting', 'nausea', 'diarrhoea' - 3 different entities, Stanza will detect it as 1 entity 'vomiting, nausea and diarrhoea'.    
- Overlapping is occuring in the original NER   


In [46]:
# let's see if 2nd point is true

overlapping_results = test_overlapping_original_entities(package_leaflets_NER)

overlapping_results

{'1': 2479, '2': 16437, '3': 2370, '4': 23045, '5': 37, '6': 200}

In [45]:
def test_overlapping_original_entities(package_leaflets_NER):
    
    COUNT_NER_OVERLAPPING = {'1':0, '2':0, '3':0, '4':0, '5':0, '6':0}
    
    for leaflet_idx in range(len(package_leaflets_NER)):
    
        for section_idx in range(1,7):
        
            if section_idx == 1:
                original_NER = package_leaflets_NER[leaflet_idx].section1.entity_recognition
                COUNT_NER_OVERLAPPING['1'] += test_overlapping_entities(original_NER)
        
            elif section_idx == 2:
                original_NER = package_leaflets_NER[leaflet_idx].section2.entity_recognition
                COUNT_NER_OVERLAPPING['2'] += test_overlapping_entities(original_NER)

            elif section_idx == 3:
                original_NER = package_leaflets_NER[leaflet_idx].section3.entity_recognition
                COUNT_NER_OVERLAPPING['3'] += test_overlapping_entities(original_NER)

            elif section_idx == 4:
                original_NER = package_leaflets_NER[leaflet_idx].section4.entity_recognition
                COUNT_NER_OVERLAPPING['4'] += test_overlapping_entities(original_NER)

            elif section_idx == 5:
                original_NER = package_leaflets_NER[leaflet_idx].section5.entity_recognition
                COUNT_NER_OVERLAPPING['5'] += test_overlapping_entities(original_NER)

            elif section_idx == 6:
                original_NER = package_leaflets_NER[leaflet_idx].section6.entity_recognition
                COUNT_NER_OVERLAPPING['6'] += test_overlapping_entities(original_NER)
    
    return COUNT_NER_OVERLAPPING