In [1]:
import pickle
import statistics

from EMA_documents import SectionLeaflet, Leaflet
from test_postprocessing_dataset import *

In [2]:
# load array of objects, where object - class Leaflet
with open("LEAFLET_DATASET_PROCESSED.pickle", "rb") as f:
    package_leaflets_raw = pickle.load(f)

In [3]:
len(package_leaflets_raw)

1336

In [4]:
# before performing NER with AWS

# count how many None values there were

COUNT_NONE_section_content = 0
COUNT_NONE_section_entities = 0

for leaflet in package_leaflets_raw:
    
    current_leaflet_sections = [leaflet.section1, leaflet.section2, 
                                leaflet.section3, leaflet.section4, 
                                leaflet.section5, leaflet.section6]
    
    for current_section in current_leaflet_sections:
        
        if current_section.section_content is None:
            COUNT_NONE_section_content += 1
        
        if current_section.entity_recognition == None:
            COUNT_NONE_section_entities += 1

In [5]:
print("Before NER: ")
print("Section Content is None: ", COUNT_NONE_section_content)
print("Entity Recognition is None: ", COUNT_NONE_section_entities)

Before NER: 
Section Content is None:  76
Entity Recognition is None:  8016


**Comment:** Numbers are correct! 
- Section Content is None: 76 (in leaflets with unique product_name). In preprocessing step, number of None section contents was 86, but for all the leaflets (including duplicate product_name). Another confirmation: in Perform_NER_leaflets_AWS notebook: Number of None Sections discovered during NER is also 76 (COUNT_NONE_SECTIONS)  
- len(package_leaflets_raw) * 6 = 8016 - total number of sections in dataset. By default, each section_content has a None value for the corresponding entity_recognition before performing NER      

In [6]:
# load array of objects, where object - class Leaflet
with open("LEAFLET_DATASET_PROCESSED_NER.pickle", "rb") as f:
    package_leaflets_ner = pickle.load(f)

In [7]:
len(package_leaflets_ner)

1336

In [8]:
# after NER

COUNT_NONE_section_content = 0
COUNT_NONE_section_entities = 0

for leaflet in package_leaflets_ner:
    
    current_leaflet_sections = [leaflet.section1, leaflet.section2, 
                                leaflet.section3, leaflet.section4, 
                                leaflet.section5, leaflet.section6]
    
    for current_section in current_leaflet_sections:
        
        if current_section.section_content is None:
            COUNT_NONE_section_content += 1
        
        if current_section.entity_recognition == None:
            COUNT_NONE_section_entities += 1

In [9]:
print("After NER: ")
print("Section Content is None: ", COUNT_NONE_section_content)
print("Entity Recognition is None: ", COUNT_NONE_section_entities)

After NER: 
Section Content is None:  76
Entity Recognition is None:  86


**Comment:** Numbers are correct!    
- Section Content is None: 76 (the number stays the same as before NER)     
- Entity Recognition is None:  86 ---- performed NER for all the section contents that are not NONE (*76*) and non-empty (*10*)   

### Check for duplicates section_content by comparing NER outputs of each section_content

In [149]:
### find out the number of duplicate NER outputs

# Keep Track of NER outputs 
# (key, value) - (NER_output, (num.occurences, section_indices containing NER_output , product_names(leaflets) containing NER_output))
NER_outputs = dict()

COUNT_DUPLICATE_NER_OUTPUTS = 0

# for each leaflet
for leaflet in package_leaflets_ner:
    
    current_leaflet_sections = [leaflet.section1, leaflet.section2, 
                                leaflet.section3, leaflet.section4, 
                                leaflet.section5, leaflet.section6]
    
    # for each section in a leaflet
    for section_index, current_section in enumerate(current_leaflet_sections):
        
        # skip None and empty sections
        if current_section.section_content is None or len(current_section.section_content) <= 1:
            continue
        
        # get only the 'Text' of entities
        current_section_entities = ''
        for entity in current_section.entity_recognition:
            current_section_entities += entity['Text'] + ' '
        
        # save to a dict unique NER outputs
        if current_section_entities not in NER_outputs:
            NER_outputs[current_section_entities] = (1, [section_index+1], [leaflet.product_name])
        
        # if current_section_entities is in NER_outputs - then it is a duplicate
        else:
            prev_num = NER_outputs[current_section_entities][0]
            
            # add the section index of the duplicate NER output to a list
            prev_indices = NER_outputs[current_section_entities][1]
            prev_indices.append(section_index+1)
            
            # add the product_name of the duplicate NER output to a list
            prev_names = NER_outputs[current_section_entities][2]
            prev_names.append(leaflet.product_name)
            
            NER_outputs[current_section_entities] = (prev_num+1, prev_indices, prev_names) 
            COUNT_DUPLICATE_NER_OUTPUTS += 1

In [150]:
print("The total number of duplicate NER Outputs: ", COUNT_DUPLICATE_NER_OUTPUTS)
print("Total num. of NER outputs: ", len(package_leaflets_ner)*6 - COUNT_NONE_section_entities)
print("The totak number of duplicate section contents discovered during NER:   153 (including empty sections)")

The total number of duplicate NER Outputs:  1224
Total num. of NER outputs:  7930
The totak number of duplicate section contents discovered during NER:   153 (including empty sections)


In [151]:
# sort the NER outputs by the number of occurences (descending order)
NER_outputs_sorted = dict(sorted(NER_outputs.items(), key=lambda item: item[1], reverse=True))

In [152]:
# save deuplicate NER_OUTPUTS that appear more than once in NER_outputs
DUPLICATE_NER_OUTPUTS = []

# total number of occurences of duplicate NER outputs
TOTAL_NUMBER_DUPLICATE_NER_OUTPUTS = 0

# for each NER output in dictionary 
for entities in NER_outputs_sorted:
    
    # in case NER output appears more than 1 - it is a duplicate
    if NER_outputs_sorted[entities][0] != 1:
        print(entities, " -------------------> ", NER_outputs_sorted[entities][0], ", section index-", statistics.mode(NER_outputs_sorted[entities][1]), ", products-", NER_outputs_sorted[entities][2])
        print()
        
        # count current NER output as the one having duplicates 
        DUPLICATE_NER_OUTPUTS.append(entities)
        
        # count how many times each duplicate NER output appears 
        # -1 since the 1st occurence of a NER output is considered to be unique
        TOTAL_NUMBER_DUPLICATE_NER_OUTPUTS += NER_outputs_sorted[entities][0] - 1

  ------------------->  952 , section index- 5 , products- ['Trydonis', 'Libtayo', 'Actelsar HCT', 'Sutent', 'Riprazo HCT', 'Mircera', 'Pelzont', 'Privigen', 'Mektovi', 'Savene', 'Zalasta', 'Nimvastid', 'Apealea', 'Optruma', 'Darunavir Krka', 'Ribavirin Mylan', 'Entresto', 'Blitzima', 'Xagrid', 'Olumiant', 'Lamivudine Teva Pharma B.V.', 'Deferasirox Accord', 'Telzir', 'Orkambi', 'Ceprotin', 'Qtern', 'Tevagrastim', 'Iressa', 'Dasselta', 'Myalepta', 'Duloxetine Lilly', 'Fetcroja', 'Movymia', 'Bemfola', 'Genvoya', 'Raxone', 'Xenleta', 'Erleada', 'Obizur', 'Ontruzant', 'Vyxeos liposomal', 'Emtriva', 'Rekovelle', 'Adcirca', 'Viread', 'Zytiga', 'Prolia', 'Scenesse', 'Mirapexin', 'Ryzodeg', 'Efficib', 'Trevicta', 'Trizivir', 'Raloxifene Teva', 'Jinarc', 'ImmunoGam', 'Actos', 'Voriconazole Accord', 'Lacosamide UCB', 'Xeplion', 'Pritor', 'Adrovance', 'Rebif', 'Tookad', 'Rasitrio', 'Xtandi', 'Duaklir Genuair', 'Thymanax', 'Senshio', 'Myclausen', 'Nilemdo', 'Nilemdo', 'Forsteo', 'Cabometyx', 'Ide

In [153]:
# make sure number of duplicate NER outputs is same as calculated before
TOTAL_NUMBER_DUPLICATE_NER_OUTPUTS == COUNT_DUPLICATE_NER_OUTPUTS

True

In [154]:
print("Number of NER outputs that appear more than once: ", len(DUPLICATE_NER_OUTPUTS))

Number of NER outputs that appear more than once:  156


In [155]:
DUPLICATE_NER_OUTPUT_SECTION1 = 0
DUPLICATE_NER_OUTPUT_SECTION2 = 0
DUPLICATE_NER_OUTPUT_SECTION3 = 0
DUPLICATE_NER_OUTPUT_SECTION4 = 0
DUPLICATE_NER_OUTPUT_SECTION5 = 0
DUPLICATE_NER_OUTPUT_SECTION6 = 0

# for each NER output in dictionary 
for entities in NER_outputs_sorted:
    
    # in case NER output appears more than 1 - it is a duplicate
    if NER_outputs_sorted[entities][0] != 1:
        
        
        num_occurences = NER_outputs_sorted[entities][0]
        duplicate_NER_output_indices = NER_outputs_sorted[entities][1]
        
        assert len(duplicate_NER_output_indices) == num_occurences
        
        # first index is considered to be unique NER output
        for index in duplicate_NER_output_indices[1:]:
            if index == 1: DUPLICATE_NER_OUTPUT_SECTION1 += 1
            elif index == 2: DUPLICATE_NER_OUTPUT_SECTION2 += 1
            elif index == 3: DUPLICATE_NER_OUTPUT_SECTION3 += 1
            elif index == 4: DUPLICATE_NER_OUTPUT_SECTION4 += 1
            elif index == 5: DUPLICATE_NER_OUTPUT_SECTION5 += 1
            elif index == 6: DUPLICATE_NER_OUTPUT_SECTION6 += 1

print("Occurences of duplicate NER output in Section1", DUPLICATE_NER_OUTPUT_SECTION1)
print("Occurences of duplicate NER output in Section2", DUPLICATE_NER_OUTPUT_SECTION2)
print("Occurences of duplicate NER output in Section3", DUPLICATE_NER_OUTPUT_SECTION3)
print("Occurences of duplicate NER output in Section4", DUPLICATE_NER_OUTPUT_SECTION4)
print("Occurences of duplicate NER output in Section5", DUPLICATE_NER_OUTPUT_SECTION5)
print("Occurences of duplicate NER output in Section6", DUPLICATE_NER_OUTPUT_SECTION6)

Occurences of duplicate NER output in Section1 54
Occurences of duplicate NER output in Section2 6
Occurences of duplicate NER output in Section3 25
Occurences of duplicate NER output in Section4 46
Occurences of duplicate NER output in Section5 1061
Occurences of duplicate NER output in Section6 32


### For duplicate NER output, check whether the corresponding section contents are duplicates too

In [156]:
### for fast look-up - save info in a dictionary

# key = product_name (good thing that product names are unique, so use as a key)
# value = another dict (key - section_number, value = section_content)

product_sections = dict()

# for each leaflet
for leaflet in package_leaflets_ner:
    
    current_leaflet_sections = [leaflet.section1.section_content, leaflet.section2.section_content, 
                                leaflet.section3.section_content, leaflet.section4.section_content, 
                                leaflet.section5.section_content, leaflet.section6.section_content]
    
    product_sections[leaflet.product_name] = dict()
    
    # for each section in a leaflet
    for section_index, section_content in enumerate(current_leaflet_sections):
        
        if (section_index+1) == 1: product_sections[leaflet.product_name]['1'] = section_content
        elif (section_index+1) == 2: product_sections[leaflet.product_name]['2'] = section_content
        elif (section_index+1) == 3: product_sections[leaflet.product_name]['3'] = section_content
        elif (section_index+1) == 4: product_sections[leaflet.product_name]['4'] = section_content
        elif (section_index+1) == 5: product_sections[leaflet.product_name]['5'] = section_content
        elif (section_index+1) == 6: product_sections[leaflet.product_name]['6'] = section_content

In [166]:
NUMBER_DUPLICATE_SECTIONS = 0

duplicate_sections = dict()

# for each NER output in dictionary 
for entities in NER_outputs_sorted:
    
    # in case NER output appears more than 1 - it is a duplicate
    if NER_outputs_sorted[entities][0] != 1:
        
        # the 1st occurence of (duplicate) NER_output is considered to be unique 
        num_occurences = NER_outputs_sorted[entities][0] - 1
        
        duplicate_NER_output_sections_num = NER_outputs_sorted[entities][1]
        duplicate_NER_output_sections_num = duplicate_NER_output_sections_num[1:]
        
        duplicate_NER_output_product_names = NER_outputs_sorted[entities][2]
        duplicate_NER_output_product_names = duplicate_NER_output_product_names[1:]
        
        assert len(duplicate_NER_output_sections_num) == num_occurences
        assert len(duplicate_NER_output_product_names) == num_occurences
        
        for index_current in range(len(duplicate_NER_output_product_names)-1):
            
            current_prod_name = duplicate_NER_output_product_names[index_current]
            current_section_num = str(duplicate_NER_output_sections_num[index_current])
            
            for index_next in range(index_current+1, len(duplicate_NER_output_product_names)):
                
                next_prod_name = duplicate_NER_output_product_names[index_next]
                next_section_num = str(duplicate_NER_output_sections_num[index_next])
                
                # check whether sections are duplicates if corresponding NER output is same
                if product_sections[current_prod_name][current_section_num] == product_sections[next_prod_name][next_section_num]:
                    
                    NUMBER_DUPLICATE_SECTIONS += 1
                    
                    duplicate_section = product_sections[current_prod_name][current_section_num]
                    
                    if duplicate_section in duplicate_sections:
                        prev_names = duplicate_sections[duplicate_section]
                        if current_prod_name not in prev_names: prev_names.append(current_prod_name)
                        if next_prod_name not in prev_names: prev_names.append(next_prod_name)
                        duplicate_sections[duplicate_section] = prev_names
                    else:
                        duplicate_sections[duplicate_section] = [current_prod_name, next_prod_name]
            
            

In [172]:
NUMBER_DUPLICATE_SECTIONS

200

In [173]:
len(duplicate_sections)

84

In [176]:
duplicate_sections

{'keep this medicine out of the sight and reach of children. do not use this medicine after the expiry date which is stated on the carton and the blister after exp. do not store above 30. store in the original packaging in order to protect from light and moisture. do not throw away any medicines via wastewater or household waste. ask your pharmacist how to throw away medicines you no longer use. these measures will help protect the environment.': ['Pelzont',
  'Trevaclyn',
  'Tredaptive'],
 'keep this medicine out of the sight and reach of children. do not use this medicine after the expiry date which is stated on the blister and the carton after exp. the expiry date refers to the last day of that month. this medicine does not require any special storage conditions. do not throw away any medicines via wastewater or household waste. ask your pharmacist how to throw away medicines you no longer use. these measures will help protect the environment.': ['Mektovi',
  'Zonisamide Mylan',
  '