In [1]:
# Set up the AWS Comprehend Medical client (us-east-1) that performs the NER calls
import boto3
client = boto3.client('comprehendmedical', region_name='us-east-1')

In [2]:
import pickle

from EMA_documents import SectionLeaflet, Leaflet
from test_postprocessing_dataset import *

In [3]:
# Read the processed dataset back from disk; the pickle holds an array of Leaflet objects.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open("LEAFLET_DATASET_PROCESSED.pickle", mode="rb") as dataset_file:
    package_leaflets = pickle.load(dataset_file)

In [4]:
# Number of unique processed leaflets loaded from the pickle
len(package_leaflets)

1336

In [5]:
# Sanity-check the loaded dataset before running NER.
# Expected result: no duplicate leaflets (by ID, URL and product name) and
# just 144 sections flagged as duplicates (mainly Section 5).
# NOTE(review): both helpers presumably come from the star-import of
# test_postprocessing_dataset and only print their findings - confirm.
test_remove_duplicates(package_leaflets)

test_duplicate_sections(package_leaflets)

Number of unique IDs:  1336
Number of duplicate IDs:  0
Number of unique URLs:  1336
Number of duplicate URLs:  0
Number of unique product names:  1336
Number of duplicate product names:  0
Number of duplicate sections marked with a flag:  144


In [6]:
# Perform NER of each section in a leaflet with AWS Comprehend - !!! infer_icd10_cm
#
# Every non-None section text is sent to infer_icd10_cm at most once: identical
# section texts reuse the cached result instead of triggering another API call.
# The detected entities are stored on the section object they belong to.

# track the number of duplicate sections
COUNT_DUPLICATE_SECTIONS = 0

# track the number of None sections
COUNT_NONE_SECTIONS = 0

# save (key, value) as unique_section --- NER output
unique_sections = dict()

# record (leaflet id, section number, error message) for every failed API call,
# so that failures can be told apart from sections that were too short to send
FAILED_SECTIONS = []


for leaflet_index, leaflet in enumerate(package_leaflets):

    # Progress Bar: every leaflet is logged; every 100th leaflet additionally
    # prints its 1-based position in the dataset
    print("Processing: ", leaflet.product_name, leaflet.id)
    if leaflet_index % 100 == 0:
        print(" =============== Number of Document Processed: {}".format(leaflet_index+1))

    # Keep the section objects (not only their text) so the result can be written
    # back without a per-section if/elif chain.
    # NOTE(review): assumes leaflet.sectionN yields the same object on every access - confirm.
    leaflet_sections = [leaflet.section1, leaflet.section2, leaflet.section3,
                        leaflet.section4, leaflet.section5, leaflet.section6]

    for section_number, section in enumerate(leaflet_sections, start=1):
        current_section = section.section_content

        # skip None Sections (their entity_recognition attribute is left untouched)
        if current_section is None:
            COUNT_NONE_SECTIONS += 1
            continue

        # check whether current section is a duplicate section
        if current_section in unique_sections:
            # get NER output from the dict
            entities_current_section = unique_sections[current_section]
            COUNT_DUPLICATE_SECTIONS += 1
            print("DUPLICATE SECTION ", section_number, ": ", leaflet.product_name, leaflet.id)

        # if not duplicate section - Perform NER
        else:
            entities_current_section = None

            # texts of length 0 or 1 are never sent to the API and are cached as None
            if len(current_section) > 1:
                # NOTE(review): the service limits the input text size; oversized
                # sections presumably end up in the except branch - confirm against API docs
                try:
                    AWS_response = client.infer_icd10_cm(Text=current_section)
                    entities_current_section = AWS_response['Entities']
                except Exception as error:
                    print('Failed infer_icd10_cm NER for Section', section_number, ' ----> ', error)
                    FAILED_SECTIONS.append((leaflet.id, section_number, str(error)))

            # store (section, entities_section) in a dict; a failed call is cached
            # as None as well, so an identical text is not retried within this run
            unique_sections[current_section] = entities_current_section

        # save the detected entities in SectionLeaflet
        section.entity_recognition = entities_current_section

Processing:  Trydonis 0017747DB4322B3733E85E2DA594CF8C
Processing:  Prevenar 13 0042EDF3623C5FD8C8BAB7CA9E3AE597
Processing:  Libtayo 0079B64B42A0902F941E061FD21F13EC
Processing:  Clopidogrel Apotex 00D784D36A6F97276B4CACA1A67234D2
Processing:  Actelsar HCT 00DEDA6DA22C832306FF91D593693139
Processing:  Taxespira 00E9FAFB19B9FBCFC6D2ED9C180BBD7F
Processing:  Sutent 00F231630A4DC00C558E6070A5A68EAD
Processing:  Riprazo HCT 0162EFDB3D97EC9E4E91C82034D9BCD5
Processing:  Mircera 0172AD04C2D32E690800B2AFDADC33EB
Processing:  Pelzont 01FE993F42EAE4277800243A3ECA6170
Processing:  Privigen 020668EE5C7E5397A9B22271C390C3A3
Processing:  Mektovi 02350007F0ACAAEFE419DCA91D6E98A0
Processing:  Savene 03575C82E36D8D17989640424D78AD0D
Processing:  Zalasta 0371325F6E7DD4AF1F9746B189C03A4D
Processing:  Vpriv 0375DDBA49FBD7F48FC61C26A23052BF
Processing:  Nimvastid 03D477D48CC1C98EF76E12792056B93F
Processing:  Apealea 04034542AB28557EFC1CC133B66770F7
Processing:  Optruma 041AEE1898650515BAD8AA5DBB0A8ACE
Pr

Processing:  Zirabev 1987ED75EF0886DA6ED6271439BFA07A
Processing:  Poteligeo 19C4FF9ABC123865A5D42C90DF3D6312
Processing:  Tegsedi 19D1556A958E1BF6BEC9BD88A12A46CF
Processing:  Ibandronic Acid Sandoz 19EBD70955B55C64784AE8037932A5ED
Processing:  Nivestim 1A05B4352E939DCDAC759FC6621CE1D6
Processing:  Mysildecard 1A065C11AE75EF8848AE934ED08C84FE
Processing:  Ovitrelle 1A0DF936BD67C7F622D86FE80EE25A26
Processing:  Invega 1A1475B745CC6FD324EC411AC7EF31B2
Processing:  Seebri Breezhaler 1A2F52FE35A77AEA51F5D608DB81F50A
Processing:  Mulpleo 1A4AB3B47BE9CCF75421A82BCB5D8354
Processing:  Spherox 1A65B3600D3032B15FFFD3227D039273
Processing:  Olazax 1A88BA776DF83C0CEAAC672B443206DC
Processing:  Xyrem 1A8CFDB034D73903CD7A466779FC69FD
Processing:  DaTSCAN 1AA23070BA3569FAC45E8430B35AA23A
Processing:  Dectova 1AA577B1990CF187749BA38065EB47F4
Processing:  Docetaxel Teva Pharma 1AD6C4755C173E648C84E1BFAEE2E3FC
Processing:  Benlysta 1AF962ECC915FA5B0638584FE0AF119F
Processing:  Juluca 1B1468C37450ACE77

DUPLICATE SECTION  5 :  Velmetia 2E7FD74EDF85E8E1C4E052AD7E64FDF0
Processing:  Zinplava 2E910610B8599230893F6D56BD5CDA16
Processing:  Talzenna 2F62E98BF4FE4BD01565F82B68BFA91F
Processing:  Afinitor 2F69C58B53BD4BB60C758EB36620BD2B
Processing:  Esmya 2F7A5D679B3A956AB6EBB5605F9E8AE4
Processing:  Xydalba 2F7FF4E4ABE3919FC4BB4EA09D21FDF7
Processing:  Elaprase 2F819B4E3127C1F56B521862BC986E16
Processing:  Champix 2F950C37E07AE07FB5B7743CDA2CD95A
Processing:  Mysimba 2FA716FC0E58A054860F6F33FCFCCCA0
Processing:  Gazyvaro 2FACB377F58985B93B438528E7B24319
Processing:  Orfadin 2FEA555C18511BF8457BD6CBF77D99BE
Processing:  Lonsurf 3035AFE1837E24D934D1A4E22F5F233F
Processing:  Valtropin 309DF43284F585D9C503BCD75AF04BFD
Processing:  Clopidogrel BGR 30A9F9160D90BDFCF0D8E385C143C7AA
Processing:  Fosavance 313E8311624EF2BA7B623303B00B9C54
DUPLICATE SECTION  5 :  Fosavance 313E8311624EF2BA7B623303B00B9C54
Processing:  Hyrimoz 31BB301310FF990F78C7276F538C2461
Processing:  Spedra 31DB482E3F1FF49160B308

Processing:  Idacio 3F8D9383FDC6DAF516FBCE29EA6D0D00
Processing:  Esperoct 3FAD14C423D0E50B5E41A3F4456212B3
Processing:  Clopidogrel Hexal 3FB3BFD36F9BA18714A202C2AE6D680C
Processing:  Kalydeco 3FE56AC9B21652526B2038E90EEF2831
Processing:  Esbriet 404E156018E8115E1532CFD1F3FE167B
Processing:  Livensa 40B1259144EC30954A34397C78B82CB3
Processing:  Brilique 4102ED063BB0171B37B6BDF6463572DA
Processing:  Sifrol 41946E2113ED78355098DE8FE0586FB7
DUPLICATE SECTION  5 :  Sifrol 41946E2113ED78355098DE8FE0586FB7
Processing:  Capecitabine Accord 41AA4D40E0EB7A31DEE39BE73E596FD0
Processing:  Laventair Ellipta 41E281CD11989F344B26E53686B69166
DUPLICATE SECTION  5 :  Laventair Ellipta 41E281CD11989F344B26E53686B69166
Processing:  Onsenal 4263665B0FB8C7F2A0039D03A3678097
Processing:  Sunosi 4263B8E4102AC182B3903F91237300FB
Processing:  Xiliarx 42E83710738D761A92621CCCAAA13738
Processing:  Trecondi 42EED1A944ECE439CA570EC4C8EC18CB
Processing:  Vizimpro 4330E531ADB29D0C687F75530B088D98
Processing:  Galv

Processing:  Ovaleap 53FE474929C7C6B023B752F217E68E8E
Processing:  Libmeldy 5442AE27DD42AD3FA8AC4AA827993B55
Processing:  Azacitidine Accord 54479F3AD2D59BBE9A92E53CB7D5482D
Processing:  Zeffix 5464FDE98E3AA0628E98D9388162468A
Processing:  Posaconazole AHCL 547DDE6AEE8AF9B9BFBEC76B229D40CC
Processing:  Xultophy 5486A41640FCCCD7E130ED39F0918CAD
Processing:  Sepioglin 5536A462B0F3C4097B1142A265CC7951
DUPLICATE SECTION  2 :  Sepioglin 5536A462B0F3C4097B1142A265CC7951
Processing:  Copalia 555A44812C0D4677A70315F772DD1BB7
Processing:  Rybelsus 557B74950859EC9440169429CD4AD8B8
Processing:  Pramipexole Teva 559F0C84299CE58C871877FCD9DEC37E
Processing:  Aybintio 55B10351E856BBE3BA9164FD7B130AEB
Processing:  Trumenba 5602C798EBE9A8F0220C81D46409037A
Processing:  Briviact (in Italy: Nubriveo) 561498E56ADC6C992CE62E94EBD54D7F
Processing:  Aflunov 5637956D490EE825295EE10543BA04B9
Processing:  Tyverb 5638B6384F757641EB662029C83954C2
Processing:  Tovanor Breezhaler 564078C8E4D29FD69DAB261453EDA1D9
D

DUPLICATE SECTION  5 :  Clopidogrel BMS 6C6339081BE02A95661F401C6538D4AB
Processing:  Invirase 6C67378ABDA0766F390118BC88E4737E
Processing:  Ebixa 6C8F32B8E78B70F24DB07600D64BDEB2
Processing:  Karvezide 6C99CEF9396A64FFF712CB40A099E8B9
DUPLICATE SECTION  5 :  Karvezide 6C99CEF9396A64FFF712CB40A099E8B9
Processing:  Constella 6CA79A0F97A722C055536AD32F4A73FD
Processing:  Rezolsta 6CB969F2499F633EEBB8FEE9A8A6C325
Processing:  Jyseleca 6CCD0F993CBABD135040671BEE7D3329
Processing:  Verkazia 6CCE108E2C0AD06F4D359B7F2DE170FF
Processing:  Adempas 6CDC1891D4F0008195113AC6036ACC58
Processing:  Grasustek 6CEDC70614C3A802A5692D9CE10694A3
Processing:  Busulfan Fresenius Kabi 6D028312B125DDC5871D0742552C6483
Processing:  Pramipexole Accord 6D28C8B51306F531C35ED7212C1BB5D5
Processing:  Fluenz Tetra 6D36AC0C920F41A953221C5F2C3C4AE2
Processing:  Quadramet 6D4D7189BDF5A3A9AF34933907789CA8
Processing:  Optison 6D5CA133482521A9EBB590D3E060D594
Processing:  Adynovi 6DAB5FAC789638C914E32A5DB6FC1B23
Processi

Processing:  Zepatier 815539D23B652196A05DDEDFA8A8620F
Processing:  Striascan 815A028C3DAE859DE353DBDB14391005
Processing:  DuoResp Spiromax 81C29A8A799C4553E1E8E85DDEF54753
DUPLICATE SECTION  5 :  DuoResp Spiromax 81C29A8A799C4553E1E8E85DDEF54753
Processing:  Efavirenz Teva 81C32B1CADBCB96578F91BE0A66AC5FD
Processing:  Zonisamide Mylan 81E8FC1CD4C46B21D0B4473881CFF91A
DUPLICATE SECTION  5 :  Zonisamide Mylan 81E8FC1CD4C46B21D0B4473881CFF91A
Processing:  Erbitux 821336EFD47D07C368367192702AB100
Processing:  Protelos 82174A4F38AF3AA554C9CEBE0E2407DA
DUPLICATE SECTION  5 :  Protelos 82174A4F38AF3AA554C9CEBE0E2407DA
Processing:  Quofenix 8240EDC8345CD1E98EE1F5A7445AF604
Processing:  Doribax 82483A236F6C923DB3DD43A04A8E431A
Processing:  Bevespi Aerosphere 82489FE8F863ABA2E94C5110A813A2E1
Processing:  Pemetrexed Fresenius Kabi 825DC5753EB7B3CD3D1D02F129D59180
Processing:  Zomarist 82F84CAAA7D4A2B14CFF02AB15687D3F
DUPLICATE SECTION  5 :  Zomarist 82F84CAAA7D4A2B14CFF02AB15687D3F
Processing: 

Processing:  Krystexxa 99EC2690BD3D18AF7E4AFBE438374EA0
Processing:  Diacomit 99FB2A4388CC834090CE295BD4AC3F1B
Processing:  Alimta 9A06D3298D996FE2D21BEEFB4041041A
Processing:  Holoclar 9A1C8A66C3C66AA2953CE04CE572C823
Processing:  Aripiprazole Zentiva 9A41613B51C2B72D58D71772D83A306A
DUPLICATE SECTION  5 :  Aripiprazole Zentiva 9A41613B51C2B72D58D71772D83A306A
Processing:  Fasturtec 9A689AE7A7A56E0E3DE36E5A2B02ED5C
Processing:  Docetaxel Accord 9AC2B882B2BFFB022220AC9838C866B4
Processing:  Entyvio 9ADA2D5373392025F0237AB6107BCDA9
Processing:  Pegasys 9AF3DB1C5F8EDB28A66B8F43BF659B9F
Processing:  Dafiro 9B009A6F98BC45924F12DE327BA06FFA
Processing:  Komboglyze 9B60A667B1D51635405C3556DA014F8B
Processing:  Levetiracetam Hospira 9B99E7C93A05B48B0536353EC5DF9F7F
Processing:  Emadine 9C0E9E79355AADF72F8268352602134F
Processing:  Exforge 9C124061ED53F937124DAA61E7E2D515
Processing:  Lyumjev 9C2173087A8DEF2A06F9ACFA2600CAB1
Processing:  Plenadren 9C309C7076A1128A293A2E052BAEBD83
Processing:  

Processing:  Mozobil B20D7E3BDB127C95CE4F32DA7ACC8D91
Processing:  Insulin Human Winthrop B2341ED0915D6A2FD7055BED8ECF64AD
Processing:  Pheburane B2588623176B13D953CE73577214D27D
Processing:  Tresiba B25A04723B1AA7658FC0C94C528A6083
Processing:  Zaltrap B27F39D25E950705B72377442FC3E930
Processing:  Nevirapine Teva B286299130E6E7015A30B37703246CD9
Processing:  Posaconazole SP B2CFDBB072AEE5A3FABA2745364EF16E
Processing:  Edurant B3FE8B38F9A8C181D2AC33F10B330A6D
Processing:  Nivolumab BMS B4116106866ABDA0DA90FA81F61DE066
DUPLICATE SECTION  5 :  Nivolumab BMS B4116106866ABDA0DA90FA81F61DE066
Processing:  Fabrazyme B4CDA41D69EE98A190FDBE2521037DAE
Processing:  Zoledronic acid Teva Pharma B4FF9CF0D9E8468E924D78C32FE170B7
Processing:  Hepsera B5237946CAA5B2D2074AD2390DBDE939
Processing:  Irbesartan BMS B5329FC7F67D2A3157F38FC5C5AE3936
Processing:  Ameluz B5E3346F996EA24129464AA09B6D5BDB
Processing:  Intrarosa B5F4B1A6811BA7FB9024E509326CEA8B
Processing:  Biopoin B645E35B0609CFF235954EC97E275

Processing:  Prandin CE88529822704B4A159BC9510E3B9868
DUPLICATE SECTION  4 :  Prandin CE88529822704B4A159BC9510E3B9868
DUPLICATE SECTION  5 :  Prandin CE88529822704B4A159BC9510E3B9868
Processing:  Besremi CE8B271B4933928FBE6D41ED896A52C4
Processing:  Avandia CE9D3947AC5CAA8C824AB64D71AE1142
Processing:  Dovprela CF4FFD65468BE4C7690A8FA976C40541
Processing:  Opatanol CF75F2BBBE1786993637082F6F807AB0
Processing:  Caprelsa CF7B4EF8076ECC17CBCB2A7ABFEFC5B8
Processing:  Zoledronic acid Actavis D04B147E91C8B8F906DDA894B246AC2C
Processing:  Ribavirin Teva Pharma B.V. D0CC9635D229DB1D224636A7078CA5D9
Processing:  Halimatoz D0D5487AA9CBDCA8E607244896479A32
Processing:  LysaKare D1085E1D6A88A39DC5675E1CF63392F7
Processing:  Febuxostat Mylan D11034828429F4E1BD4B04475A0B8CF1
Processing:  Farydak D151F97A5AD8B5F9FF3AF280F2883C51
Processing:  Besponsa D16BFF0DD5A312927F30E7142D797E89
Processing:  Irbesartan Hydrochlorothiazide Zentiva D20311E4EDCC24119FDFAF37DA30CCEE
DUPLICATE SECTION  5 :  Irbesart

DUPLICATE SECTION  5 :  Karvea ECC8F8F2976AC234C5261385DD9AEC5E
Processing:  Eporatio ED5A91044D2EC28F2AC087DFAA2AD668
Processing:  Revinty Ellipta EDAB7942C112A81D7F329B6C99B0865B
DUPLICATE SECTION  5 :  Revinty Ellipta EDAB7942C112A81D7F329B6C99B0865B
Processing:  Topotecan Eagle EDB5291493C383CFCD0A59D0AA5B57A4
Processing:  Xermelo EDCEDBE56018CE371FD5D66B4DDE9CC8
Processing:  Kisqali EDE61F8A64F525E5A53C77CC05ECC873
Processing:  Bavencio EDF01D71F6AA87ACD46525EEAC968AC7
Processing:  Ventavis EE524A8588F4979B3558C029A690EC6C
Processing:  Clopidogrel Acino Pharma EE9A4CC8C8704A9FBCD64E32FA126B91
Processing:  Lenalidomide Mylan EF1CB57074D7204F0606CC56ACFB1976
Processing:  Viracept EFD58274DC6D3099E0917AAB8FFD0582
Processing:  Entacapone Teva EFFA02A0AE71C7D9E1410C070300504E
Processing:  Zavesca F04B573F1A1E5EC6EE19994EFF1F12EF
Processing:  Pretomanid FGK F06CB0C715721E3C777AF60468921DA1
DUPLICATE SECTION  5 :  Pretomanid FGK F06CB0C715721E3C777AF60468921DA1
Processing:  Protopy F06F8

In [7]:
# Persist the leaflets, including the entity_recognition set during NER, to disk
with open("LEAFLET_DATASET_PROCESSED_InferICD10CM.pickle", mode="wb") as results_file:
    pickle.dump(package_leaflets, results_file)

In [8]:
# Write a second copy of the same leaflets as a backup file
with open("LEAFLET_DATASET_PROCESSED_InferICD10CM_BACKUP.pickle", mode="wb") as backup_file:
    pickle.dump(package_leaflets, backup_file)

In [9]:
# Report the counters collected by the NER loop (one line per counter)
for label, count in (
    ("Number of None Sections discovered during NER: ", COUNT_NONE_SECTIONS),
    ("Number of Duplicate Sections discovered during NER: ", COUNT_DUPLICATE_SECTIONS),
):
    print(label, count)

Number of None Sections discovered during NER:  76
Number of Duplicate Sections discovered during NER:  153


In [10]:
# Number of distinct non-None section texts seen during NER (one dict entry per unique text)
len(unique_sections)

7787

Numbers are correct!  
- Number of None Sections discovered during NER: 76 (for the 1336 leaflets with a unique product_name). Before leaflets with a duplicate product_name were removed, the number of None Sections was 86.  
- Number of unique sections = 7787 --- same number as in preprocessing step  
- Number of Duplicate Sections discovered during NER: 153 (including empty sections) --- 144 duplicate sections as in the processing step (excluding empty section contents) + 9 (empty section contents)    