In [8]:
import os
import zipfile
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import pprint
import pandas as pd
import uuid
import json
import os
import glob
import re
import sys
from bs4 import NavigableString, BeautifulSoup
from collections import defaultdict
import random
import string

from utils.config import config
from utils.logger.logger import loggerCreator

# ePI Modules
from parse.rulebook.rulebook import StyleRulesDictionary

from parse.extractor.parser import parserExtractor
from match.matchDocument.matchDocument import MatchDocument
from documentAnnotation.documentAnnotation import DocumentAnnotation
from htmlDocTypePartitioner.partition import DocTypePartitioner
from extractContentBetweenHeadings.dataBetweenHeadingsExtractor import DataBetweenHeadingsExtractor
from fhirXmlGenerator.fhirXmlGenerator import FhirXmlGenerator
from fhirService.fhirService import FhirService
from utils.logger.matchLogger import MatchLogger
from languageInfo.documentTypeNames.documentTypeNames import DocumentTypeNames
from wordToHtmlConvertor.wordToHtmlConvertor import WordToHtmlConvertor


class FolderNotFoundError(Exception):
    pass


def getRandomString(N):
    str_ = ''.join(random.choice(string.ascii_uppercase + string.digits
                                 + string.ascii_lowercase) for _ in range(N))
    return str_


def convertHtmlToJson(controlBasePath, basePath, domain, procedureType, languageCode, htmlDocName, fileNameQrd, fileNameLog):

    module_path = os.path.join(basePath)

    if "/" in basePath:
        pathSep = "/"
    else:
        pathSep = "\\"
    
    # Generate output folder path
    output_json_path = os.path.join(basePath, 'outputJSON')

    """
        Check if input folder exists, else throw exception
    """
    if(os.path.exists(module_path)):
        filenames = glob.glob(os.path.join(module_path, htmlDocName))

        # Create language specific folder in outputJSON folder if it doesn't exist
        if(not os.path.exists(output_json_path)):
            os.mkdir(output_json_path)
        logger = MatchLogger(f'Parser_{getRandomString(1)}', htmlDocName,
                             domain, procedureType, languageCode, "HTML", fileNameLog)

        styleLogger = MatchLogger(
            f'Style Dictionary_{getRandomString(1)}', htmlDocName, domain, procedureType, languageCode, "HTML", fileNameLog)

        styleRulesObj = StyleRulesDictionary(logger=styleLogger,
                                             controlBasePath=controlBasePath,
                                             language=languageCode,
                                             fileName=fileNameQrd,
                                             domain=domain,
                                             procedureType=procedureType
                                             )

        parserObj = parserExtractor(config, logger, styleRulesObj.styleRuleDict,
                                    styleRulesObj.styleFeatureKeyList,
                                    styleRulesObj.qrd_section_headings)

        for input_filename in filenames:
          # if(input_filename.find('Kalydeco II-86-PI-clean')!=-1):
            output_filename = os.path.join(output_json_path, htmlDocName)
            style_filepath =  output_filename.replace('.html','.txt')
            style_filepath =  style_filepath.replace('.txtl','.txt')
            style_filepath =  style_filepath.replace('.htm','.txt')
            print("-------------",style_filepath,"-----------------")

            output_filename = output_filename.replace('.html', '.json')
            output_filename = output_filename.replace('.htm', '.json')
            print(input_filename, output_filename)
            parserObj.createPIJsonFromHTML(input_filepath=input_filename,
                                           output_filepath=output_filename,
                                           style_filepath = style_filepath,
                                           img_base64_dict=parserObj.convertImgToBase64(input_filename)
                                           )
            
        return output_filename.split(pathSep)[-1], style_filepath
    else:
        try:    
            raise FolderNotFoundError(module_path + " not found")
        except:  
            logger.logFlowCheckpoint("Folder For Language Code Not Found In Input File")
            logger.logException("Folder For Language Code Not Found In Input File")
        raise FolderNotFoundError(module_path + " not found")
        return None


def splitJson(controlBasePath, basePath, domain, procedureType, languageCode, fileNameJson, fileNameQrd, fileNameLog):

    styleLogger = MatchLogger(
        f'Style Dictionary_{getRandomString(1)}', fileNameJson, domain, procedureType, languageCode, "Json", fileNameLog)

    styleRulesObj = StyleRulesDictionary(logger=styleLogger,
                                        controlBasePath=controlBasePath,
                                        language=languageCode,
                                        fileName=fileNameQrd,
                                        domain=domain,
                                        procedureType=procedureType
                                        )
    
    path_json = os.path.join(basePath,'outputJSON', fileNameJson)
    print("PathJson",path_json)
    partitionLogger = MatchLogger(
        f'Partition_{getRandomString(1)}', fileNameJson, domain, procedureType, languageCode, "Json", fileNameLog)

    partitioner = DocTypePartitioner(partitionLogger)

    partitionedJsonPaths = partitioner.partitionHtmls(
        styleRulesObj.qrd_section_headings, path_json)

    return partitionedJsonPaths


def extractAndValidateHeadings(controlBasePath,
                                basePath,
                                domain,
                                procedureType,
                                languageCode,
                                documentNumber,
                                fileNameDoc,
                                fileNameQrd,
                                fileNameMatchRuleBook,
                                fileNameDocumentTypeNames,
                                fileNameLog,
                                stopWordFilterLen=6,
                                isPackageLeaflet=False,
                                medName=None
                                ):

    if documentNumber == 0:
        topHeadingsConsidered = 4
        bottomHeadingsConsidered = 6
    elif documentNumber == 1:
        topHeadingsConsidered = 3
        bottomHeadingsConsidered = 5
    elif documentNumber == 2:
        topHeadingsConsidered = 5
        bottomHeadingsConsidered = 15
    else:
        topHeadingsConsidered = 5
        bottomHeadingsConsidered = 10

    print(f"Starting Heading Extraction For File :- {fileNameDoc}")
    logger = MatchLogger(f"Heading Extraction {fileNameDoc}_{getRandomString(1)}", fileNameDoc, domain, procedureType, languageCode, documentNumber, fileNameLog)
    logger.logFlowCheckpoint("Starting Heading Extraction")

    stopWordlanguage = DocumentTypeNames(
        controlBasePath=controlBasePath,
        fileNameDocumentTypeNames=fileNameDocumentTypeNames,
        languageCode=languageCode,
        domain=domain,
        procedureType=procedureType,
        documentNumber=documentNumber
        ).extractStopWordLanguage()

    matchDocObj = MatchDocument(
        logger,
        controlBasePath,
        basePath,
        domain,
        procedureType,
        languageCode,
        documentNumber,
        fileNameDoc,
        fileNameQrd,
        fileNameMatchRuleBook,
        fileNameDocumentTypeNames,
        topHeadingsConsidered,
        bottomHeadingsConsidered,
        stopWordFilterLen,
        stopWordlanguage,
        isPackageLeaflet,
        medName)
    df, coll = matchDocObj.matchHtmlHeaddingsWithQrd()

    return df, coll


def parseDocument(controlBasePath, basePath ,htmlDocName, fileNameQrd, fileNameMatchRuleBook, fileNameDocumentTypeNames, medName = None):
    
    if "/" in basePath:
        pathSep = "/"        
    else:
        pathSep = "\\"
    
    fileNameLog = os.path.join(basePath,'FinalLog.txt')

    pathComponents = basePath.split(pathSep)
    print(pathComponents, htmlDocName)
    timestamp = pathComponents[-1]
    languageCode =  pathComponents[-2]
    medName = pathComponents[-3]
    procedureType = pathComponents[-4]
    domain = pathComponents[-5]

    print(timestamp, languageCode, medName, procedureType, domain)
        
    flowLogger =  MatchLogger(f"Flow Logger HTML_{getRandomString(1)}", htmlDocName, domain, procedureType, languageCode, "HTML", fileNameLog)

    flowLogger.logFlowCheckpoint("Starting HTML Conversion To Json")
    ###Convert Html to Json
    fileNameJson, stylesFilePath = convertHtmlToJson(controlBasePath, basePath, domain, procedureType, languageCode, htmlDocName, fileNameQrd, fileNameLog)
    
    print("stylePath:-",stylesFilePath)
    flowLogger.logFlowCheckpoint("Completed HTML Conversion To Json")

    flowLogger.logFlowCheckpoint("Starting Json Split")

    ###Split Uber Json to multiple Jsons for each category.
    partitionedJsonPaths = splitJson(controlBasePath, basePath, domain, procedureType, languageCode, fileNameJson, fileNameQrd, fileNameLog)
    
    partitionedJsonPaths = [ path.split(pathSep)[-1] for path in partitionedJsonPaths]
    flowLogger.logFlowCheckpoint(str(partitionedJsonPaths))
    
    flowLogger.logFlowCheckpoint("Completed Json Split")
    
    flowLogger.logFlowCheckpoint("Started Processing Partitioned Jsons")
    
    for index, fileNamePartitioned in enumerate(partitionedJsonPaths):
        
        flowLogger.logFlowCheckpoint(f"\n\n\n\n||||||||||||||||||||||||||||||||{str(index)} ||||| {str(fileNamePartitioned)}||||||||||||||||||||||||||||||||\n\n\n\n")
        
        if index == 3:
            stopWordFilterLen = 100
            isPackageLeaflet = True
        else:
            stopWordFilterLen = 6
            isPackageLeaflet = False
            
        df, coll = extractAndValidateHeadings(controlBasePath,
                                    basePath,
                                    domain,
                                    procedureType,
                                    languageCode,
                                    index,
                                    fileNamePartitioned,
                                    fileNameQrd,
                                    fileNameMatchRuleBook,
                                    fileNameDocumentTypeNames,
                                    fileNameLog,
                                    stopWordFilterLen=stopWordFilterLen,
                                    isPackageLeaflet=isPackageLeaflet,
                                    medName=medName)
        
        
        print(f"Completed Heading Extraction For File")
        flowLogger.logFlowCheckpoint("Completed Heading Extraction For File")
        
        print(f"Starting Document Annotation For File :- {fileNamePartitioned}")        
        flowLogger.logFlowCheckpoint("Starting Document Annotation For File")
        documentAnnotationObj = DocumentAnnotation(fileNamePartitioned,'c20835db4b1b4e108828a8537ff41506','https://spor-sit.azure-api.net/pms/api/v2/',df,coll)
        try:
            pms_oms_annotation_data = documentAnnotationObj.processRegulatedAuthorizationForDoc()
            print(pms_oms_annotation_data)
        except:
            pms_oms_annotation_data = None
            print("Error Found")
            
        print(f"Completed Document Annotation")        
        flowLogger.logFlowCheckpoint("Completed Document Annotation")
        
        print(f"Starting Extracting Content Between Heading For File :- {fileNamePartitioned}")        
        flowLogger.logFlowCheckpoint("Starting Extracting Content Between Heading")
        
        extractContentlogger =  MatchLogger(f'ExtractContentBetween_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)
        extractorObj = DataBetweenHeadingsExtractor(extractContentlogger, basePath, coll)
        dfExtractedHierRR = extractorObj.extractContentBetweenHeadings(fileNamePartitioned)
        
        print(f"Completed Extracting Content Between Heading")        
        flowLogger.logFlowCheckpoint("Completed Extracting Content Between Heading")
        
        xmlLogger =  MatchLogger(f'XmlGeneration_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)
        fhirXmlGeneratorObj = FhirXmlGenerator(xmlLogger, controlBasePath, basePath, pms_oms_annotation_data, stylesFilePath, medName)
        fileNameXml = fileNamePartitioned.replace('.json','.xml')
        generatedXml = fhirXmlGeneratorObj.generateXml(dfExtractedHierRR, fileNameXml)
        
        fhirServiceLogger =  MatchLogger(f'XML Submission Logger_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)

        fhirServiceObj = FhirService(fhirServiceLogger, basePath, generatedXml)
        fhirServiceObj.submitFhirXml()
        print(f"Created XML File For :- {fileNamePartitioned}")      

        #return df,coll,dfExtractedHierRR
    
    flowLogger.logFlowCheckpoint("Completed Processing Partitioned Jsons")


In [None]:
wordToHtmlConvertorObj = WordToHtmlConvertor()
wordToHtmlConvertorObj.convertWordToHTML()

In [10]:
# inputZipFolderPath = "F:\Projects\EMA\Repository\EMA EPI PoC\\function_code\\inputblob"
inputZipFolderPath = os.path.abspath(os.path.join('..'))
inputZipFolderPath = os.path.join(inputZipFolderPath, 'inputblob')
inputZipFileName = "Zynteglo~H~CAP~en~2021-05-19T07-52-54Z.zip"

In [16]:
fileNameQrd = 'qrd_canonical_model.csv'
fileNameMatchRuleBook = 'ruleDict.json'
fileNameDocumentTypeNames = 'documentTypeNames.json'
fsMountName = '/mounted'

info = inputZipFileName.split("~")

try:
    medName = info[0]
    domain = info[1]
    procedureType = info[2]
    languageCode = info[3]
    timestamp = info[4]
    timestamp = timestamp.replace(".zip","")

except Exception:
    raise f"Missing required info in the zip file name {inputZipFileName}"

if "\\" in os.getcwd():
    localEnv = True
    inputZipFolderPath = os.path.join(os.path.abspath(os.path.join('..')),inputZipFolderPath)
    outputFolderPath = os.path.join(os.path.abspath(os.path.join('..')), 'work', f"{domain}", f"{procedureType}", f"{medName}", f"{languageCode}", f"{timestamp}")
    controlFolderPath = os.path.join(os.path.abspath(os.path.join('..')),'control')
else:
    localEnv = False
    inputZipFolderPath = os.path.join(f'{fsMountName}',inputZipFolderPath)
    outputFolderPath = os.path.join(f'{fsMountName}', 'work', f"{domain}", f"{procedureType}", f"{medName}", f"{languageCode}", f"{timestamp}")
    controlFolderPath = os.path.join(f'{fsMountName}','control')


print(inputZipFileName, inputZipFolderPath, outputFolderPath, controlFolderPath)

mode = 0o666

if localEnv is True:
    inputZipFolderPath = inputZipFolderPath.replace("/","\\")
    outputFolderPath = outputFolderPath.replace("/","\\")
    controlFolderPath = controlFolderPath.replace("/","\\")

try:
    os.makedirs(inputZipFolderPath, mode)
    os.makedirs(outputFolderPath, mode)
    os.makedirs(controlFolderPath, mode)

except Exception:
    print("Already Present")
    
with zipfile.ZipFile(f'{inputZipFolderPath}/{inputZipFileName}',"r") as zip_ref:
        zip_ref.extractall(outputFolderPath)
    

_,_,fileNames = next(os.walk(outputFolderPath))
htmlFileName = [fileName for fileName in fileNames if ".htm" in fileName][0]

print(htmlFileName)



Zynteglo~H~CAP~en~2021-05-19T07-52-54Z.zip F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z F:\Projects\EMA\Repository\EMA EPI PoC\function_code\control
Already Present
Zynteglo_clean.htm


In [17]:
a,b,c = parseDocument(controlFolderPath, outputFolderPath, htmlFileName, fileNameQrd, fileNameMatchRuleBook, fileNameDocumentTypeNames, medName)

2021-05-18 23:30:47,797 : Flow Logger HTML_n : Starting HTML Conversion To Json | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:30:47,803 : Style Dictionary_v : Reading style dictionary in file: rule_dictionary_en.json | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:30:47,832 : Style Dictionary_v : Qrd Section Keys Retrieved For Style Dictionary: ANNEX I, ANNEX II, ANNEX III, B. PACKAGE LEAFLET | H | CAP |  en | HTML | Zynteglo_clean.htm


['F:', 'Projects', 'EMA', 'Repository', 'EMA EPI PoC', 'function_code', 'work', 'H', 'CAP', 'Zynteglo', 'en', '2021-05-19T07-52-54Z'] Zynteglo_clean.htm
2021-05-19T07-52-54Z en Zynteglo CAP H
------------- F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.txt -----------------
F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\Zynteglo_clean.htm F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.json


2021-05-18 23:30:48,143 : Parser_D : Style Information Stored In File: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.txt | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:30:49,675 : Parser_D : Writing to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.json | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:30:49,769 : Flow Logger HTML_n : Completed HTML Conversion To Json | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:30:49,769 : Flow Logger HTML_n : Starting Json Split | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:30:49,777 : Style Dictionary_V : Reading style dictionary in file: rule_dictionary_en.json | H | CAP |  en | Json | Zynteglo_clean.json
2021-05-18 23:30:49,804 : Style Dictionary_V : Qrd Section Keys Retrieved For Style Dictionary: ANNEX I, ANNEX II, ANNEX III, B. PACKAGE LEAFLET |

stylePath:- F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.txt
PathJson F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.json


2021-05-18 23:30:49,956 : Flow Logger HTML_n : ['Zynteglo_clean_SmPC.json', 'Zynteglo_clean_ANNEX II.json', 'Zynteglo_clean_ANNEX III.json', 'Zynteglo_clean_ PACKAGE LEAFLET.json'] | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:30:49,956 : Flow Logger HTML_n : Completed Json Split | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:30:49,956 : Flow Logger HTML_n : Started Processing Partitioned Jsons | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:30:49,956 : Flow Logger HTML_n : 



||||||||||||||||||||||||||||||||0 ||||| Zynteglo_clean_SmPC.json||||||||||||||||||||||||||||||||



 | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:30:49,964 : Heading Extraction Zynteglo_clean_SmPC.json_B : Starting Heading Extraction | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json
2021-05-18 23:30:50,084 : Heading Extraction Zynteglo_clean_SmPC.json_B : Started Extracting Heading | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json


Starting Heading Extraction For File :- Zynteglo_clean_SmPC.json
File being processed: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\partitionedJSONs\Zynteglo_clean_SmPC.json
--------------------------------------------
SmPC


2021-05-18 23:30:50,180 : Heading Extraction Zynteglo_clean_SmPC.json_B : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'SUMMARY OF PRODUCT CHARACTERISTICS' | Qrd txt :- 'SUMMARY OF PRODUCT CHARACTERISTICS' | Matched :- 'True'
2021-05-18 23:30:50,188 : Heading Extraction Zynteglo_clean_SmPC.json_B : Validation Passed As This The First Heading | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20001' | prevHeadingCurrId :- '' | prevHeadingFoundId :- ''
2021-05-18 23:30:50,296 : Heading Extraction Zynteglo_clean_SmPC.json_B : Match Passed : checkLowerCase|2.88|(99, 100, 99)|0.919| | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'medicinal product subject additional monitoring. allow quick identification new safety information. healthcare professionals asked report suspected adverse reactions. see section 4.8 report adverse reactions.' | Qrd txt :- 'qthis medicinal product subject additional monitoring. allow quick identification new saf


OriginalCheck

----------------------------------
RemovedByStyle
----------------------------------


 dispersion for infusion. ' | Qrd txt :- '6.6 Special precautions for disposal <and other handling>' | Matched :- 'False'0.62| | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'Zynteglo 1.2‑20 × 106 cells/mL
2021-05-18 23:30:50,568 : Heading Extraction Zynteglo_clean_SmPC.json_B : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- '2. QUALITATIVE AND QUANTITATIVE COMPOSITION' | Qrd txt :- '2. QUALITATIVE AND QUANTITATIVE COMPOSITION' | Matched :- 'True'
2021-05-18 23:30:50,576 : Heading Extraction Zynteglo_clean_SmPC.json_B : Validation Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20004' | prevHeadingCurrId :- '20003' | prevHeadingFoundId :- '20003'
2021-05-18 23:30:50,594 : Heading Extraction Zynteglo_clean_SmPC.json_B : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- '2.1 General description' | Qrd txt :- '2.1 General description' | Matched :- 'True'
2021-05-18 23:30:50,602 : Heading Extraction Zynteg

 for full details on the administration process).' | Qrd txt :- '6.6 Special precautions for disposal <and other handling>' | Matched :- 'False'| 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'Zynteglo is for intravenous use only (see section 6.6
 before Zynteglo infusion. ' | Qrd txt :- '6.5 Nature and contents of container <and special equipment for use administration or implantation>' | Matched :- 'False'SmPC.json | Doc txt :- 'After completion of the 4‑day course of
 transplantation should be followed after Zynteglo infusion.' | Qrd txt :- '6.5 Nature and contents of container <and special equipment for use administration or implantation>' | Matched :- 'False' procedures for patient management after HSC
2021-05-18 23:30:53,854 : Heading Extraction Zynteglo_clean_SmPC.json_B : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- '4.3 Contraindications' | Qrd txt :- '4.3 Contraindications' | Matched :- 'True'
2021-05-18 23:30:53,862 : Heading Extraction Zynteglo_clean

 children has not been studied.' | Qrd txt :- '6.5 Nature and contents of container <and special equipment for use administration or implantation>' | Matched :- 'False'C.json | Doc txt :- 'It is unknown whether Zynteglo is excreted in human
 breast-feeding.' | Qrd txt :- '6.6 Special precautions for disposal <and other handling>' | Matched :- 'False', 26, 29)|0.607| | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'Zynteglo must not be administered to women who are
2021-05-18 23:30:57,111 : Heading Extraction Zynteglo_clean_SmPC.json_B : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'Fertility' | Qrd txt :- 'Fertility' | Matched :- 'True'
2021-05-18 23:30:57,119 : Heading Extraction Zynteglo_clean_SmPC.json_B : Validation Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20025' | prevHeadingCurrId :- '20022' | prevHeadingFoundId :- '20022'
 animal studies. ' | Qrd txt :- '6.5 Nature and contents of container <and special equipm

2021-05-18 23:30:59,622 : Heading Extraction Zynteglo_clean_SmPC.json_B : Validation Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20033' | prevHeadingCurrId :- '20032' | prevHeadingFoundId :- '20032'
2021-05-18 23:30:59,806 : Heading Extraction Zynteglo_clean_SmPC.json_B : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'Mechanism of action' | Qrd txt :- 'Mechanism of action' | Matched :- 'True'
2021-05-18 23:30:59,814 : Heading Extraction Zynteglo_clean_SmPC.json_B : Validation Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20034' | prevHeadingCurrId :- '20033' | prevHeadingFoundId :- '20033'
2021-05-18 23:30:59,886 : Heading Extraction Zynteglo_clean_SmPC.json_B : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'Pharmacodynamic effects' | Qrd txt :- 'Pharmacodynamic effects' | Matched :- 'True'
2021-05-18 23:30:59,902 : Heading Extraction Zynteglo_clean_SmPC.json_B : Validation Passed 

2021-05-18 23:31:03,585 : Heading Extraction Zynteglo_clean_SmPC.json_B : Validation Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20049' | prevHeadingCurrId :- '20048' | prevHeadingFoundId :- '20048'
 medicinal product must not be mixed with other medicinal products.' | Qrd txt :- '6.5 Nature and contents of container <and special equipment for use administration or implantation>' | Matched :- 'False'of compatibility studies, this
 medicinal product must not be mixed with other medicinal products.' | Qrd txt :- '6.6 Special precautions for disposal <and other handling>' | Matched :- 'False'an_SmPC.json | Doc txt :- 'In the absence of compatibility studies, this
2021-05-18 23:31:03,777 : Heading Extraction Zynteglo_clean_SmPC.json_B : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- '6.3 Shelf life' | Qrd txt :- '6.3 Shelf life' | Matched :- 'True'
2021-05-18 23:31:03,785 : Heading Extraction Zynteglo_clean_SmPC.json_B : Validation Passe

2021-05-18 23:31:07,285 : Heading Extraction Zynteglo_clean_SmPC.json_B : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- '10. DATE OF REVISION OF THE TEXT' | Qrd txt :- '10. DATE OF REVISION OF THE TEXT' | Matched :- 'True'
2021-05-18 23:31:07,293 : Heading Extraction Zynteglo_clean_SmPC.json_B : Validation Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20058' | prevHeadingCurrId :- '20057' | prevHeadingFoundId :- '20057'
 of {name of MS Agency (link)}>.  ' | Qrd txt :- '6.5 Nature and contents of container <and special equipment for use administration or implantation>' | Matched :- 'False'son | Doc txt :- 'Detailed
2021-05-18 23:31:07,501 : Flow Logger HTML_n : Completed Heading Extraction For File | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:31:07,501 : Flow Logger HTML_n : Starting Document Annotation For File | H | CAP |  en | HTML | Zynteglo_clean.htm




Heading Not Found 
 ['qThis medicinal product is subject to additional monitoring. This will allow quick identification of new safety information. Healthcare professionals are asked to report any suspected adverse reactions. See section 4.8 for how to report adverse reactions.', 'Precautions to be taken before handling or administering the medicinal product', 'Clinical efficacy and safety', 'Absorption', 'Distribution', 'Biotransformation', 'Elimination', 'Linearity/non-linearity', 'Pharmacokinetic/pharmacodynamic relationship(s)', 'Environmental risk assessment (ERA)', 'Use in the paediatric population', 'DOSIMETRY', 'INSTRUCTIONS FOR PREPARATION OF RADIOPHARMACEUTICALS']


dict_keys(['qThis medicinal product is subject to additional monitoring. This will allow quick identification of new safety information. Healthcare professionals are asked to report any suspected adverse reactions. See section 4.8 for how to report adverse reactions.'])
Completed Heading Extraction For File
Start

2021-05-18 23:31:08,682 : Flow Logger HTML_n : Completed Document Annotation | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:31:08,682 : Flow Logger HTML_n : Starting Extracting Content Between Heading | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:31:08,690 : ExtractContentBetween_0_6 : Cleaning Match Results | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json
2021-05-18 23:31:08,690 : ExtractContentBetween_0_6 : Finished Cleaning Match Results | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json
2021-05-18 23:31:08,722 : Flow Logger HTML_n : Completed Extracting Content Between Heading | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:31:08,754 : XmlGeneration_0_7 : PMS/OMS Annotation Information Not Retrieved | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json
2021-05-18 23:31:08,762 : XmlGeneration_0_7 : Initiating XML Generation | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json
2021-05-18 23:31:08,890 : XmlGeneration_0_7 : Writing to File:Zynteglo_clean_SmPC.xml | H | CA

Error Found
Completed Document Annotation
Starting Extracting Content Between Heading For File :- Zynteglo_clean_SmPC.json
File being processed: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\partitionedJSONs\Zynteglo_clean_SmPC.json
--------------------------------------------
Completed Extracting Content Between Heading
Already Exists


2021-05-18 23:31:12,054 : XML Submission Logger_0_t : Initiating Submission To FHIR Server | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json
2021-05-18 23:31:12,054 : XML Submission Logger_0_t : Response{"resourceType":"Bundle","id":"052c4a1b-e471-450b-ba39-509dd709bc59","meta":{"versionId":"1","lastUpdated":"2021-05-18T18:01:11.069+00:00"},"type":"collection","entry":[{"fullUrl":"urn:uuid:6bac3caa-21bc-48ef-a2ba-3665312a6e20","resource":{"resourceType":"Bundle","id":"2ec04462-a7ba-46c1-ae7c-e1da904145da","identifier":{"system":"http://ema.europa.eu/fhir/identifier/documentid","value":"${instance.bundle[n].Identifier}"},"type":"document","timestamp":"2021-05-18T18:01:08+00:00","entry":[{"fullUr | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json
2021-05-18 23:31:12,062 : XML Submission Logger_0_t : POST sucessful: XML added with id: 052c4a1b-e471-450b-ba39-509dd709bc59 | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json
2021-05-18 23:31:12,062 : Flow Logger HTML_n : 



||||||||||||||||||||||||||||

POST sucessful: XML added with id 052c4a1b-e471-450b-ba39-509dd709bc59
Created XML File For :- Zynteglo_clean_SmPC.json
Starting Heading Extraction For File :- Zynteglo_clean_ANNEX II.json
File being processed: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\partitionedJSONs\Zynteglo_clean_ANNEX II.json
--------------------------------------------
AnnexII


2021-05-18 23:31:12,275 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Match Passed | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- 'D. CONDITIONS OR RESTRICTIONS WITH REGARD TO THE SAFE AND EFFECTIVE USE OF THE MEDICINAL PRODUCT' | Qrd txt :- 'D. CONDITIONS OR RESTRICTIONS WITH REGARD TO THE SAFE AND EFFECTIVE USE OF THE MEDICINAL PRODUCT' | Matched :- 'True'
2021-05-18 23:31:12,283 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Validation Flow Is Broken | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | currHeadId :- '21009' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '21007'
2021-05-18 23:31:12,287 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Validation Passed | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | currHeadId :- '21009' | prevHeadingCurrId :- '21007' | prevHeadingFoundId :- '21007'
 AUTHORISATION' | Qrd txt :- 'A. <MANUFACTURER(S) OF THE BIOLOGICAL ACTIVE SUBSTANCE(S) AND> MANUFACTURER(S) RESPONSIBLE FOR BATCH RELEASE' | M


OriginalCheck


OriginalCheck


OriginalCheck

oooooooooooooooooooooooooooooooooooooooo END OF Sub Section oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo


 RELEASE' | Qrd txt :- 'A. <MANUFACTURER(S) OF THE BIOLOGICAL ACTIVE SUBSTANCE(S) AND> MANUFACTURER(S) RESPONSIBLE FOR BATCH RELEASE' | Matched :- 'True'nteglo_clean_ANNEX II.json | Doc txt :- 'A.      MANUFACTURER(S) OF THE
2021-05-18 23:31:12,574 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Validation Passed As This The First Heading | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | currHeadId :- '21002' | prevHeadingCurrId :- '' | prevHeadingFoundId :- ''
 biological active substance(s)' | Qrd txt :- 'A. <MANUFACTURER(S) OF THE BIOLOGICAL ACTIVE SUBSTANCE(S) AND> MANUFACTURER(S) RESPONSIBLE FOR BATCH RELEASE' | Matched :- 'False'ean_ANNEX II.json | Doc txt :- 'Name and address of the manufacturer(s) of the
2021-05-18 23:31:12,598 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Match Passed | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- 'Name and address of the manufacturer(s) of the biological active substance(s)' | Qrd txt :- 'Name and address of 


OriginalCheck


OriginalCheck



2021-05-18 23:31:12,767 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Match Passed | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- 'Name and address of the manufacturer(s) responsible for batch release' | Qrd txt :- 'Name and address of the manufacturer(s) responsible for batch release' | Matched :- 'True'
2021-05-18 23:31:12,783 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Validation Passed | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | currHeadId :- '21004' | prevHeadingCurrId :- '21003' | prevHeadingFoundId :- '21003'
2021-05-18 23:31:12,927 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Match Passed | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- 'B. CONDITIONS OR RESTRICTIONS REGARDING SUPPLY AND USE' | Qrd txt :- 'B. CONDITIONS OR RESTRICTIONS REGARDING SUPPLY AND USE' | Matched :- 'True'
2021-05-18 23:31:12,935 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Validation Passed | H | CAP |  en | 1 | Zynteglo_clean


OriginalCheck


OriginalCheck



 product within 6 months following authorisation. ' | Qrd txt :- 'D. CONDITIONS OR RESTRICTIONS WITH REGARD TO THE SAFE AND EFFECTIVE USE OF THE MEDICINAL PRODUCT' | Matched :- 'False' | Doc txt :- 'The
 WITH REGARD TO THE SAFE AND EFFECTIVE USE OF THE MEDICINAL PRODUCT  ' | Qrd txt :- 'A. <MANUFACTURER(S) OF THE BIOLOGICAL ACTIVE SUBSTANCE(S) AND> MANUFACTURER(S) RESPONSIBLE FOR BATCH RELEASE' | Matched :- 'False'RESTRICTIONS
2021-05-18 23:31:13,318 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Match Passed | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- 'D. CONDITIONS OR RESTRICTIONS WITH REGARD TO THE SAFE AND EFFECTIVE USE OF THE MEDICINAL PRODUCT' | Qrd txt :- 'D. CONDITIONS OR RESTRICTIONS WITH REGARD TO THE SAFE AND EFFECTIVE USE OF THE MEDICINAL PRODUCT' | Matched :- 'True'
2021-05-18 23:31:13,326 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Validation Passed | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | currHeadId :- '21009' | prevHeadin


OriginalCheck



 risk minimisation measures ' | Qrd txt :- 'Additional risk minimisation measures' | Matched :- 'True'(99, 100, 100)|0.995| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- '·Additional
2021-05-18 23:31:13,566 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Validation Passed | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | currHeadId :- '21011' | prevHeadingCurrId :- '21010' | prevHeadingFoundId :- '21010'
 contain:' | Qrd txt :- 'E. SPECIFIC OBLIGATION TO COMPLETE POST-AUTHORISATION MEASURES FORIn Lowercase : Contains<>|302.0|(6, 12, 86)|0.331| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- 'The physician
<THE CONDITIONAL MARKETING AUTHORISATION> <THE MARKETING AUTHORISATION UNDER EXCEPTIONAL CIRCUMSTANCES>' | Matched :- 'False'
 Summary of Product Characteristics ' | Qrd txt :- 'D. CONDITIONS OR RESTRICTIONS WITH REGARD TO THE SAFE AND EFFECTIVE USE OF THE MEDICINAL PRODUCT' | Matched :- 'False'n_ANNEX II.json | Doc txt :- 'oThe



OriginalCheck


OriginalCheck



2021-05-18 23:31:14,159 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Match Failed In Lowercase : Contains<>|202.74|(8, 8, 86)|0.367| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- '·The Guide for healthcare professionals shall contain the following key elements:' | Qrd txt :- 'E. SPECIFIC OBLIGATION TO COMPLETE POST-AUTHORISATION MEASURES FOR
<THE CONDITIONAL MARKETING AUTHORISATION> <THE MARKETING AUTHORISATION UNDER EXCEPTIONAL CIRCUMSTANCES>' | Matched :- 'False'



OriginalCheck



 of patient’s guide' | Qrd txt :- 'A. <MANUFACTURER(S) OF THE BIOLOGICAL ACTIVE SUBSTANCE(S) AND> MANUFACTURER(S) RESPONSIBLE FOR BATCH RELEASE' | Matched :- 'False'Zynteglo_clean_ANNEX II.json | Doc txt :- '–content
 of patient’s guide' | Qrd txt :- 'Name and address of the manufacturer(s) of the biological active substance(s)' | Matched :- 'False'7| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- '–content
 of patient’s guide' | Qrd txt :- 'Name and address of the manufacturer(s) responsible for batch release' | Matched :- 'False'86)|0.527| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- '–content
 professional' | Qrd txt :- 'E. SPECIFIC OBLIGATION TO COMPLETE POST-AUTHORISATION MEASURES FORowercase : Contains<>|214.49|(8, 10, 86)|0.396| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- '–the
<THE CONDITIONAL MARKETING AUTHORISATION> <THE MARKETING AUTHORISATION UNDER EXCEPTIONAL CIRCUMSTANCES>' | Matched :- 'False'



OriginalCheck


OriginalCheck


OriginalCheck


OriginalCheck


OriginalCheck



 in the drug product Registry.' | Qrd txt :- 'D. CONDITIONS OR RESTRICTIONS WITH REGARD TO THE SAFE AND EFFECTIVE USE OF THE MEDICINAL PRODUCT' | Matched :- 'False'_clean_ANNEX II.json | Doc txt :- '–enrolment
 in the drug product Registry.' | Qrd txt :- 'E. SPECIFIC OBLIGATION TO COMPLETE POST-AUTHORISATION MEASURES FORns<>|377.5|(6, 10, 86)|0.359| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- '–enrolment
<THE CONDITIONAL MARKETING AUTHORISATION> <THE MARKETING AUTHORISATION UNDER EXCEPTIONAL CIRCUMSTANCES>' | Matched :- 'False'



OriginalCheck



2021-05-18 23:31:15,360 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Match Failed In Lowercase : Contains<>|141.35|(7, 9, 86)|0.377| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- '·The Guide to handling and method of administration for healthcare professionals shall contain the following key elements:' | Qrd txt :- 'E. SPECIFIC OBLIGATION TO COMPLETE POST-AUTHORISATION MEASURES FOR
<THE CONDITIONAL MARKETING AUTHORISATION> <THE MARKETING AUTHORISATION UNDER EXCEPTIONAL CIRCUMSTANCES>' | Matched :- 'False'



OriginalCheck


OriginalCheck



 about the thawing of Zynteglo' | Qrd txt :- 'E. SPECIFIC OBLIGATION TO COMPLETE POST-AUTHORISATION MEASURES FORns<>|351.16|(6, 12, 86)|0.337| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- 'oInstructions
<THE CONDITIONAL MARKETING AUTHORISATION> <THE MARKETING AUTHORISATION UNDER EXCEPTIONAL CIRCUMSTANCES>' | Matched :- 'False'
 should contain:' | Qrd txt :- 'E. SPECIFIC OBLIGATION TO COMPLETE POST-AUTHORISATION MEASURES FORrcase : Contains<>|343.18|(6, 11, 86)|0.336| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- 'The
<THE CONDITIONAL MARKETING AUTHORISATION> <THE MARKETING AUTHORISATION UNDER EXCEPTIONAL CIRCUMSTANCES>' | Matched :- 'False'



OriginalCheck



2021-05-18 23:31:15,921 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Match Failed In Lowercase : Contains<>|240.32|(7, 11, 86)|0.352| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- '·The patient/carer guide shall contain the following key messages:' | Qrd txt :- 'E. SPECIFIC OBLIGATION TO COMPLETE POST-AUTHORISATION MEASURES FOR
<THE CONDITIONAL MARKETING AUTHORISATION> <THE MARKETING AUTHORISATION UNDER EXCEPTIONAL CIRCUMSTANCES>' | Matched :- 'False'



OriginalCheck



 in the drug product Registry.' | Qrd txt :- 'D. CONDITIONS OR RESTRICTIONS WITH REGARD TO THE SAFE AND EFFECTIVE USE OF THE MEDICINAL PRODUCT' | Matched :- 'False'clean_ANNEX II.json | Doc txt :- 'oEnrolment
 in the drug product Registry.' | Qrd txt :- 'E. SPECIFIC OBLIGATION TO COMPLETE POST-AUTHORISATION MEASURES FORns<>|375.0|(7, 12, 86)|0.375| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- 'oEnrolment
<THE CONDITIONAL MARKETING AUTHORISATION> <THE MARKETING AUTHORISATION UNDER EXCEPTIONAL CIRCUMSTANCES>' | Matched :- 'False'
 shall contain the following key messages:' | Qrd txt :- 'E. SPECIFIC OBLIGATION TO COMPLETE POST-AUTHORISATION MEASURES FOR(8, 11, 86)|0.365| | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- '·The
<THE CONDITIONAL MARKETING AUTHORISATION> <THE MARKETING AUTHORISATION UNDER EXCEPTIONAL CIRCUMSTANCES>' | Matched :- 'False'



OriginalCheck


OriginalCheck


OriginalCheck



 to conduct post-authorisation measures ' | Qrd txt :- 'Obligation to conduct post-authorisation measures' | Matched :- 'True' | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- '·Obligation
2021-05-18 23:31:17,203 : Heading Extraction Zynteglo_clean_ANNEX II.json_z : Validation Passed | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json | currHeadId :- '21012' | prevHeadingCurrId :- '21011' | prevHeadingFoundId :- '21011'
 MAH shall complete, within the stated timeframe, the below measures:' | Qrd txt :- 'E. SPECIFIC OBLIGATION TO COMPLETE POST-AUTHORISATION MEASURES FORP |  en | 1 | Zynteglo_clean_ANNEX II.json | Doc txt :- 'The
<THE CONDITIONAL MARKETING AUTHORISATION> <THE MARKETING AUTHORISATION UNDER EXCEPTIONAL CIRCUMSTANCES>' | Matched :- 'False'
 AUTHORISATION' | Qrd txt :- 'A. <MANUFACTURER(S) OF THE BIOLOGICAL ACTIVE SUBSTANCE(S) AND> MANUFACTURER(S) RESPONSIBLE FOR BATCH RELEASE' | Matched :- 'False'_clean_ANNEX II.json | Doc txt :- 'E.    SPECIFIC OBLIGATION TO
 AUT


OriginalCheck


OriginalCheck



2021-05-18 23:31:17,535 : Flow Logger HTML_n : Completed Heading Extraction For File | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:31:17,535 : Flow Logger HTML_n : Starting Document Annotation For File | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:31:17,535 : Flow Logger HTML_n : Completed Document Annotation | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:31:17,535 : Flow Logger HTML_n : Starting Extracting Content Between Heading | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:31:17,543 : ExtractContentBetween_1_5 : Cleaning Match Results | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json
2021-05-18 23:31:17,543 : ExtractContentBetween_1_5 : Finished Cleaning Match Results | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json
2021-05-18 23:31:17,559 : Flow Logger HTML_n : Completed Extracting Content Between Heading | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:31:17,567 : XmlGeneration_1_r : PMS/OMS Annotation Information Not Retr



Heading Not Found 
 ['Official batch release']


dict_keys([])
Completed Heading Extraction For File
Starting Document Annotation For File :- Zynteglo_clean_ANNEX II.json
Error Found
Completed Document Annotation
Starting Extracting Content Between Heading For File :- Zynteglo_clean_ANNEX II.json
File being processed: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\partitionedJSONs\Zynteglo_clean_ANNEX II.json
--------------------------------------------
Completed Extracting Content Between Heading
Already Exists


2021-05-18 23:31:19,610 : XML Submission Logger_1_U : Initiating Submission To FHIR Server | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json
2021-05-18 23:31:19,610 : XML Submission Logger_1_U : Response{"resourceType":"Bundle","id":"1dc79a78-e183-4630-80e3-89bea9cc40cf","meta":{"versionId":"1","lastUpdated":"2021-05-18T18:01:18.931+00:00"},"type":"collection","entry":[{"fullUrl":"urn:uuid:860e5d3b-6d65-4fae-8df9-82a14939c028","resource":{"resourceType":"Bundle","id":"a6722de3-0e02-4b65-88bb-dcb3ebd076b1","identifier":{"system":"http://ema.europa.eu/fhir/identifier/documentid","value":"${instance.bundle[n].Identifier}"},"type":"document","timestamp":"2021-05-18T18:01:17+00:00","entry":[{"fullUr | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json
2021-05-18 23:31:19,626 : XML Submission Logger_1_U : POST sucessful: XML added with id: 1dc79a78-e183-4630-80e3-89bea9cc40cf | H | CAP |  en | 1 | Zynteglo_clean_ANNEX II.json
2021-05-18 23:31:19,626 : Flow Logger HTML_n : 



||||||||||||||||

POST sucessful: XML added with id 1dc79a78-e183-4630-80e3-89bea9cc40cf
Created XML File For :- Zynteglo_clean_ANNEX II.json
Starting Heading Extraction For File :- Zynteglo_clean_ANNEX III.json
File being processed: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\partitionedJSONs\Zynteglo_clean_ANNEX III.json
--------------------------------------------
Labelling


 LABELLING' | Qrd txt :- 'LABELLING ' | Matched :- 'True'an_ANNEX III.json_K : Match Passed : <=1|25.0|(86, 100, 95)|0.921| | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | Doc txt :- 'A.
2021-05-18 23:31:19,866 : Heading Extraction Zynteglo_clean_ANNEX III.json_K : Validation Passed As This The First Heading | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | currHeadId :- '22001' | prevHeadingCurrId :- '' | prevHeadingFoundId :- ''
 TO APPEAR ON THE OUTER PACKAGING – METAL CASSETTE' | Qrd txt :- 'PARTICULARS TO APPEAR ON <THE OUTER PACKAGING> <AND> <THE IMMEDIATE PACKAGING>' | Matched :- 'False'n_ANNEX III.json | Doc txt :- 'PARTICULARS
2021-05-18 23:31:19,978 : Heading Extraction Zynteglo_clean_ANNEX III.json_K : Match Passed | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | Doc txt :- '1. NAME OF THE MEDICINAL PRODUCT' | Qrd txt :- '1. NAME OF THE MEDICINAL PRODUCT' | Matched :- 'True'
2021-05-18 23:31:19,986 : Heading Extraction Zynteglo_clean_ANNEX III.json_K : Validat

2021-05-18 23:31:22,274 : Heading Extraction Zynteglo_clean_ANNEX III.json_K : Validation Passed | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | currHeadId :- '22017' | prevHeadingCurrId :- '22016' | prevHeadingFoundId :- '22016'
2021-05-18 23:31:22,328 : Heading Extraction Zynteglo_clean_ANNEX III.json_K : Match Passed | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | Doc txt :- '16. INFORMATION IN BRAILLE' | Qrd txt :- '16. INFORMATION IN BRAILLE' | Matched :- 'True'
2021-05-18 23:31:22,336 : Heading Extraction Zynteglo_clean_ANNEX III.json_K : Validation Passed | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | currHeadId :- '22018' | prevHeadingCurrId :- '22017' | prevHeadingFoundId :- '22017'
2021-05-18 23:31:22,480 : Heading Extraction Zynteglo_clean_ANNEX III.json_K : Match Passed | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | Doc txt :- '17. UNIQUE IDENTIFIER – 2D BARCODE' | Qrd txt :- '17. UNIQUE IDENTIFIER – 2D BARCODE' | Matched :- 'True'
2021-05-18 23:31


OriginalCheck



2021-05-18 23:31:23,434 : Heading Extraction Zynteglo_clean_ANNEX III.json_K : Match Passed | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | Doc txt :- '3. EXPIRY DATE' | Qrd txt :- '3. EXPIRY DATE' | Matched :- 'True'
2021-05-18 23:31:23,442 : Heading Extraction Zynteglo_clean_ANNEX III.json_K : Validation Failed As Wrong Heading Found | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | currHeadId :- '22024' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '22029'
2021-05-18 23:31:23,458 : Heading Extraction Zynteglo_clean_ANNEX III.json_K : Match Passed | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | Doc txt :- '3. EXPIRY DATE' | Qrd txt :- '3. EXPIRY DATE' | Matched :- 'True'
2021-05-18 23:31:23,466 : Heading Extraction Zynteglo_clean_ANNEX III.json_K : Validation Passed | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | currHeadId :- '22030' | prevHeadingCurrId :- '22029' | prevHeadingFoundId :- '22029'
 DONATION AND PRODUCT CODES' | Qrd txt :- '13. BATCH NUMBER<, 

oooooooooooooooooooooooooooooooooooooooo END OF Sub Section oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo


2021-05-18 23:31:24,616 : Heading Extraction Zynteglo_clean_ANNEX III.json_K : Validation Passed | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | currHeadId :- '22004' | prevHeadingCurrId :- '22003' | prevHeadingFoundId :- '22003'
 CODES' | Qrd txt :- '13. BATCH NUMBER<, DONATION AND PRODUCT CODES>' | Matched :- 'False'd : Contains<>|72.41|(70, 84, 84)|0.848| | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | Doc txt :- '3.       DONATION AND PRODUCT
 CODES' | Qrd txt :- '4. BATCH NUMBER<, DONATION AND PRODUCT CODES>' | Matched :- 'False'ed : Contains<>|72.41|(69, 84, 84)|0.834| | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | Doc txt :- '3.       DONATION AND PRODUCT
 CODES' | Qrd txt :- '4. BATCH NUMBER<, DONATION AND PRODUCT CODES>' | Matched :- 'False'ed : Contains<>|72.41|(69, 84, 84)|0.834| | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json | Doc txt :- '3.       DONATION AND PRODUCT
 NUMBER, CONTENTS BY WEIGHT, BY VOLUME OR BY UNIT, AND EXPIRY DATE' | Qrd txt :- '4



Heading Not Found 
 ['PARTICULARS TO APPEAR ON <THE OUTER PACKAGING> <AND> <THE IMMEDIATE PACKAGING>', 'MINIMUM PARTICULARS TO APPEAR ON BLISTERS OR STRIPS', 'NAME OF THE MARKETING AUTHORISATION HOLDER']


dict_keys([])
Completed Heading Extraction For File
Starting Document Annotation For File :- Zynteglo_clean_ANNEX III.json
Error Found
Completed Document Annotation
Starting Extracting Content Between Heading For File :- Zynteglo_clean_ANNEX III.json
File being processed: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\partitionedJSONs\Zynteglo_clean_ANNEX III.json
--------------------------------------------
Completed Extracting Content Between Heading
Already Exists


2021-05-18 23:31:31,744 : XML Submission Logger_2_W : Initiating Submission To FHIR Server | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json
2021-05-18 23:31:31,744 : XML Submission Logger_2_W : Response{"resourceType":"OperationOutcome","id":"99285350-ebb2-4d43-a5c7-5496e35edf52","issue":[{"severity":"error","code":"exception","diagnostics":"There was an error processing your request."}]} | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json
2021-05-18 23:31:31,753 : XML Submission Logger_2_W : HTTP error occurred: 500 Server Error: Internal Server Error for url: https://ema-dap-epi-dev-fhir-api.azurewebsites.net/Bundle | H | CAP |  en | 2 | Zynteglo_clean_ANNEX III.json
Traceback (most recent call last):
  File "F:\Projects\EMA\Repository\EMA EPI PoC\function_code\code\fhirService\fhirService.py", line 60, in submitFhirXml
    response.raise_for_status()
  File "C:\Users\vipsharm\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\models.py", line 940, in raise_for_status
  

HTTP error occurred: 500 Server Error: Internal Server Error for url: https://ema-dap-epi-dev-fhir-api.azurewebsites.net/Bundle
Error log: There was an error processing your request.
Created XML File For :- Zynteglo_clean_ANNEX III.json
Starting Heading Extraction For File :- Zynteglo_clean_ PACKAGE LEAFLET.json
File being processed: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\partitionedJSONs\Zynteglo_clean_ PACKAGE LEAFLET.json
--------------------------------------------
Package leaflet


 leaflet: Information for the patient or carer' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False' | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Package
 1.2‑20 × 106 cells/mL dispersion for infusion' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'.548| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Zynteglo
 1.2‑20 × 106 cells/mL dispersion for infusion' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False' | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Zynteglo
 autotemcel ' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False' SpecialCase3|170.83|(16, 30, 32)|0.478| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'betibeglogene
 autotemcel ' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'Fals

----------------------------------
RemovedByStyle
----------------------------------


 effects you may get. See the end of section 4 for how to report side effects.' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'n_ PACKAGE LEAFLET.json | Doc txt :- 'This
 for you.' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'd : SpecialCase3|80.83|(13, 30, 32)|0.543| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Read all of this leaflet carefully
 for you.' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'alCase1|74.17|(28, 36, 86)|0.479| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Read all of this leaflet carefully
 leaflet. You may need to read it again. ' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'5)|0.571| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '-Keep this
 leaflet. You may need to read it again. ' | Qrd txt :- 'This l

----------------------------------
RemovedByStyle
----------------------------------
----------------------------------
RemovedByStyle
----------------------------------


 you need to know before you are given Zynteglo' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'568| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '2.     What
 you need to know before you are given Zynteglo' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False' CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '2.     What
 Zynteglo is given' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'alCase3|179.17|(11, 21, 22)|0.508| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '3.     How
 Zynteglo is given' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'08.33|(14, 33, 33)|0.515| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '3.     How
 side effects ' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False

----------------------------------
RemovedByStyle
----------------------------------
----------------------------------
RemovedByStyle
----------------------------------


 of the pack and other information' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'9, 32, 40)|0.596| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '6.     Contents
2021-05-18 23:31:34,952 : Heading Extraction Zynteglo_clean_ PACKAGE LEAFLET.json_n : Match Passed | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '6. Contents of the pack and other information' | Qrd txt :- '6. Contents of the pack and other information' | Matched :- 'True'
2021-05-18 23:31:34,960 : Heading Extraction Zynteglo_clean_ PACKAGE LEAFLET.json_n : Validation Flow Is Broken | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23023' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23003'
2021-05-18 23:31:34,968 : Heading Extraction Zynteglo_clean_ PACKAGE LEAFLET.json_n : Validation Passed | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23023' | prevHeadingCurrId :- '23003' 

----------------------------------
RemovedByStyle
----------------------------------


 you:' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'pecialCase1|126.32|(23, 26, 31)|0.514| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'You must not be given Zynteglo if
 any of the ingredients of this medicine (listed in section 6)' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'|  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '-are allergic to
 any of the ingredients of this medicine (listed in section 6)' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False' | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '-are allergic to
 breast-feeding' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'ecialCase3|100.0|(54, 65, 86)|0.656| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '-are pregnant or
 breast-feeding' | Qrd txt :- 'This leaflet was last revised i

2021-05-18 23:31:39,421 : Heading Extraction Zynteglo_clean_ PACKAGE LEAFLET.json_n : Validation Passed | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23009' | prevHeadingCurrId :- '23007' | prevHeadingFoundId :- '23007'
 have recently taken or might take any other medicines.' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False' | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Tell your doctor if you are taking,
 have recently taken or might take any other medicines.' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Tell your doctor if you are taking,
 infusion (see also section 3, How Zynteglo is given).' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False' | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'You should not take hydroxyurea (a
 infus

2021-05-18 23:31:43,956 : Heading Extraction Zynteglo_clean_ PACKAGE LEAFLET.json_n : Validation Flow Is Broken | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23019' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23012'
2021-05-18 23:31:43,964 : Heading Extraction Zynteglo_clean_ PACKAGE LEAFLET.json_n : Validation Passed | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23019' | prevHeadingCurrId :- '23012' | prevHeadingFoundId :- '23012'
 gets them. ' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False' SpecialCase3|82.42|(26, 34, 32)|0.566| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Like
 gets them. ' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'ase1|82.42|(19, 19, 86)|0.53| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Like
 used to prepare your bone marrow for treatment with Zynteglo.

 discomfort' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False': SpecialCase3|164.0|(29, 40, 45)|0.522| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·chest pain or
 discomfort' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'Case1|200.0|(24, 28, 30)|0.564| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·chest pain or
2021-05-18 23:31:47,518 : Heading Extraction Zynteglo_clean_ PACKAGE LEAFLET.json_n : Match Failed : SpecialCase3|409.09|(13, 36, 43)|0.449| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·other pain' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'
2021-05-18 23:31:47,558 : Heading Extraction Zynteglo_clean_ PACKAGE LEAFLET.json_n : Match Failed : SpecialCase1|490.91|(11, 36, 43)|0.496| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·other pain' | Q

 more than 1 in 10 people)' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'4, 27, 47)|0.493| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Very
 signs of a serious liver condition called veno-occlusive disease.' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'n | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·Pain in the
 signs of a serious liver condition called veno-occlusive disease.' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'nteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·Pain in the
 vaginal bleeding.' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'alCase3|77.57|(23, 36, 37)|0.56| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·Prolonged
 vaginal bleeding.' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :-

2021-05-18 23:31:53,481 : Heading Extraction Zynteglo_clean_ PACKAGE LEAFLET.json_n : Match Failed : SpecialCase3|450.0|(10, 43, 43)|0.566| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'itchy skin' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'
 of the digestive tract lining which runs from the mouth to the anus' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'| 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·soreness and swelling
 of the digestive tract lining which runs from the mouth to the anus' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'teglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·soreness and swelling
 affect up to 1 in 10 people)' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'68|(23, 30, 30)|0.504| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Com

 for Aspergillus (lung disease caused by fungus)' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'5| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·positive test
 for Aspergillus (lung disease caused by fungus)' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False' CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·positive test
 abnormalities in heart rhythm' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'3|(28, 38, 33)|0.646| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·changes and
 abnormalities in heart rhythm' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False', 32)|0.552| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·changes and
 in back, bone, skin, limbs, anus, or muscles' | Qrd txt :- 'Pregnancy <and> <,> breast-feedi

 (haemorrhoids)' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'1|238.1|(25, 38, 33)|0.453| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·piles
 pressure' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'd : SpecialCase3|231.58|(14, 32, 33)|0.403| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·low blood
 pressure' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'alCase1|278.95|(15, 32, 30)|0.437| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·low blood
 temperature' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False' SpecialCase3|204.76|(20, 29, 27)|0.493| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·low body
 temperature' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'Fal

 disorder' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'd : SpecialCase3|190.48|(28, 48, 45)|0.569| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·sweat gland
 disorder' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'alCase1|228.57|(28, 48, 40)|0.501| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·sweat gland
 reaction' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'd : SpecialCase3|195.24|(28, 33, 32)|0.506| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·transfusion
 reaction' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'alCase1|242.86|(25, 43, 36)|0.523| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·transfusion
 decreased' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- '

 breath' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'led : SpecialCase3|210.0|(26, 40, 45)|0.474| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·shortness of
 breath' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'cialCase1|255.0|(23, 30, 29)|0.503| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·shortness of
 due to a heart problem' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'e3|105.26|(30, 36, 38)|0.586| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·chest pain not
 due to a heart problem' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'|(23, 24, 27)|0.575| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '·chest pain not
 (redness and warmth of skin)' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and f

 cells (blood stem cells) per millilitre.' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'33)|0.513| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '-The active
 cells (blood stem cells) per millilitre.' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '-The active
 2, Sodium content.' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'lCase3|81.51|(19, 28, 86)|0.526| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '-The other ingredients
 2, Sodium content.' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'.83|(22, 28, 32)|0.518| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '-The other ingredients
 Zynteglo looks like and contents of the pack' | Qrd txt :- 'Pregnancy <and> <

2021-05-18 23:32:11,280 : Heading Extraction Zynteglo_clean_ PACKAGE LEAFLET.json_n : Match Failed : SpecialCase3|358.33|(13, 25, 36)|0.532| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Haidgraben 5' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'
2021-05-18 23:32:11,320 : Heading Extraction Zynteglo_clean_ PACKAGE LEAFLET.json_n : Match Failed : SpecialCase1|450.0|(14, 33, 30)|0.476| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Haidgraben 5' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'
2021-05-18 23:32:11,352 : Heading Extraction Zynteglo_clean_ PACKAGE LEAFLET.json_n : Match Failed : SpecialCase3|300.0|(15, 27, 24)|0.34| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- '85521 Ottobrunn' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'
2021-05-18 23:32:11,400 : Heading Extraction Z

----------------------------------
RemovedByStyle
----------------------------------


 evidence to come about this medicine. ' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False', 33)|0.534| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'This
 evidence to come about this medicine. ' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False'1| | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'This
 every year and this leaflet will be updated as necessary.' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'The
 every year and this leaflet will be updated as necessary.' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'False' | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'The
 of MS Agency (link)}>.' | Qrd txt :- 'Pregnancy <and> <,> breast-feeding <and fertility>' | Matched :- 'False'e3|84.21|(17, 28, 86

2021-05-18 23:32:19,518 : Flow Logger HTML_n : Completed Heading Extraction For File | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:32:19,518 : Flow Logger HTML_n : Starting Document Annotation For File | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:32:19,526 : Flow Logger HTML_n : Completed Document Annotation | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:32:19,526 : Flow Logger HTML_n : Starting Extracting Content Between Heading | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:32:19,526 : ExtractContentBetween_3_l : Cleaning Match Results | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json
2021-05-18 23:32:19,534 : ExtractContentBetween_3_l : Finished Cleaning Match Results | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json
2021-05-18 23:32:19,550 : Flow Logger HTML_n : Completed Extracting Content Between Heading | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-18 23:32:19,566 : XmlGeneration_3_e : PMS/OMS Annotation Info



Heading Not Found 
 ['q This medicine is subject to additional monitoring. This will allow quick identification of new safety information. You can help by reporting any side effects you may get. See the end of section 4 for how to report side effects.', 'Do not <take> <use> X', 'Children <and adolescents>', 'X with <food> <and> <,> <drink> <and> <alcohol>', 'X contains {name the excipient(s)}', 'How to <take> <use> X ', 'Use in children <and adolescents>', 'If you <take> <use> more X than you should', 'If you forget to <take> <use> X>', 'If you stop <taking> <using> X>', 'Additional side effects in children <and adolescents>', 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.', 'Other sources of information']


dict_keys(['q This medicine is subject to additional monitoring. This will allow quick identification of new safety information. You can help by reporting any side effects you may get. See the end of section 4 for how to report side effects.', '1. What Zynteglo is a

2021-05-18 23:32:22,310 : XML Submission Logger_3_t : Initiating Submission To FHIR Server | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json
2021-05-18 23:32:22,314 : XML Submission Logger_3_t : Response{"resourceType":"Bundle","id":"44e0ecad-f1a2-4339-a584-55ad47f2c601","meta":{"versionId":"1","lastUpdated":"2021-05-18T18:02:21.681+00:00"},"type":"collection","entry":[{"fullUrl":"urn:uuid:1dac9f6e-0f09-4246-a421-a42c2247f7d6","resource":{"resourceType":"Bundle","id":"dd13ccb5-13c9-41a9-9156-2eb20d813ae8","identifier":{"system":"http://ema.europa.eu/fhir/identifier/documentid","value":"${instance.bundle[n].Identifier}"},"type":"document","timestamp":"2021-05-18T18:02:19+00:00","entry":[{"fullUr | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json
2021-05-18 23:32:22,326 : XML Submission Logger_3_t : POST sucessful: XML added with id: 44e0ecad-f1a2-4339-a584-55ad47f2c601 | H | CAP |  en | 3 | Zynteglo_clean_ PACKAGE LEAFLET.json
2021-05-18 23:32:22,326 : Flow Logger HTML_

POST sucessful: XML added with id 44e0ecad-f1a2-4339-a584-55ad47f2c601
Created XML File For :- Zynteglo_clean_ PACKAGE LEAFLET.json


TypeError: cannot unpack non-iterable NoneType object

In [9]:
a

Unnamed: 0,Bold,Classes,Element,HasBorder,ID,Indexed,IsHeadingType,IsListItem,IsPossibleHeading,Italics,ParentId,Styles,Text,Underlined,Uppercased,StringLength
0,False,['WordSection1'],"<div class=""WordSection1""> <p class=""MsoNormal"" style=""margin-bottom:0in;line-height:normal""></p> <p class=""MsoNormal"" style=""margin-bottom:0in;line-height:normal""></p> <p class=""MsoNormal"" style=...",False,8fc90752-ec33-4215-99a3-8711ebe75633,False,,False,False,False,55adea44-12a5-4e30-801d-307fadde0f35,,...,False,False,0
1,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-bottom:0in;line-height:normal""></p>",False,a3e0a75e-ace0-4f81-88d8-03ae112aecc9,False,,False,False,False,8fc90752-ec33-4215-99a3-8711ebe75633,margin-bottom:0in;line-height:normal,,False,False,0
2,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-bottom:0in;line-height:normal""></p>",False,d9317af2-2971-4d70-a655-a7a31caf7c20,False,,False,False,False,8fc90752-ec33-4215-99a3-8711ebe75633,margin-bottom:0in;line-height:normal,,False,False,0
3,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-bottom:0in;line-height:normal""></p>",False,f43032db-d077-4fec-a526-0483819905e4,False,,False,False,False,8fc90752-ec33-4215-99a3-8711ebe75633,margin-bottom:0in;line-height:normal,,False,False,0
4,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-bottom:0in;line-height:normal""></p>",False,39eb0909-d6d3-4461-8ef2-6efc5007a7d9,False,,False,False,False,8fc90752-ec33-4215-99a3-8711ebe75633,margin-bottom:0in;line-height:normal,,False,False,0
5,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-bottom:0in;line-height:normal""></p>",False,01a2b620-5f4a-4590-a08c-7566a4e5fd62,False,,False,False,False,8fc90752-ec33-4215-99a3-8711ebe75633,margin-bottom:0in;line-height:normal,,False,False,0
6,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-bottom:0in;line-height:normal""></p>",False,1d22d9d3-79f7-4b34-86d0-6a439d7e9f30,False,,False,False,False,8fc90752-ec33-4215-99a3-8711ebe75633,margin-bottom:0in;line-height:normal,,False,False,0
7,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-bottom:0in;line-height:normal""></p>",False,8a80fed1-b3f1-4ed9-bf09-9df7f7e75280,False,,False,False,False,8fc90752-ec33-4215-99a3-8711ebe75633,margin-bottom:0in;line-height:normal,,False,False,0
8,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-bottom:0in;line-height:normal""></p>",False,3450fbac-4745-43b1-baa3-59c7bb10378e,False,,False,False,False,8fc90752-ec33-4215-99a3-8711ebe75633,margin-bottom:0in;line-height:normal,,False,False,0
9,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-bottom:0in;line-height:normal""></p>",False,5ca680b8-2583-4976-aa18-46996e2fce99,False,,False,False,False,8fc90752-ec33-4215-99a3-8711ebe75633,margin-bottom:0in;line-height:normal,,False,False,0


In [14]:
convertCollectionToDataFrame(b)

Unnamed: 0,index,id,Procedure type,Display code,Name,parent_id,htmlText,htmlIndex,htmlId,SubSectionIndex,doc_parent_id
0,680,20001,CAP,,SUMMARY OF PRODUCT CHARACTERISTICS,,SUMMARY\r OF PRODUCT CHARACTERISTICS,26,7b37353a-5e3e-400a-9acf-6b44224c82e4,0,
1,682,20003,CAP,1.0,NAME OF THE MEDICINAL PRODUCT,20001.0,1. NAME OF THE\r MEDICINAL PRODUCT,33,62ecb297-5186-4c91-8144-6540859f879b,0,20001.0
2,683,20004,CAP,2.0,QUALITATIVE AND QUANTITATIVE COMPOSITION,20001.0,2. QUALITATIVE AND\r QUANTITATIVE COMPOSITION,38,5003c211-0a0d-4c38-8258-71d672ed3498,0,20001.0
3,684,20005,CAP,2.1,General description,20004.0,2.1 General description,40,ae7ffbb8-396b-4813-95f5-9f3eb67bcfa3,0,20004.0
4,685,20006,CAP,2.2,Qualitative and quantitative composition,20004.0,2.2 Qualitative and\r quantitative composition,44,054b75a3-b7a0-4877-b8f4-65cd9f0b5bc7,0,20004.0
5,686,20007,CAP,,Excipient(s) with known effect,20006.0,Excipient with known effect,50,708fa882-ab8b-49b7-8745-a2757401ec2f,0,20006.0
6,687,20008,CAP,3.0,PHARMACEUTICAL FORM,20001.0,3. PHARMACEUTICAL\r FORM,57,cd25c04e-2fec-477f-a67d-b894d46f03c9,0,20001.0
7,688,20009,CAP,4.0,CLINICAL PARTICULARS,20001.0,4. CLINICAL\r PARTICULARS,64,12a07622-b5ed-483d-bda8-96814b599bc9,0,20001.0
8,689,20010,CAP,4.1,Therapeutic indications,20009.0,4.1 Therapeutic\r indication,66,4dcf34cc-26ef-4720-87fa-435048832ef7,0,20009.0
9,690,20011,CAP,4.2,Posology and method of administration,20009.0,4.2 Posology and\r method of administration,70,68b9ace8-fc3c-429f-92b8-e983e9d1e388,0,20009.0


In [13]:
def convertToInt(x):
    try:
        return str(int(x))
    except:
        return x


def convertCollectionToDataFrame(collection):

    dfExtractedHier = pd.DataFrame(collection)
    dfExtractedHier['parent_id'] = dfExtractedHier['parent_id'].apply(
        lambda x: convertToInt(x))
    dfExtractedHier['id'] = dfExtractedHier['id'].apply(
        lambda x: convertToInt(x))

    return dfExtractedHier

In [7]:
import jellyfish

In [22]:
jaroWinklerScore = jellyfish.jaro_winkler_similarity('2.2 qualitative and quantitative composition'.title(), '2. qualitative and quantitative composition'.title())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vipsharm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
str(round(jaroWinklerScore, 3))

'0.994'