In [84]:
import os
import zipfile
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [85]:
import tracemalloc
import psutil
import pprint
import pandas as pd
import uuid
import json
import os
import glob
import re
import sys
from bs4 import NavigableString, BeautifulSoup
from collections import defaultdict
import random
import string
import time

from utils.config import config
from utils.logger.logger import loggerCreator

# ePI Modules
from parse.rulebook.rulebook import StyleRulesDictionary

from parse.extractor.parser import parserExtractor
from match.matchDocument.matchDocument import MatchDocument
from documentAnnotation.documentAnnotation import DocumentAnnotation
from htmlDocTypePartitioner.partition import DocTypePartitioner
from extractContentBetweenHeadings.dataBetweenHeadingsExtractor import DataBetweenHeadingsExtractor
from fhirXmlGenerator.fhirXmlGenerator import FhirXmlGenerator
from fhirService.fhirService import FhirService
from utils.logger.matchLogger import MatchLogger
from languageInfo.documentTypeNames.documentTypeNames import DocumentTypeNames


class FolderNotFoundError(Exception):
    """Raised when a folder the pipeline needs to read from does not exist."""

class Metrics:
    """Record elapsed time and memory usage per pipeline step in a CSV file.

    Each call to ``getMetric`` appends one row describing the step that just
    finished, then restarts the timer and ``tracemalloc`` so the next row only
    covers the next step. ``end`` appends a summary row and closes the file.

    Args:
        logFileName: Path of the CSV file. Rows are appended to it.
        logger: Object exposing ``logFlowCheckpoint(str)``; every row written
            to the CSV is also passed to it.
    """

    _CSV_HEADER = "StepName,Time,Current Memory,Peak Memory,Used Ram Percentage\n"

    def __init__(self, logFileName, logger):
        self.logFileName = logFileName
        self.logger = logger
        self.finalPeak = 0
        self.finalTotalTime = 0
        self.finalUsedRamPerc = 0

        # The file is opened in append mode, so only emit the header when the
        # file is new or empty; otherwise every run would add another header
        # row in the middle of the data.
        needsHeader = (not os.path.exists(self.logFileName)
                       or os.path.getsize(self.logFileName) == 0)
        self.writer = open(self.logFileName, 'a')
        if needsHeader:
            self.writer.write(self._CSV_HEADER)

        # Start measuring only once the file is open, so a failed open() does
        # not leave tracemalloc running.
        self.start()

    def start(self):
        """Start the step timer and begin tracing memory allocations."""
        self.startTime = time.time()
        tracemalloc.start()

    def getMetric(self, msg):
        """Append a row for the step named ``msg`` and reset the step counters.

        The row holds the step duration in minutes, the current and peak
        traced memory in MB and the system RAM usage percentage.
        """
        self.endTime = time.time()
        self.totalTime = self.endTime - self.startTime

        # tracemalloc reports bytes; the CSV stores (decimal) megabytes.
        current, peak = tracemalloc.get_traced_memory()
        current = current / 10**6
        peak = peak / 10**6

        # Same value as virtual_memory()[2], addressed by name.
        usedRamPerc = psutil.virtual_memory().percent

        # Running maxima / total that end() reports in the summary row.
        self.finalPeak = max(self.finalPeak, peak)
        self.finalUsedRamPerc = max(self.finalUsedRamPerc, usedRamPerc)
        self.finalTotalTime = self.finalTotalTime + self.totalTime

        outputString = f"{msg},{round(self.totalTime/60,4)} Min,{current} MB,{peak} MB,{usedRamPerc}%\n"

        self.logger.logFlowCheckpoint(f"{outputString}")

        print(f"Metrics : {outputString}")
        self.writer.write(outputString)
        # Flush so the rows written so far survive a crash in a later step.
        self.writer.flush()

        # Restart tracing so the next step's peak is measured from scratch.
        tracemalloc.stop()
        tracemalloc.start()
        self.startTime = time.time()

    def end(self):
        """Append the summary row, close the CSV file and stop tracemalloc.

        Calling ``end`` again after the file has been closed is a no-op.
        """
        if self.writer.closed:
            return

        current, _ = tracemalloc.get_traced_memory()
        current = current / 10**6
        outputString = f"Final Metrics,{round(self.finalTotalTime/60,4)} Min,{current} MB,{self.finalPeak} MB,{self.finalUsedRamPerc}%\n"
        print(f"Metrics : {outputString}")
        self.logger.logFlowCheckpoint(f"{outputString}")
        self.writer.write(outputString)
        self.writer.close()
        tracemalloc.stop()
        
        


def convertToInt(x):
    """Return ``str(int(x))`` when ``x`` converts to an int, else ``x`` unchanged.

    Values that ``int()`` rejects (for example ``None``, NaN or non-numeric
    text) are handed back untouched.
    """
    try:
        return str(int(x))
    except Exception:
        # Deliberately broad, but not a bare ``except``: KeyboardInterrupt and
        # SystemExit must still propagate.
        return x


def convertCollectionToDataFrame(collection):
    """Build a DataFrame from ``collection`` and normalise its id columns.

    Every value in the ``parent_id`` and ``id`` columns is passed through
    ``convertToInt``, so convertible values become integer strings and the
    rest are left as they are.
    """
    hierarchyFrame = pd.DataFrame(collection)

    for columnName in ('parent_id', 'id'):
        hierarchyFrame[columnName] = hierarchyFrame[columnName].apply(convertToInt)

    return hierarchyFrame

def getRandomString(N):
    """Return ``N`` characters picked at random from ASCII letters and digits."""
    alphabet = string.ascii_uppercase + string.digits + string.ascii_lowercase

    pickedCharacters = []
    for _ in range(N):
        pickedCharacters.append(random.choice(alphabet))

    return ''.join(pickedCharacters)


def _swapHtmlExtension(fileName, newExtension):
    """Return ``fileName`` with a trailing ``.htm``/``.html`` replaced by ``newExtension``.

    Names without such an extension are returned unchanged.
    """
    stem, extension = os.path.splitext(fileName)
    if extension in ('.htm', '.html'):
        return stem + newExtension
    return fileName


def convertHtmlToJson(controlBasePath, basePath, domain, procedureType, languageCode, htmlDocName, fileNameQrd, fileNameLog):
    """Convert the HTML document(s) matching ``htmlDocName`` in ``basePath`` to JSON.

    The JSON file and a companion style ``.txt`` file are written to
    ``<basePath>/outputJSON`` (created when missing). Both are named after
    ``htmlDocName`` with its ``.htm``/``.html`` extension swapped.

    Args:
        controlBasePath: Passed to ``StyleRulesDictionary``.
        basePath: Folder containing the HTML document.
        domain, procedureType, languageCode: Passed to the loggers and to
            ``StyleRulesDictionary``.
        htmlDocName: File name (or glob pattern) of the HTML document.
        fileNameQrd: Passed to ``StyleRulesDictionary`` as ``fileName``.
        fileNameLog: Passed to the loggers.

    Returns:
        tuple: ``(json file name without directory, path of the style file)``.

    Raises:
        FolderNotFoundError: If ``basePath`` does not exist.
        FileNotFoundError: If nothing in ``basePath`` matches ``htmlDocName``.
    """
    module_path = os.path.join(basePath)

    # Fail fast when the input folder is missing. No logger exists yet at this
    # point (the loggers are only created further down), so the condition is
    # reported through the exception alone.
    if not os.path.exists(module_path):
        raise FolderNotFoundError(module_path + " not found")

    filenames = glob.glob(os.path.join(module_path, htmlDocName))
    if not filenames:
        # Without this check the return statement below would fail with an
        # UnboundLocalError that hides the real cause.
        raise FileNotFoundError(
            f"No file matching {htmlDocName} found in {module_path}")

    # Create the outputJSON folder if it doesn't exist
    output_json_path = os.path.join(basePath, 'outputJSON')
    os.makedirs(output_json_path, exist_ok=True)

    logger = MatchLogger(f'Parser_{getRandomString(1)}', htmlDocName,
                         domain, procedureType, languageCode, "HTML", fileNameLog)

    styleLogger = MatchLogger(
        f'Style Dictionary_{getRandomString(1)}', htmlDocName, domain, procedureType, languageCode, "HTML", fileNameLog)

    styleRulesObj = StyleRulesDictionary(logger=styleLogger,
                                         controlBasePath=controlBasePath,
                                         language=languageCode,
                                         fileName=fileNameQrd,
                                         domain=domain,
                                         procedureType=procedureType
                                         )

    parserObj = parserExtractor(config, logger, styleRulesObj.styleRuleDict,
                                styleRulesObj.styleFeatureKeyList,
                                styleRulesObj.qrd_section_headings)

    # Only the file name has its extension swapped, so a '.htm' that happens
    # to appear in a directory name is left alone.
    style_filepath = os.path.join(output_json_path, _swapHtmlExtension(htmlDocName, '.txt'))
    output_filename = os.path.join(output_json_path, _swapHtmlExtension(htmlDocName, '.json'))

    # The output paths derive from htmlDocName, so every matched input is
    # written to the same pair of files.
    for input_filename in filenames:
        print("-------------",style_filepath,"-----------------")
        print(input_filename, output_filename)
        parserObj.createPIJsonFromHTML(input_filepath=input_filename,
                                       output_filepath=output_filename,
                                       style_filepath=style_filepath,
                                       img_base64_dict=parserObj.convertImgToBase64(input_filename)
                                       )

    # basename works for either separator style, unlike splitting on a
    # separator guessed from basePath.
    return os.path.basename(output_filename), style_filepath


def splitJson(controlBasePath, basePath, domain, procedureType, languageCode, fileNameJson, fileNameQrd, fileNameLog):
    """Partition ``<basePath>/outputJSON/<fileNameJson>`` by QRD section headings.

    Loads the style rules to obtain the QRD section headings and passes them,
    together with the JSON path, to ``DocTypePartitioner.partitionHtmls``.

    Returns:
        Whatever ``partitionHtmls`` returns (the caller treats it as a list of
        partition file paths).
    """
    rulesDictionary = StyleRulesDictionary(
        logger=MatchLogger(
            f'Style Dictionary_{getRandomString(1)}', fileNameJson, domain,
            procedureType, languageCode, "Json", fileNameLog),
        controlBasePath=controlBasePath,
        language=languageCode,
        fileName=fileNameQrd,
        domain=domain,
        procedureType=procedureType,
    )

    jsonFilePath = os.path.join(basePath, 'outputJSON', fileNameJson)
    print("PathJson", jsonFilePath)

    loggerForPartition = MatchLogger(
        f'Partition_{getRandomString(1)}', fileNameJson, domain,
        procedureType, languageCode, "Json", fileNameLog)

    return DocTypePartitioner(loggerForPartition).partitionHtmls(
        rulesDictionary.qrd_section_headings, jsonFilePath)


def extractAndValidateHeadings(controlBasePath,
                                basePath,
                                domain,
                                procedureType,
                                languageCode,
                                documentNumber,
                                fileNameDoc,
                                fileNameQrd,
                                fileNameMatchRuleBook,
                                fileNameDocumentTypeNames,
                                fileNameLog,
                                stopWordFilterLen=6,
                                isPackageLeaflet=False,
                                medName=None
                                ):
    """Match the headings of one partitioned document against the QRD template.

    The number of top and bottom headings considered depends on
    ``documentNumber``; the values are forwarded to ``MatchDocument`` together
    with the stop-word language obtained from ``DocumentTypeNames``.

    Returns:
        tuple: ``(df, coll, documentType)`` as produced by
        ``MatchDocument.matchHtmlHeaddingsWithQrd``.
    """
    # (documentNumber, (top headings, bottom headings)); any other document
    # number falls back to (5, 10).
    headingWindows = (
        (0, (4, 6)),
        (1, (3, 5)),
        (2, (5, 15)),
    )
    topHeadingsConsidered, bottomHeadingsConsidered = next(
        (window for number, window in headingWindows if documentNumber == number),
        (5, 10),
    )

    print(f"Starting Heading Extraction For File :- {fileNameDoc}")
    headingLogger = MatchLogger(f"Heading Extraction {fileNameDoc}_{getRandomString(1)}", fileNameDoc, domain, procedureType, languageCode, documentNumber, fileNameLog)
    headingLogger.logFlowCheckpoint("Starting Heading Extraction")

    documentTypeNames = DocumentTypeNames(
        controlBasePath=controlBasePath,
        fileNameDocumentTypeNames=fileNameDocumentTypeNames,
        languageCode=languageCode,
        domain=domain,
        procedureType=procedureType,
        documentNumber=documentNumber,
    )
    stopWordlanguage = documentTypeNames.extractStopWordLanguage()

    matcher = MatchDocument(
        headingLogger,
        controlBasePath,
        basePath,
        domain,
        procedureType,
        languageCode,
        documentNumber,
        fileNameDoc,
        fileNameQrd,
        fileNameMatchRuleBook,
        fileNameDocumentTypeNames,
        topHeadingsConsidered,
        bottomHeadingsConsidered,
        stopWordFilterLen,
        stopWordlanguage,
        isPackageLeaflet,
        medName)
    headingsFrame, headingsCollection, documentType = matcher.matchHtmlHeaddingsWithQrd()

    return headingsFrame, headingsCollection, documentType


def parseDocument(controlBasePath, basePath, htmlDocName, fileNameQrd, fileNameMatchRuleBook, fileNameDocumentTypeNames, medName=None):
    """Run the whole flow for one HTML document.

    Steps: HTML -> JSON, split the JSON into partitions, then for each
    processed partition: heading extraction, document annotation, content
    extraction, FHIR XML generation and XML submission. Timing and memory
    figures are appended to ``<basePath>/Metrics.csv`` and flow checkpoints
    are logged to ``<basePath>/FinalLog.txt``.

    Args:
        controlBasePath: Folder holding the control files; forwarded to the
            individual steps.
        basePath: Working folder of the document. Its last five components
            must be ``<domain>/<procedureType>/<medName>/<languageCode>/<timestamp>``.
        htmlDocName: File name of the HTML document inside ``basePath``.
        fileNameQrd, fileNameMatchRuleBook, fileNameDocumentTypeNames:
            Control file names forwarded to the individual steps.
        medName: Accepted for compatibility; the value actually used is the
            one read from ``basePath``.

    Returns:
        ``(df, coll, dfExtractedHierRR)`` for the first partition that is
        processed, or ``None`` when no partition is processed.

    Raises:
        ValueError: If ``basePath`` has fewer than five components.
    """
    fileNameLog = os.path.join(basePath, 'FinalLog.txt')

    # Normalise only for component extraction, so a trailing or doubled
    # separator does not shift domain/procedureType/... by one position.
    normalisedBasePath = os.path.normpath(basePath)
    pathSep = "/" if "/" in normalisedBasePath else "\\"
    pathComponents = normalisedBasePath.split(pathSep)
    print(pathComponents, htmlDocName)
    if len(pathComponents) < 5:
        raise ValueError(
            f"basePath must end with <domain>/<procedureType>/<medName>/<languageCode>/<timestamp>: {basePath}")

    timestamp = pathComponents[-1]
    languageCode = pathComponents[-2]
    # NOTE(review): this overwrites the ``medName`` argument, as the original
    # code did; confirm whether an explicitly passed value should win.
    medName = pathComponents[-3]
    procedureType = pathComponents[-4]
    domain = pathComponents[-5]

    print(timestamp, languageCode, medName, procedureType, domain)

    # SECURITY NOTE(review): the fallback literals below are a hard-coded key
    # and endpoint passed to DocumentAnnotation (presumably an API
    # subscription key and its base URL - confirm). They can now be
    # overridden through environment variables; the literal key should be
    # rotated and removed from source.
    pmsKey = os.environ.get('EPI_PMS_SUBSCRIPTION_KEY', 'c20835db4b1b4e108828a8537ff41506')
    pmsBaseUrl = os.environ.get('EPI_PMS_API_BASE_URL', 'https://spor-sit.azure-api.net/pms/api/v2/')

    # Partitions with these positions are skipped; the partition at
    # packageLeafletIndex is processed with package-leaflet settings.
    skippedPartitionIndexes = (0, 1, 2)
    packageLeafletIndex = 3

    flowLogger = MatchLogger(f"Flow Logger HTML_{getRandomString(1)}", htmlDocName, domain, procedureType, languageCode, "HTML", fileNameLog)

    metrics = Metrics(os.path.join(basePath, 'Metrics.csv'), flowLogger)

    # try/finally guarantees the metrics file is closed and tracemalloc is
    # stopped on the early return inside the loop and on any exception.
    try:
        flowLogger.logFlowCheckpoint("Starting HTML Conversion To Json")
        ###Convert Html to Json
        fileNameJson, stylesFilePath = convertHtmlToJson(controlBasePath, basePath, domain, procedureType, languageCode, htmlDocName, fileNameQrd, fileNameLog)

        print("stylePath:-", stylesFilePath)
        flowLogger.logFlowCheckpoint("Completed HTML Conversion To Json")
        metrics.getMetric("HTML Conversion To Json")

        flowLogger.logFlowCheckpoint("Starting Json Split")

        ###Split Uber Json to multiple Jsons for each category.
        partitionedJsonPaths = splitJson(controlBasePath, basePath, domain, procedureType, languageCode, fileNameJson, fileNameQrd, fileNameLog)

        # Keep only the file names; basename copes with either separator.
        partitionedJsonPaths = [os.path.basename(path) for path in partitionedJsonPaths]
        flowLogger.logFlowCheckpoint(str(partitionedJsonPaths))

        flowLogger.logFlowCheckpoint("Completed Json Split")
        metrics.getMetric("Split Json")

        flowLogger.logFlowCheckpoint("Started Processing Partitioned Jsons")

        # Pre-bind so the final checkpoint works when there are no partitions.
        index = None
        for index, fileNamePartitioned in enumerate(partitionedJsonPaths):
            print("Index", index)
            if index in skippedPartitionIndexes:
                continue
            flowLogger.logFlowCheckpoint(f"\n\n\n\n||||||||||||||||||||||||||||||||{str(index)} ||||| {str(fileNamePartitioned)}||||||||||||||||||||||||||||||||\n\n\n\n")

            if index == packageLeafletIndex:
                stopWordFilterLen = 100
                isPackageLeaflet = True
            else:
                stopWordFilterLen = 6
                isPackageLeaflet = False

            df, coll, documentType = extractAndValidateHeadings(controlBasePath,
                                        basePath,
                                        domain,
                                        procedureType,
                                        languageCode,
                                        index,
                                        fileNamePartitioned,
                                        fileNameQrd,
                                        fileNameMatchRuleBook,
                                        fileNameDocumentTypeNames,
                                        fileNameLog,
                                        stopWordFilterLen=stopWordFilterLen,
                                        isPackageLeaflet=isPackageLeaflet,
                                        medName=medName)

            print("Completed Heading Extraction For File")
            flowLogger.logFlowCheckpoint("Completed Heading Extraction For File")
            metrics.getMetric(f"{index}: Heading Extraction")

            print(f"Starting Document Annotation For File :- {fileNamePartitioned}")
            flowLogger.logFlowCheckpoint("Starting Document Annotation For File")
            documentAnnotationObj = DocumentAnnotation(fileNamePartitioned, pmsKey, pmsBaseUrl, df, coll)
            try:
                pms_oms_annotation_data = documentAnnotationObj.processRegulatedAuthorizationForDoc()
                print(pms_oms_annotation_data)
            except Exception as annotationError:
                # Annotation stays best-effort, but the reason is now logged
                # instead of being dropped silently.
                pms_oms_annotation_data = None
                print("Error Found")
                flowLogger.logFlowCheckpoint(f"Document Annotation Failed: {annotationError!r}")

            print("Completed Document Annotation")
            flowLogger.logFlowCheckpoint("Completed Document Annotation")
            metrics.getMetric(f"{index}: Document Annotation")

            print(f"Starting Extracting Content Between Heading For File :- {fileNamePartitioned}")
            flowLogger.logFlowCheckpoint("Starting Extracting Content Between Heading")

            extractContentlogger = MatchLogger(f'ExtractContentBetween_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)
            extractorObj = DataBetweenHeadingsExtractor(extractContentlogger, basePath, coll)
            dfExtractedHierRR = extractorObj.extractContentBetweenHeadings(fileNamePartitioned)

            print("Completed Extracting Content Between Heading")
            flowLogger.logFlowCheckpoint("Completed Extracting Content Between Heading")
            metrics.getMetric(f"{index}: Content Extraction")

            xmlLogger = MatchLogger(f'XmlGeneration_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)
            fhirXmlGeneratorObj = FhirXmlGenerator(xmlLogger, controlBasePath, basePath, pms_oms_annotation_data, stylesFilePath, medName)
            fileNameXml = fileNamePartitioned.replace('.json', '.xml')
            generatedXml = fhirXmlGeneratorObj.generateXml(dfExtractedHierRR, fileNameXml)

            metrics.getMetric(f"{index}: Generate XML")

            fhirServiceLogger = MatchLogger(f'XML Submission Logger_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)

            fhirServiceObj = FhirService(fhirServiceLogger, basePath, generatedXml)
            fhirServiceObj.submitFhirXml()

            metrics.getMetric(f"{index}: Submit FHIR Msg")

            print(f"Created XML File For :- {fileNamePartitioned}")

            # NOTE(review): returning here stops after the first processed
            # partition, so later partitions are never handled. Preserved
            # because callers may rely on this return value; confirm intent.
            return df, coll, dfExtractedHierRR

        flowLogger.logFlowCheckpoint("Completed Processing Partitioned Jsons")
        metrics.getMetric(f"{index}: Completed")
    finally:
        metrics.end()

In [3]:
# Convert the Word documents to HTML with the project's WordToHtmlConvertor.
# NOTE(review): judging by the recorded output of this cell, the call cleans
# each Word file, zips the result, uploads it to Azure Storage and deletes the
# input file - confirm against the converter's own documentation.
# NOTE(review): this cell's execution count (3) is out of sequence with the
# surrounding cells (84-88), and its recorded output ends with
# psutil.NoSuchProcess; re-check it under Restart & Run All.
from wordToHtmlConvertor.wordToHtmlConvertor import WordToHtmlConvertor

wordToHtmlConvertorObj = WordToHtmlConvertor()
wordToHtmlConvertorObj.convertWordToHTML()

2021-05-19 00:50:50,658 : WordToHtmlLogger_M : Input file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\ABASAGLAR~H~CAP~en.docx | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx
2021-05-19 00:50:50,659 : WordToHtmlLogger_M : Output file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\ABASAGLAR\en\2021-05-18T19-20-50Z\ABASAGLAR_clean | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx


Word Files in folder:  ['ABASAGLAR~H~CAP~en.docx', 'Abilify Maintena~H~CAP~en.doc', 'ABILIFY~H~CAP~en.doc', 'Adakveo~H~CAP~en.docx', 'Adcetris~H~CAP~en.doc', '~$ilify Maintena~H~CAP~en.doc']
Input file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\ABASAGLAR~H~CAP~en.docx
Output file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\ABASAGLAR\en\2021-05-18T19-20-50Z\ABASAGLAR_clean


2021-05-19 00:51:09,654 : WordToHtmlLogger_M : Opened file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\ABASAGLAR~H~CAP~en.docx | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx


Opened file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\ABASAGLAR~H~CAP~en.docx


2021-05-19 00:51:11,212 : WordToHtmlLogger_M : Starting document cleaning process | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx



Checking table 20
The selection starts on page 105 of 106 (69.44999694824219/70.75)
The selection ends on page 105 of 106 (384.8500061035156/70.75)
The selection contains
* overlay images

Checking table 19
The selection starts on page 103 of 106 (579.2000122070312/70.75)
The selection ends on page 105 of 106 (56.70000076293945/70.75)
The selection contains
* overlay images

Checking table 18
The selection starts on page 102 of 105 (564.5499877929688/214.75)
The selection ends on page 103 of 105 (421.79998779296875/70.75)
The selection contains
* overlay images
* overlay shapes

Checking table 17
The selection starts on page 102 of 105 (56.70000076293945/70.75)
The selection ends on page 102 of 105 (475.95001220703125/70.75)
The selection contains
* overlay images

Checking table 16
The selection starts on page 100 of 105 (577.9000244140625/70.75)
The selection ends on page 101 of 105 (484.25/70.75)
The selection contains
* overlay images

Checking table 15
The selection starts on pag

2021-05-19 00:51:51,072 : WordToHtmlLogger_M : Completed document cleaning process | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx
2021-05-19 00:51:51,078 : WordToHtmlLogger_M : Preparing zip file | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx
2021-05-19 00:51:52,742 : WordToHtmlLogger_M : Zip file created: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\ABASAGLAR~H~CAP~en~2021-05-18T19-20-50Z.zip | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx
2021-05-19 00:51:52,746 : WordToHtmlLogger_M : Uploading to Azure Storage as blob:
	F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\ABASAGLAR~H~CAP~en~2021-05-18T19-20-50Z.zip | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx



Uploading File to  Azure Storage:
	F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\ABASAGLAR~H~CAP~en~2021-05-18T19-20-50Z.zip


2021-05-19 00:52:05,189 : WordToHtmlLogger_M : Uploaded F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\ABASAGLAR~H~CAP~en~2021-05-18T19-20-50Z.zipsuccessfully | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx
2021-05-19 00:52:05,190 : WordToHtmlLogger_M : Deleting input word file: ABASAGLAR~H~CAP~en.docx | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx
2021-05-19 00:52:05,217 : WordToHtmlLogger_2 : Input file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\Abilify Maintena~H~CAP~en.doc | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc
2021-05-19 00:52:05,218 : WordToHtmlLogger_2 : Output file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Abilify Maintena\en\2021-05-18T19-22-05Z\Abilify Maintena_clean | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc


Input file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\Abilify Maintena~H~CAP~en.doc
Output file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Abilify Maintena\en\2021-05-18T19-22-05Z\Abilify Maintena_clean


2021-05-19 00:54:18,892 : WordToHtmlLogger_2 : Opened file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\Abilify Maintena~H~CAP~en.doc | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc


Opened file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\Abilify Maintena~H~CAP~en.doc


2021-05-19 00:54:20,499 : WordToHtmlLogger_2 : Starting document cleaning process | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc



Checking table 23
The selection starts on page 95 of 95 (619.3499755859375/70.75)
The selection ends on page 95 of 95 (737.2000122070312/70.75)
The selection contains
* inline images

Checking table 22
The selection starts on page 95 of 95 (303.29998779296875/70.75)
The selection ends on page 95 of 95 (480.3999938964844/70.75)
The selection contains
* inline images

Checking table 21
The selection starts on page 95 of 95 (107.05000305175781/70.75)
The selection ends on page 95 of 95 (227.4499969482422/70.75)
The selection contains
* inline images

Checking table 20
The selection starts on page 94 of 95 (653.0999755859375/70.75)
The selection ends on page 95 of 95 (56.70000076293945/70.75)
The selection contains
* inline images

Checking table 19
The selection starts on page 94 of 95 (494.45001220703125/70.75)
The selection ends on page 94 of 95 (627.5999755859375/70.75)
The selection contains
* inline images

Checking table 18
The selection starts on page 94 of 95 (353.0/70.75)
The se

2021-05-19 00:54:46,529 : WordToHtmlLogger_2 : Completed document cleaning process | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc
2021-05-19 00:54:46,532 : WordToHtmlLogger_2 : Preparing zip file | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc
2021-05-19 00:54:47,821 : WordToHtmlLogger_2 : Zip file created: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\Abilify Maintena~H~CAP~en~2021-05-18T19-22-05Z.zip | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc
2021-05-19 00:54:47,825 : WordToHtmlLogger_2 : Uploading to Azure Storage as blob:
	F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\Abilify Maintena~H~CAP~en~2021-05-18T19-22-05Z.zip | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc



Uploading File to  Azure Storage:
	F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\Abilify Maintena~H~CAP~en~2021-05-18T19-22-05Z.zip


2021-05-19 00:54:59,563 : WordToHtmlLogger_2 : Uploaded F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\Abilify Maintena~H~CAP~en~2021-05-18T19-22-05Z.zipsuccessfully | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc
2021-05-19 00:54:59,564 : WordToHtmlLogger_2 : Deleting input word file: Abilify Maintena~H~CAP~en.doc | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc
2021-05-19 00:54:59,573 : WordToHtmlLogger_2 : Killing Word processes as exception was raised | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc


Exception raised
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE


NoSuchProcess: psutil.NoSuchProcess process no longer exists (pid=34304)

In [86]:
# Name of the zip archive to process, and the "inputblob" folder that sits in
# the parent of the current working directory and holds it.
inputZipFileName = "Adakveo~H~CAP~en~2021-05-12T12-47-53Z.zip"
inputZipFolderPath = os.path.join(os.path.abspath('..'), 'inputblob')

In [87]:
# Control file names and the mount point used outside the local environment.
fileNameQrd = 'qrd_canonical_model.csv'
fileNameMatchRuleBook = 'ruleDict.json'
fileNameDocumentTypeNames = 'documentTypeNames.json'
fsMountName = '/mounted'

# The zip name must look like
# <medName>~<domain>~<procedureType>~<languageCode>~<timestamp>.zip
info = inputZipFileName.split("~")

if len(info) < 5:
    # Raise a real exception: ``raise "<str>"`` is itself a TypeError.
    raise ValueError(f"Missing required info in the zip file name {inputZipFileName}")

medName, domain, procedureType, languageCode, timestamp = info[:5]
timestamp = timestamp.replace(".zip","")

# A backslash in the working directory is taken to mean a local (Windows) run.
projectRootPath = os.path.abspath('..')
if "\\" in os.getcwd():
    localEnv = True
    inputZipFolderPath = os.path.join(projectRootPath, inputZipFolderPath)
    outputFolderPath = os.path.join(projectRootPath, 'work', f"{domain}", f"{procedureType}", f"{medName}", f"{languageCode}", f"{timestamp}")
    controlFolderPath = os.path.join(projectRootPath, 'control')
else:
    localEnv = False
    inputZipFolderPath = os.path.join(f'{fsMountName}', inputZipFolderPath)
    outputFolderPath = os.path.join(f'{fsMountName}', 'work', f"{domain}", f"{procedureType}", f"{medName}", f"{languageCode}", f"{timestamp}")
    controlFolderPath = os.path.join(f'{fsMountName}', 'control')


print(inputZipFileName, inputZipFolderPath, outputFolderPath, controlFolderPath)

# Directories need the search (execute) bit to be usable on POSIX; 0o666
# would create folders that cannot be entered. 0o777 is the os.makedirs
# default and is still reduced by the process umask.
mode = 0o777

if localEnv is True:
    inputZipFolderPath = inputZipFolderPath.replace("/","\\")
    outputFolderPath = outputFolderPath.replace("/","\\")
    controlFolderPath = controlFolderPath.replace("/","\\")

requiredFolderPaths = (inputZipFolderPath, outputFolderPath, controlFolderPath)
if any(os.path.isdir(folderPath) for folderPath in requiredFolderPaths):
    print("Already Present")

# Create each folder independently: with a single try block, the first
# existing folder aborted creation of the remaining ones.
for folderPath in requiredFolderPaths:
    os.makedirs(folderPath, mode, exist_ok=True)

with zipfile.ZipFile(os.path.join(inputZipFolderPath, inputZipFileName), "r") as zip_ref:
    zip_ref.extractall(outputFolderPath)


# Pick the first extracted file whose name contains ".htm".
_,_,fileNames = next(os.walk(outputFolderPath))
htmlFileNames = [fileName for fileName in fileNames if ".htm" in fileName]
if not htmlFileNames:
    raise FileNotFoundError(
        f"No .htm/.html file found in {outputFolderPath} after extracting {inputZipFileName}")
htmlFileName = htmlFileNames[0]

print(htmlFileName)



Adakveo~H~CAP~en~2021-05-12T12-47-53Z.zip F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z F:\Projects\EMA\Repository\EMA EPI PoC\function_code\control
Already Present
Adakveo_clean.htm


In [88]:
# Run the full pipeline on the HTML file found in the output folder, using the
# folder paths and control file names set in the configuration cell above.
parseDocument(controlFolderPath, outputFolderPath, htmlFileName, fileNameQrd, fileNameMatchRuleBook, fileNameDocumentTypeNames, medName)

2021-05-20 23:15:13,269 : Flow Logger HTML_K : Starting HTML Conversion To Json | H | CAP |  en | HTML | Adakveo_clean.htm
2021-05-20 23:15:13,269 : Flow Logger HTML_K : Starting HTML Conversion To Json | H | CAP |  en | HTML | Adakveo_clean.htm
2021-05-20 23:15:13,281 : Style Dictionary_o : Reading style dictionary in file: rule_dictionary_en.json | H | CAP |  en | HTML | Adakveo_clean.htm
2021-05-20 23:15:13,332 : Style Dictionary_o : Qrd Section Keys Retrieved For Style Dictionary: ANNEX I, ANNEX II, ANNEX III, B. PACKAGE LEAFLET | H | CAP |  en | HTML | Adakveo_clean.htm


['F:', 'Projects', 'EMA', 'Repository', 'EMA EPI PoC', 'function_code', 'work', 'H', 'CAP', 'Adakveo', 'en', '2021-05-12T12-47-53Z'] Adakveo_clean.htm
2021-05-12T12-47-53Z en Adakveo CAP H
------------- F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z\outputJSON\Adakveo_clean.txt -----------------
F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z\Adakveo_clean.htm F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z\outputJSON\Adakveo_clean.json


2021-05-20 23:15:13,887 : Parser_l : Style Information Stored In File: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z\outputJSON\Adakveo_clean.txt | H | CAP |  en | HTML | Adakveo_clean.htm
2021-05-20 23:15:17,411 : Parser_l : Writing to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z\outputJSON\Adakveo_clean.json | H | CAP |  en | HTML | Adakveo_clean.htm
2021-05-20 23:15:17,544 : Flow Logger HTML_K : Completed HTML Conversion To Json | H | CAP |  en | HTML | Adakveo_clean.htm
2021-05-20 23:15:17,544 : Flow Logger HTML_K : Completed HTML Conversion To Json | H | CAP |  en | HTML | Adakveo_clean.htm
2021-05-20 23:15:17,547 : Flow Logger HTML_K : HTML Conversion To Json,0.0713 Min,5.493739 MB,8.243986 MB,73.2%
 | H | CAP |  en | HTML | Adakveo_clean.htm
2021-05-20 23:15:17,547 : Flow Logger HTML_K : HTML Conversion To Json,0.0713 Min,5.493739 MB,8.243986 MB,73.2%
 | H | CAP |  en | HTM

stylePath:- F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z\outputJSON\Adakveo_clean.txt
Metrics : HTML Conversion To Json,0.0713 Min,5.493739 MB,8.243986 MB,73.2%

PathJson F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z\outputJSON\Adakveo_clean.json


2021-05-20 23:15:18,224 : Partition_Q : Writing partition to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z\partitionedJSONs\Adakveo_clean_SmPC.json | H | CAP |  en | Json | Adakveo_clean.json
2021-05-20 23:15:18,224 : Partition_Q : Writing partition to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z\partitionedJSONs\Adakveo_clean_SmPC.json | H | CAP |  en | Json | Adakveo_clean.json
2021-05-20 23:15:18,286 : Partition_Q : Writing partition to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z\partitionedJSONs\Adakveo_clean_ANNEX II.json | H | CAP |  en | Json | Adakveo_clean.json
2021-05-20 23:15:18,286 : Partition_Q : Writing partition to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z\partitionedJSONs\Adakveo_clean_ANNEX II.json | H | CAP |  en | Json | Adakveo_clean.json


Metrics : Split Json,0.0137 Min,0.294985 MB,4.908939 MB,73.3%

Index 0
Index 1
Index 2
Index 3
Starting Heading Extraction For File :- Adakveo_clean_ PACKAGE LEAFLET.json
File being processed: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Adakveo\en\2021-05-12T12-47-53Z\partitionedJSONs\Adakveo_clean_ PACKAGE LEAFLET.json
--------------------------------------------
Package leaflet


2021-05-20 23:15:18,855 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Started Extracting Heading | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json
2021-05-20 23:15:18,873 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Match Passed : <=4|16.67|(91, 100, 95)|0.913| | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'B. PACKAGE LEAFLET' | Qrd txt :- 'PACKAGE LEAFLET' | Matched :- 'True'
2021-05-20 23:15:18,881 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Passed As This The First Heading | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23001' | prevHeadingCurrId :- '' | prevHeadingFoundId :- ''
2021-05-20 23:15:19,695 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Match Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'This medicine is subject to additional monitoring. This will allow quick identification of new safety information. You can

----------------------------------
RemovedByStyle
----------------------------------


2021-05-20 23:15:22,983 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Match Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'What is in this leaflet' | Qrd txt :- 'What is in this leaflet' | Matched :- 'True'
2021-05-20 23:15:22,994 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Flow Is Broken | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23003' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23001'
2021-05-20 23:15:23,001 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23003' | prevHeadingCurrId :- '23001' | prevHeadingFoundId :- '23001'
2021-05-20 23:15:23,045 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Match Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | Doc txt :- '1. What Adakveo is and what it is used for' | Qrd txt :- '1. What Adakveo is 

----------------------------------
RemovedByStyle
----------------------------------


 you need to know before you are given Adakveo' | Qrd txt :- '2. What you need to know before you <take> <use> Adakveo ' | Matched :- 'True' | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | Doc txt :- '2.       What
2021-05-20 23:15:23,400 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Flow Is Broken | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23005' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23003'
2021-05-20 23:15:23,410 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23005' | prevHeadingCurrId :- '23003' | prevHeadingFoundId :- '23003'
2021-05-20 23:15:23,423 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Failed By Style | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23005' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23003'


----------------------------------
RemovedByStyle
----------------------------------


2021-05-20 23:15:24,036 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Match Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | Doc txt :- '4. Possible side effects' | Qrd txt :- '4. Possible side effects' | Matched :- 'True'
2021-05-20 23:15:24,049 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Flow Is Broken | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23019' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23003'
2021-05-20 23:15:24,060 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23019' | prevHeadingCurrId :- '23003' | prevHeadingFoundId :- '23003'
2021-05-20 23:15:24,067 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Failed By Style | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23019' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '

----------------------------------
RemovedByStyle
----------------------------------


2021-05-20 23:15:24,278 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Match Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | Doc txt :- '5. How to store Adakveo' | Qrd txt :- '5. How to store Adakveo' | Matched :- 'True'
2021-05-20 23:15:24,292 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Flow Is Broken | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23022' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23003'
2021-05-20 23:15:24,300 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23022' | prevHeadingCurrId :- '23003' | prevHeadingFoundId :- '23003'
2021-05-20 23:15:24,308 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Failed By Style | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23022' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23

----------------------------------
RemovedByStyle
----------------------------------


2021-05-20 23:15:24,596 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Match Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | Doc txt :- '6. Contents of the pack and other information' | Qrd txt :- '6. Contents of the pack and other information' | Matched :- 'True'
2021-05-20 23:15:24,611 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Flow Is Broken | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23023' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23003'
2021-05-20 23:15:24,618 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23023' | prevHeadingCurrId :- '23003' | prevHeadingFoundId :- '23003'
2021-05-20 23:15:24,625 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Failed By Style | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23023' | prevHea

----------------------------------
RemovedByStyle
----------------------------------


 called monoclonal antibodies (mAbs).' | Qrd txt :- 'Adakveo contains {name the excipient(s)}' | Matched :- 'True'100, 100)|1.0| | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'Adakveo contains
2021-05-20 23:15:25,263 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Flow Is Broken | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23013' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23004'
2021-05-20 23:15:25,272 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23013' | prevHeadingCurrId :- '23004' | prevHeadingFoundId :- '23004'
2021-05-20 23:15:25,279 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Failed By Style | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23013' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23004'


----------------------------------
RemovedByStyle
----------------------------------


2021-05-20 23:15:28,776 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Match Passed : Contains<>|15.09|(88, 85, 88)|0.968| | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | Doc txt :- '2.       What you need to know before you are given Adakveo' | Qrd txt :- '2. What you need to know before you <take> <use> Adakveo ' | Matched :- 'True'
2021-05-20 23:15:28,792 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23005' | prevHeadingCurrId :- '23004' | prevHeadingFoundId :- '23004'
2021-05-20 23:15:29,776 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Flow Is Broken | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23007' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23005'
2021-05-20 23:15:29,783 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Passed | H | CAP |  en | 3 | Adakveo_clean

2021-05-20 23:16:04,497 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Flow Is Broken | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23027' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '23025'
2021-05-20 23:16:04,508 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23027' | prevHeadingCurrId :- '23025' | prevHeadingFoundId :- '23025'
 was last revised in' | Qrd txt :- 'This leaflet was last revised in <{MM/YYYY}><{month YYYY}>.' | Matched :- 'True'38|(70, 100, 90)|0.938| | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | Doc txt :- 'This leaflet
2021-05-20 23:16:04,759 : Heading Extraction Adakveo_clean_ PACKAGE LEAFLET.json_0 : Validation Passed | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json | currHeadId :- '23028' | prevHeadingCurrId :- '23027' | prevHeadingFoundId :- '23027'
2021-05-20 23:16:06,546 : He



Heading Not Found 
 ['q This medicine is subject to additional monitoring. This will allow quick identification of new safety information. You can help by reporting any side effects you may get. See the end of section 4 for how to report side effects.', 'Do not <take> <use> X', 'X with <food> <and> <,> <drink> <and> <alcohol>', 'How to <take> <use> X ', 'Use in children <and adolescents>', 'If you <take> <use> more X than you should', 'If you forget to <take> <use> X>', 'If you stop <taking> <using> X>', 'Additional side effects in children <and adolescents>', 'Marketing Authorisation Holder and Manufacturer']


dict_keys(['q This medicine is subject to additional monitoring. This will allow quick identification of new safety information. You can help by reporting any side effects you may get. See the end of section 4 for how to report side effects.', '1. What Adakveo is and what it is used for', '2. What you need to know before you <take> <use> Adakveo ', '4. Possible side effects',

2021-05-20 23:16:24,083 : XmlGeneration_3_3 : Writing to File:Adakveo_clean_ PACKAGE LEAFLET.xml | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json
2021-05-20 23:16:24,086 : Flow Logger HTML_K : 3: Generate XML,0.0036 Min,1.498816 MB,2.045456 MB,72.9%
 | H | CAP |  en | HTML | Adakveo_clean.htm
2021-05-20 23:16:24,086 : Flow Logger HTML_K : 3: Generate XML,0.0036 Min,1.498816 MB,2.045456 MB,72.9%
 | H | CAP |  en | HTML | Adakveo_clean.htm


Metrics : 3: Generate XML,0.0036 Min,1.498816 MB,2.045456 MB,72.9%



2021-05-20 23:16:25,738 : XML Submission Logger_3_W : Initiating Submission To FHIR Server | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json
2021-05-20 23:16:25,739 : XML Submission Logger_3_W : Response{"resourceType":"Bundle","id":"bf2051e5-4010-4001-8a4d-513955ad20f1","meta":{"versionId":"1","lastUpdated":"2021-05-20T17:46:24.618+00:00"},"type":"collection","entry":[{"fullUrl":"urn:uuid:9854550e-3842-4054-879a-4513c0e718ea","resource":{"resourceType":"Bundle","id":"47dc342f-423f-4518-8e59-646214107035","identifier":{"system":"http://ema.europa.eu/fhir/identifier/documentid","value":"${instance.bundle[n].Identifier}"},"type":"document","timestamp":"2021-05-20T17:46:23+00:00","entry":[{"fullUr | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json
2021-05-20 23:16:25,742 : XML Submission Logger_3_W : POST sucessful: XML added with id: bf2051e5-4010-4001-8a4d-513955ad20f1 | H | CAP |  en | 3 | Adakveo_clean_ PACKAGE LEAFLET.json
2021-05-20 23:16:25,743 : Flow Logger HTML_K :

POST sucessful: XML added with id bf2051e5-4010-4001-8a4d-513955ad20f1
Metrics : 3: Submit FHIR Msg,0.0275 Min,0.057005 MB,0.554936 MB,72.9%

Created XML File For :- Adakveo_clean_ PACKAGE LEAFLET.json


(      Bold               Classes  \
 0    False                  None   
 1    False         ['MsoNormal']   
 2    False         ['MsoNormal']   
 3    False         ['MsoNormal']   
 4    False         ['MsoNormal']   
 5    False         ['MsoNormal']   
 6    False         ['MsoNormal']   
 7    False         ['MsoNormal']   
 8    False         ['MsoNormal']   
 9    False         ['MsoNormal']   
 10   False         ['MsoNormal']   
 11   False         ['MsoNormal']   
 12   False         ['MsoNormal']   
 13   False         ['MsoNormal']   
 14   False         ['MsoNormal']   
 15   False         ['MsoNormal']   
 16   False         ['MsoNormal']   
 17   False         ['MsoNormal']   
 18   False         ['MsoNormal']   
 19   False         ['MsoNormal']   
 20   False         ['MsoNormal']   
 21   False         ['MsoNormal']   
 22   False         ['MsoNormal']   
 23   False         ['MsoNormal']   
 24   False         ['MsoNormal']   
 25    True         ['MsoNormal']   
 

In [20]:
a

Unnamed: 0,Bold,Classes,Element,HasBorder,ID,Indexed,IsHeadingType,IsListItem,IsPossibleHeading,Italics,ParentId,Styles,Text,Underlined,Uppercased,StringLength
0,True,['MsoNormal'],"<p align=""center"" class=""MsoNormal"" style=""margin-top:0in;margin-right:-.1pt;\r margin-bottom:0in;margin-left:27.0pt;text-align:center;text-indent:-27.0pt;\r line-height:normal""><b><span lang=""EN-...",False,320efb82-a156-45a3-afdd-0e3a7e54d6da,False,,False,True,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:-.1pt;\r\nmargin-bottom:0in;margin-left:27.0pt;text-align:center;text-indent:-27.0pt;\r\nline-height:normal,ANNEX\r II,False,True,8
1,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:-.1pt;margin-bottom:0in;\r margin-left:0in;line-height:normal""></p>",False,f79ed8ed-bfc4-4e8b-bd49-ede302291e8a,False,,False,False,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:-.1pt;margin-bottom:0in;\r\nmargin-left:0in;line-height:normal,,False,False,0
2,True,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""><b><span lang=""EN-GB"" style='font-family:""Times New ...",False,7b2ff4b3-f27a-47ca-96a9-7ae24cfdefab,True,L1,False,True,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,A. MANUFACTURER(S) OF\r THE BIOLOGICAL ACTIVE SUBSTANCE(S) AND MANUFACTURER(S) RESPONSIBLE FOR BATCH\r RELEASE,False,True,106
3,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""></p>",False,d320b69e-a3c5-4764-aa31-251a32af8a17,False,,False,False,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,,False,False,0
4,True,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""><b><span lang=""EN-GB"" style='font-family:""Times New ...",False,e67b4687-f1a0-4608-988a-bcbb07029721,True,L1,False,True,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,B. CONDITIONS OR\r RESTRICTIONS REGARDING SUPPLY AND USE,False,True,54
5,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""></p>",False,a9013513-966a-4638-818e-90def54e0698,False,,False,False,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,,False,False,0
6,True,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""><b><span lang=""EN-GB"" style='font-family:""Times New ...",False,f9007321-3055-4120-b8ed-79e72cb470ae,True,L1,False,True,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,C. OTHER CONDITIONS AND\r REQUIREMENTS OF THE MARKETING AUTHORISATION,False,True,67
7,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""></p>",False,1b70d864-ef01-4bad-91f3-54dcbbe56cef,False,,False,False,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,,False,False,0
8,True,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""><b><span lang=""EN-GB"" style='font-family:""Times New ...",False,71e9cc22-be86-4f43-b932-f89b799916ba,True,L1,False,True,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,D. CONDITIONS OR\r RESTRICTIONS WITH REGARD TO THE SAFE AND EFFECTIVE USE OF THE MEDICINAL PRODUCT,False,True,96
9,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""></p>",False,d7fd1fa1-ea7c-4b20-8b1b-11cbcafde70c,False,,False,False,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,,False,False,0


In [69]:
convertCollectionToDataFrame(b)

NameError: name 'b' is not defined