In [1]:
import os
import zipfile
%load_ext autoreload

%autoreload 2

In [8]:
import tracemalloc
import psutil
import pprint
import pandas as pd
import uuid
import json
import os
import glob
import re
import sys
from bs4 import NavigableString, BeautifulSoup
from collections import defaultdict
import random
import string
import time

from utils.config import config
from utils.logger.logger import loggerCreator

# ePI Modules
from parse.rulebook.rulebook import StyleRulesDictionary

from parse.extractor.parser import parserExtractor
from match.matchDocument.matchDocument import MatchDocument
from documentAnnotation.documentAnnotation import DocumentAnnotation
from htmlDocTypePartitioner.partition import DocTypePartitioner
from extractContentBetweenHeadings.dataBetweenHeadingsExtractor import DataBetweenHeadingsExtractor
from fhirXmlGenerator.fhirXmlGenerator import FhirXmlGenerator
from fhirService.fhirService import FhirService
from utils.logger.matchLogger import MatchLogger
from languageInfo.documentTypeNames.documentTypeNames import DocumentTypeNames
from listBundle.addAndUpdateListBundle.addAndUpdateListBundle import ListBundleHandler

class FolderNotFoundError(Exception):
    pass

class Metrics:
    
    def __init__(self, logFileName, logger):
        self.logFileName = logFileName
        self.start()
        self.writer = open(self.logFileName, 'a')
        self.writer.write("StepName,Time,Current Memory,Peak Memory,Used Ram Percentage\n")
        self.finalPeak = 0
        self.finalTotalTime = 0
        self.finalUsedRamPerc = 0
        self.logger = logger
    
    def start(self):
        self.startTime = time.time()
        tracemalloc.start()
    
    def getMetric(self, msg):
        
        self.endTime = time.time()
        
        self.totalTime = self.endTime - self.startTime
        
        
        current, peak = tracemalloc.get_traced_memory()
        current = current / 10**6
        peak = peak / 10**6
        
        usedRamPerc = psutil.virtual_memory()[2]
        
        self.finalPeak = max(self.finalPeak, peak)
        self.finalUsedRamPerc = max(self.finalUsedRamPerc, usedRamPerc)

        self.finalTotalTime = self.finalTotalTime + self.totalTime
        #self.finalTotalTime = round(self.finalTotalTime/60,3)
        
        outputString = f"{msg},{round(self.totalTime/60,4)} Min,{current} MB,{peak} MB,{usedRamPerc}%\n"
        
        self.logger.logFlowCheckpoint(f"{outputString}")
        
        print(f"Metrics : {outputString}")
        self.writer.write(outputString)
        tracemalloc.stop()
        tracemalloc.start()
        self.startTime = time.time()
    def end(self):
        
        current, peak = tracemalloc.get_traced_memory()
        current = current / 10**6
        outputString = f"Final Metrics,{round(self.finalTotalTime/60,4)} Min,{current} MB,{self.finalPeak} MB,{self.finalUsedRamPerc}%\n"
        print(f"Metrics : {outputString}")
        self.logger.logFlowCheckpoint(f"{outputString}")
        self.writer.write(outputString)
        self.writer.close()
        tracemalloc.stop()
        
        


def convertToInt(x):
    try:
        return str(int(x))
    except:
        return x


def convertCollectionToDataFrame(collection):

    dfExtractedHier = pd.DataFrame(collection)
    dfExtractedHier['parent_id'] = dfExtractedHier['parent_id'].apply(
        lambda x: convertToInt(x))
    dfExtractedHier['id'] = dfExtractedHier['id'].apply(
        lambda x: convertToInt(x))

    return dfExtractedHier

def getRandomString(N):
    str_ = ''.join(random.choice(string.ascii_uppercase + string.digits
                                 + string.ascii_lowercase) for _ in range(N))
    return str_


def convertHtmlToJson(controlBasePath, basePath, domain, procedureType, languageCode, htmlDocName, fileNameQrd, fileNameLog):

    module_path = os.path.join(basePath)

    if "/" in basePath:
        pathSep = "/"
    else:
        pathSep = "\\"
    
    # Generate output folder path
    output_json_path = os.path.join(basePath, 'outputJSON')

    """
        Check if input folder exists, else throw exception
    """
    if(os.path.exists(module_path)):
        filenames = glob.glob(os.path.join(module_path, htmlDocName))

        # Create language specific folder in outputJSON folder if it doesn't exist
        if(not os.path.exists(output_json_path)):
            os.mkdir(output_json_path)
        logger = MatchLogger(f'Parser_{getRandomString(1)}', htmlDocName,
                             domain, procedureType, languageCode, "HTML", fileNameLog)

        styleLogger = MatchLogger(
            f'Style Dictionary_{getRandomString(1)}', htmlDocName, domain, procedureType, languageCode, "HTML", fileNameLog)

        styleRulesObj = StyleRulesDictionary(logger=styleLogger,
                                             controlBasePath=controlBasePath,
                                             language=languageCode,
                                             fileName=fileNameQrd,
                                             domain=domain,
                                             procedureType=procedureType
                                             )

        parserObj = parserExtractor(config, logger, styleRulesObj.styleRuleDict,
                                    styleRulesObj.styleFeatureKeyList,
                                    styleRulesObj.qrd_section_headings)

        for input_filename in filenames:
          # if(input_filename.find('Kalydeco II-86-PI-clean')!=-1):
            output_filename = os.path.join(output_json_path, htmlDocName)
            style_filepath =  output_filename.replace('.html','.txt')
            style_filepath =  style_filepath.replace('.txtl','.txt')
            style_filepath =  style_filepath.replace('.htm','.txt')
            print("-------------",style_filepath,"-----------------")

            output_filename = output_filename.replace('.html', '.json')
            output_filename = output_filename.replace('.htm', '.json')
            print(input_filename, output_filename)
            parserObj.createPIJsonFromHTML(input_filepath=input_filename,
                                           output_filepath=output_filename,
                                           style_filepath = style_filepath,
                                           img_base64_dict=parserObj.convertImgToBase64(input_filename)
                                           )
            
        return output_filename.split(pathSep)[-1], style_filepath
    else:
        try:    
            raise FolderNotFoundError(module_path + " not found")
        except:  
            logger.logFlowCheckpoint("Folder For Language Code Not Found In Input File")
            logger.logException("Folder For Language Code Not Found In Input File")
        raise FolderNotFoundError(module_path + " not found")
        return None


def splitJson(controlBasePath, basePath, domain, procedureType, languageCode, fileNameJson, fileNameQrd, fileNameLog):

    styleLogger = MatchLogger(
        f'Style Dictionary_{getRandomString(1)}', fileNameJson, domain, procedureType, languageCode, "Json", fileNameLog)

    styleRulesObj = StyleRulesDictionary(logger=styleLogger,
                                        controlBasePath=controlBasePath,
                                        language=languageCode,
                                        fileName=fileNameQrd,
                                        domain=domain,
                                        procedureType=procedureType
                                        )
    
    path_json = os.path.join(basePath,'outputJSON', fileNameJson)
    print("PathJson",path_json)
    partitionLogger = MatchLogger(
        f'Partition_{getRandomString(1)}', fileNameJson, domain, procedureType, languageCode, "Json", fileNameLog)

    partitioner = DocTypePartitioner(partitionLogger)

    partitionedJsonPaths = partitioner.partitionHtmls(
        styleRulesObj.qrd_section_headings, path_json)

    return partitionedJsonPaths


def extractAndValidateHeadings(controlBasePath,
                                basePath,
                                domain,
                                procedureType,
                                languageCode,
                                documentNumber,
                                fileNameDoc,
                                fileNameQrd,
                                fileNameMatchRuleBook,
                                fileNameDocumentTypeNames,
                                fileNameLog,
                                stopWordFilterLen=6,
                                isPackageLeaflet=False,
                                medName=None
                                ):

    if documentNumber == 0:
        topHeadingsConsidered = 4
        bottomHeadingsConsidered = 6
    elif documentNumber == 1:
        topHeadingsConsidered = 3
        bottomHeadingsConsidered = 5
    elif documentNumber == 2:
        topHeadingsConsidered = 5
        bottomHeadingsConsidered = 15
    else:
        topHeadingsConsidered = 5
        bottomHeadingsConsidered = 10

    print(f"Starting Heading Extraction For File :- {fileNameDoc}")
    logger = MatchLogger(f"Heading Extraction {fileNameDoc}_{getRandomString(1)}", fileNameDoc, domain, procedureType, languageCode, documentNumber, fileNameLog)
    logger.logFlowCheckpoint("Starting Heading Extraction")

    stopWordlanguage = DocumentTypeNames(
        controlBasePath=controlBasePath,
        fileNameDocumentTypeNames=fileNameDocumentTypeNames,
        languageCode=languageCode,
        domain=domain,
        procedureType=procedureType,
        documentNumber=documentNumber
        ).extractStopWordLanguage()

    matchDocObj = MatchDocument(
        logger,
        controlBasePath,
        basePath,
        domain,
        procedureType,
        languageCode,
        documentNumber,
        fileNameDoc,
        fileNameQrd,
        fileNameMatchRuleBook,
        fileNameDocumentTypeNames,
        topHeadingsConsidered,
        bottomHeadingsConsidered,
        stopWordFilterLen,
        stopWordlanguage,
        isPackageLeaflet,
        medName)
    df, coll, documentType = matchDocObj.matchHtmlHeaddingsWithQrd()

    return df, coll, documentType


def parseDocument(controlBasePath,
                  basePath,
                  htmlDocName,
                  fileNameQrd,
                  fileNameMatchRuleBook,
                  fileNameDocumentTypeNames,
                  jsonTempFileName,
                  listBundleDocumentTypeCodesFileName,
                  apiMmgtBaseUrl,
                  getListApiEndPointUrlSuffix,
                  addUpdateListApiEndPointUrlSuffix,
                  apiMmgtSubsKey,
                  addBundleApiEndPointUrlSuffix,
                  medName = None):
    
    listRegulatedAuthCodesAccrossePI = []
    
    if "/" in basePath:
        pathSep = "/"        
    else:
        pathSep = "\\"
    
    fileNameLog = os.path.join(basePath,'FinalLog.txt')

    pathComponents = basePath.split(pathSep)
    print(pathComponents, htmlDocName)
    timestamp = pathComponents[-1]
    languageCode =  pathComponents[-2]
    medName = pathComponents[-3]
    procedureType = pathComponents[-4]
    domain = pathComponents[-5]

    print(timestamp, languageCode, medName, procedureType, domain)
        
    flowLogger =  MatchLogger(f"Flow Logger HTML_{getRandomString(1)}", htmlDocName, domain, procedureType, languageCode, "HTML", fileNameLog)
    
    metrics = Metrics(os.path.join(basePath,'Metrics.csv'),flowLogger)
    
    
    flowLogger.logFlowCheckpoint("Starting HTML Conversion To Json")
    ###Convert Html to Json
    fileNameJson, stylesFilePath = convertHtmlToJson(controlBasePath, basePath, domain, procedureType, languageCode, htmlDocName, fileNameQrd, fileNameLog)
    
    print("stylePath:-",stylesFilePath)
    flowLogger.logFlowCheckpoint("Completed HTML Conversion To Json")
    metrics.getMetric("HTML Conversion To Json")

    flowLogger.logFlowCheckpoint("Starting Json Split")

    ###Split Uber Json to multiple Jsons for each category.
    partitionedJsonPaths = splitJson(controlBasePath, basePath, domain, procedureType, languageCode, fileNameJson, fileNameQrd, fileNameLog)
    
    partitionedJsonPaths = [ path.split(pathSep)[-1] for path in partitionedJsonPaths]
    flowLogger.logFlowCheckpoint(str(partitionedJsonPaths))
    
    flowLogger.logFlowCheckpoint("Completed Json Split")
    metrics.getMetric("Split Json")
    
    flowLogger.logFlowCheckpoint("Started Processing Partitioned Jsons")
    
    for index, fileNamePartitioned in enumerate(partitionedJsonPaths):
        #print("Index", index)
        if index in [0]:
            continue
        flowLogger.logFlowCheckpoint(f"\n\n\n\n||||||||||||||||||||||||||||||||{str(index)} ||||| {str(fileNamePartitioned)}||||||||||||||||||||||||||||||||\n\n\n\n")
        
        if index == 3:
            stopWordFilterLen = 100
            isPackageLeaflet = True
        else:
            stopWordFilterLen = 6
            isPackageLeaflet = False
            
        df, coll, documentType = extractAndValidateHeadings(controlBasePath,
                                    basePath,
                                    domain,
                                    procedureType,
                                    languageCode,
                                    index,
                                    fileNamePartitioned,
                                    fileNameQrd,
                                    fileNameMatchRuleBook,
                                    fileNameDocumentTypeNames,
                                    fileNameLog,
                                    stopWordFilterLen=stopWordFilterLen,
                                    isPackageLeaflet=isPackageLeaflet,
                                    medName=medName)
    
        
        print(f"Completed Heading Extraction For File")
        flowLogger.logFlowCheckpoint("Completed Heading Extraction For File")
        metrics.getMetric(f"{index}: Heading Extraction")

        print(f"Starting Document Annotation For File :- {fileNamePartitioned}")        
        flowLogger.logFlowCheckpoint("Starting Document Annotation For File")
        documentAnnotationObj = DocumentAnnotation(fileNamePartitioned,'c20835db4b1b4e108828a8537ff41506','https://spor-sit.azure-api.net/pms/api/v2/',df,coll, index)
        try:
            pms_oms_annotation_data = documentAnnotationObj.processRegulatedAuthorizationForDoc()
            print(pms_oms_annotation_data)
        except Exception as e:
            pms_oms_annotation_data = None
            print("Error Found", str(e))
            
        print(f"Completed Document Annotation")        
        flowLogger.logFlowCheckpoint("Completed Document Annotation")
        metrics.getMetric(f"{index}: Document Annotation")
        
        print(f"Starting Extracting Content Between Heading For File :- {fileNamePartitioned}")        
        flowLogger.logFlowCheckpoint("Starting Extracting Content Between Heading")
        
        extractContentlogger =  MatchLogger(f'ExtractContentBetween_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)
        extractorObj = DataBetweenHeadingsExtractor(extractContentlogger, basePath, coll)
        dfExtractedHierRR = extractorObj.extractContentBetweenHeadings(fileNamePartitioned)
        
        print(f"Completed Extracting Content Between Heading")        
        flowLogger.logFlowCheckpoint("Completed Extracting Content Between Heading")
        metrics.getMetric(f"{index}: Content Extraction")
        
        
        xmlLogger =  MatchLogger(f'XmlGeneration_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)
        fhirXmlGeneratorObj = FhirXmlGenerator(xmlLogger, controlBasePath, basePath, pms_oms_annotation_data, stylesFilePath, medName)
        fileNameXml = fileNamePartitioned.replace('.json','.xml')
        generatedXml = fhirXmlGeneratorObj.generateXml(dfExtractedHierRR, fileNameXml)
        
        metrics.getMetric(f"{index}: Generate XML")
        
        fhirServiceLogger =  MatchLogger(f'XML Submission Logger_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)
        print(generatedXml)
        fhirServiceObj = FhirService(fhirServiceLogger, apiMmgtBaseUrl, addBundleApiEndPointUrlSuffix, apiMmgtSubsKey, basePath, generatedXml)
        fhirServiceObj.submitFhirXml()
        
        
        
        
        metrics.getMetric(f"{index}: Submit FHIR Msg")
        
        print(f"Created XML File For :- {fileNamePartitioned}")
        
        flowLogger.logFlowCheckpoint("Starting list bundle update/addition")
        if documentAnnotationObj.listRegulatedAuthorizationIdentifiers != None:
            for id in documentAnnotationObj.listRegulatedAuthorizationIdentifiers:
                listRegulatedAuthCodesAccrossePI.append(id)
        listBundleLogger =  MatchLogger(f'List Bundle Creation Logger_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)
        print("\nlistRegulatedAuthCodesAccrossePI",listRegulatedAuthCodesAccrossePI)
        try:
            listBundleHandler = ListBundleHandler(listBundleLogger,
                     domain,
                     procedureType,
                     index,
                     documentType,
                     languageCode,
                     medName,
                     controlBasePath,
                     jsonTempFileName,
                     listBundleDocumentTypeCodesFileName,
                     fileNameDocumentTypeNames,
                     listRegulatedAuthCodesAccrossePI,
                     apiMmgtBaseUrl,
                     getListApiEndPointUrlSuffix,
                     addUpdateListApiEndPointUrlSuffix,
                     apiMmgtSubsKey)

            listBundleXml = listBundleHandler.addOrUpdateDocumentItem(str(fhirServiceObj.SubmittedFhirMsgRefId))
            listBundleHandler.submitListXmLToServer(listBundleXml)

            flowLogger.logFlowCheckpoint("Completed list bundle update/addition")
            metrics.getMetric(f"{index}: Update/Add List Bundle")
            #return df,coll,dfExtractedHierRR
        except Exception as e:
            print(str(e))
            if 'No MAN Code found' in str(e):
                flowLogger.logFlowCheckpoint("Skipping list bundle addtion/update as no MAN found")
            
    
    flowLogger.logFlowCheckpoint("Completed Processing Partitioned Jsons")
    metrics.getMetric(f"{index}: Completed")
    metrics.end()

In [9]:
from wordToHtmlConvertor.wordToHtmlConvertor import WordToHtmlConvertor

wordToHtmlConvertorObj = WordToHtmlConvertor()
wordToHtmlConvertorObj.convertWordToHTML()

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'F:\\Projects\\EMA\\Repository\\EMA EPI PoC\\function_code\\data\\Ingest'

In [10]:
# inputZipFolderPath = "F:\Projects\EMA\Repository\EMA EPI PoC\\function_code\\inputblob"
inputZipFolderPath = os.path.abspath(os.path.join('..'))
inputZipFolderPath = os.path.join(inputZipFolderPath, 'inputblob')
inputZipFileName = "Kalydeco~H~CAP~en~2021-05-19T09-22-20Z.zip"

In [11]:
fileNameQrd = 'qrd_canonical_model.csv'
fileNameMatchRuleBook = 'ruleDict.json'
fileNameDocumentTypeNames = 'documentTypeNames.json'
fsMountName = '/mounted'
jsonTempFileName = 'listBundleJsonTemplate.json'
listBundleDocumentTypeCodesFileName = 'listBundleDocumentTypeCodes.json'
apiMmgtBaseUrl = "https://ema-dap-epi-dev-fhir-apim.azure-api.net"
getListApiEndPointUrlSuffix = "/epi/v1/List"
addUpdateListApiEndPointUrlSuffix = "/epi-w/v1/List"
addBundleApiEndPointUrlSuffix = "/epi-w/v1/Bundle"
apiMmgtSubsKey = "ba6d7e9a73ed4facaa58fc983bf6db50"



info = inputZipFileName.split("~")

try:
    medName = info[0]
    domain = info[1]
    procedureType = info[2]
    languageCode = info[3]
    timestamp = info[4]
    timestamp = timestamp.replace(".zip","")

except Exception:
    raise f"Missing required info in the zip file name {inputZipFileName}"

if "\\" in os.getcwd():
    localEnv = True
    inputZipFolderPath = os.path.join(os.path.abspath(os.path.join('..')),inputZipFolderPath)
    outputFolderPath = os.path.join(os.path.abspath(os.path.join('..')), 'work', f"{domain}", f"{procedureType}", f"{medName}", f"{languageCode}", f"{timestamp}")
    controlFolderPath = os.path.join(os.path.abspath(os.path.join('..')),'control')
else:
    localEnv = False
    inputZipFolderPath = os.path.join(f'{fsMountName}',inputZipFolderPath)
    outputFolderPath = os.path.join(f'{fsMountName}', 'work', f"{domain}", f"{procedureType}", f"{medName}", f"{languageCode}", f"{timestamp}")
    controlFolderPath = os.path.join(f'{fsMountName}','control')


print(inputZipFileName, inputZipFolderPath, outputFolderPath, controlFolderPath)

mode = 0o666

if localEnv is True:
    inputZipFolderPath = inputZipFolderPath.replace("/","\\")
    outputFolderPath = outputFolderPath.replace("/","\\")
    controlFolderPath = controlFolderPath.replace("/","\\")

try:
    os.makedirs(inputZipFolderPath, mode)
    os.makedirs(outputFolderPath, mode)
    os.makedirs(controlFolderPath, mode)

except Exception:
    print("Already Present")
    
with zipfile.ZipFile(f'{inputZipFolderPath}/{inputZipFileName}',"r") as zip_ref:
        zip_ref.extractall(outputFolderPath)
    

_,_,fileNames = next(os.walk(outputFolderPath))
htmlFileName = [fileName for fileName in fileNames if ".htm" in fileName][0]

print(htmlFileName)



Kalydeco~H~CAP~en~2021-05-19T09-22-20Z.zip F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z F:\Projects\EMA\Repository\EMA EPI PoC\function_code\control
Already Present
Kalydeco_clean.htm


In [12]:
parseDocument(controlFolderPath,
              outputFolderPath,
              htmlFileName,
              fileNameQrd,
              fileNameMatchRuleBook,
              fileNameDocumentTypeNames,
              jsonTempFileName,
              listBundleDocumentTypeCodesFileName,
              apiMmgtBaseUrl,
              getListApiEndPointUrlSuffix,
              addUpdateListApiEndPointUrlSuffix,
              addBundleApiEndPointUrlSuffix,
              apiMmgtSubsKey,
              medName)

2021-06-07 18:35:22,345 : Flow Logger HTML_8 : Starting HTML Conversion To Json | H | CAP |  en | HTML | Kalydeco_clean.htm
2021-06-07 18:35:22,359 : Style Dictionary_w : Reading style dictionary in file: rule_dictionary_en.json | H | CAP |  en | HTML | Kalydeco_clean.htm
2021-06-07 18:35:22,436 : Style Dictionary_w : Qrd Section Keys Retrieved For Style Dictionary: ANNEX I, ANNEX II, ANNEX III, B. PACKAGE LEAFLET | H | CAP |  en | HTML | Kalydeco_clean.htm


['F:', 'Projects', 'EMA', 'Repository', 'EMA EPI PoC', 'function_code', 'work', 'H', 'CAP', 'Kalydeco', 'en', '2021-05-19T09-22-20Z'] Kalydeco_clean.htm
2021-05-19T09-22-20Z en Kalydeco CAP H
------------- F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\outputJSON\Kalydeco_clean.txt -----------------
F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\Kalydeco_clean.htm F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\outputJSON\Kalydeco_clean.json


2021-06-07 18:35:26,343 : Parser_K : Style Information Stored In File: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\outputJSON\Kalydeco_clean.txt | H | CAP |  en | HTML | Kalydeco_clean.htm
2021-06-07 18:35:42,113 : Parser_K : Writing to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\outputJSON\Kalydeco_clean.json | H | CAP |  en | HTML | Kalydeco_clean.htm
2021-06-07 18:35:43,275 : Flow Logger HTML_8 : Completed HTML Conversion To Json | H | CAP |  en | HTML | Kalydeco_clean.htm
2021-06-07 18:35:43,283 : Flow Logger HTML_8 : HTML Conversion To Json,0.349 Min,19.591699 MB,27.898397 MB,49.9%
 | H | CAP |  en | HTML | Kalydeco_clean.htm
2021-06-07 18:35:43,315 : Flow Logger HTML_8 : Starting Json Split | H | CAP |  en | HTML | Kalydeco_clean.htm
2021-06-07 18:35:43,321 : Style Dictionary_U : Reading style dictionary in file: rule_dictionary_en.json | H | CAP |  en | Json | Kalydeco

stylePath:- F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\outputJSON\Kalydeco_clean.txt
Metrics : HTML Conversion To Json,0.349 Min,19.591699 MB,27.898397 MB,49.9%

PathJson F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\outputJSON\Kalydeco_clean.json


2021-06-07 18:35:44,083 : Partition_Q : Writing partition to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\partitionedJSONs\Kalydeco_clean_SmPC.json | H | CAP |  en | Json | Kalydeco_clean.json
2021-06-07 18:35:44,083 : Partition_Q : Writing partition to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\partitionedJSONs\Kalydeco_clean_SmPC.json | H | CAP |  en | Json | Kalydeco_clean.json
2021-06-07 18:35:44,369 : Partition_Q : Writing partition to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\partitionedJSONs\Kalydeco_clean_ANNEX II.json | H | CAP |  en | Json | Kalydeco_clean.json
2021-06-07 18:35:44,369 : Partition_Q : Writing partition to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\partitionedJSONs\Kalydeco_clean_ANNEX II.json | H | CAP |  en | Json | Kalydeco

Metrics : Split Json,0.0242 Min,0.137955 MB,18.250679 MB,49.8%

Starting Heading Extraction For File :- Kalydeco_clean_ANNEX II.json
File being processed: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\partitionedJSONs\Kalydeco_clean_ANNEX II.json
--------------------------------------------
AnnexII


2021-06-07 18:35:45,212 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Started Extracting Heading | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json
2021-06-07 18:35:45,234 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Match Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | Doc txt :- 'annex ii' | Qrd txt :- 'annex ii' | Matched :- 'True'
2021-06-07 18:35:45,234 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Match Passed In Lowercase  :  | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | Doc txt :- 'Annex II' | Qrd txt :- 'ANNEX II' | Matched :- 'True'
2021-06-07 18:35:45,243 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Passed As This The First Heading | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21001' | prevHeadingCurrId :- '' | prevHeadingFoundId :- ''
2021-06-07 18:35:45,274 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Match Passed : Contains<>|118.75|(61, 91, 86)|0.807| | H | CAP |  en | 1 | Kaly


OriginalCheck



2021-06-07 18:35:45,430 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21005' | prevHeadingCurrId :- '21002' | prevHeadingFoundId :- '21002'
2021-06-07 18:35:45,523 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Match Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | Doc txt :- 'C. OTHER CONDITIONS AND REQUIREMENTS OF THE MARKETING AUTHORISATION' | Qrd txt :- 'C. OTHER CONDITIONS AND REQUIREMENTS OF THE MARKETING AUTHORISATION' | Matched :- 'True'
2021-06-07 18:35:45,543 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Flow Is Broken | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21007' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '21005'
2021-06-07 18:35:45,556 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21007' | prevHeadingCurrId :- '21005'


OriginalCheck



2021-06-07 18:35:46,282 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : End Of Sub Section | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json
 RELEASE' | Qrd txt :- 'A. <MANUFACTURER(S) OF THE BIOLOGICAL ACTIVE SUBSTANCE(S) AND> MANUFACTURER(S) RESPONSIBLE FOR BATCH RELEASE' | Matched :- 'True'Kalydeco_clean_ANNEX II.json | Doc txt :- 'A.      MANUFACTURER(S) RESPONSIBLE FOR BATCH
2021-06-07 18:35:46,344 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Passed As This The First Heading | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21002' | prevHeadingCurrId :- '' | prevHeadingFoundId :- ''


oooooooooooooooooooooooooooooooooooooooo END OF Sub Section oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo

OriginalCheck



 address of the manufacturer(s) responsible for batch release' | Qrd txt :- 'A. <MANUFACTURER(S) OF THE BIOLOGICAL ACTIVE SUBSTANCE(S) AND> MANUFACTURER(S) RESPONSIBLE FOR BATCH RELEASE' | Matched :- 'False''Name and
2021-06-07 18:35:46,599 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Match Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | Doc txt :- 'Name and address of the manufacturer(s) responsible for batch release' | Qrd txt :- 'Name and address of the manufacturer(s) responsible for batch release' | Matched :- 'True'
2021-06-07 18:35:46,624 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Flow Is Broken | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21004' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '21002'
2021-06-07 18:35:46,634 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21004' | prevHeadingCurrId :- '21002' | prevHeadin


OriginalCheck



2021-06-07 18:35:49,172 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Match Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | Doc txt :- 'B. CONDITIONS OR RESTRICTIONS REGARDING SUPPLY AND USE' | Qrd txt :- 'B. CONDITIONS OR RESTRICTIONS REGARDING SUPPLY AND USE' | Matched :- 'True'
2021-06-07 18:35:49,190 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21005' | prevHeadingCurrId :- '21004' | prevHeadingFoundId :- '21004'
 prescription (see Annex I: Summary of Product Characteristics, section 4.2).' | Qrd txt :- 'D. CONDITIONS OR RESTRICTIONS WITH REGARD TO THE SAFE AND EFFECTIVE USE OF THE MEDICINAL PRODUCT' | Matched :- 'False'oduct subject to restricted medical



OriginalCheck



2021-06-07 18:35:50,046 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Match Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | Doc txt :- 'c. conditions requirements marketing authorisation' | Qrd txt :- 'c. conditions requirements marketing authorisation' | Matched :- 'True'
 the Marketing Authorisation' | Qrd txt :- 'C. OTHER CONDITIONS AND REQUIREMENTS OF THE MARKETING AUTHORISATION' | Matched :- 'True'lydeco_clean_ANNEX II.json | Doc txt :- 'C.      Other conditions and requirements of
2021-06-07 18:35:50,071 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Flow Is Broken | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21007' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '21005'
2021-06-07 18:35:50,080 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21007' | prevHeadingCurrId :- '21005' | prevHeadingFoundId :- '21005'



OriginalCheck



2021-06-07 18:35:50,237 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Match Passed : <=7|2.56|(99, 100, 100)|0.995| | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | Doc txt :- '·Periodic safety update reports (PSURs)' | Qrd txt :- 'Periodic safety update reports (PSURs)' | Matched :- 'True'
2021-06-07 18:35:50,254 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21008' | prevHeadingCurrId :- '21007' | prevHeadingFoundId :- '21007'
2021-06-07 18:35:50,462 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Match Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | Doc txt :- 'd. conditions restrictions regard safe effective use medicinal product' | Qrd txt :- 'd. conditions restrictions regard safe effective use medicinal product' | Matched :- 'True'
 to the safe and effective use of the medicinal product' | Qrd txt :- 'D. CONDITIONS OR RESTRICTIONS WITH REGARD TO THE SAFE AND


OriginalCheck



 management plan (RMP)' | Qrd txt :- 'Risk management plan (RMP)' | Matched :- 'True'assed : <=4|3.7|(98, 100, 100)|0.993| | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | Doc txt :- '·Risk
2021-06-07 18:35:50,721 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Passed | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21010' | prevHeadingCurrId :- '21009' | prevHeadingFoundId :- '21009'
 to conduct post-authorisation measures' | Qrd txt :- 'Obligation to conduct post-authorisation measures' | Matched :- 'True'H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | Doc txt :- '·Obligation
2021-06-07 18:35:52,211 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Flow Is Broken | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json | currHeadId :- '21012' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '21010'
2021-06-07 18:35:52,222 : Heading Extraction Kalydeco_clean_ANNEX II.json_U : Validation Passed | H | CAP |  en | 1 | Kalydeco_clean_AN


OriginalCheck



Heading Not Found 
 ['Name and address of the manufacturer(s) of the biological active substance(s)', 'Official batch release', 'Additional risk minimisation measures', 'SPECIFIC OBLIGATION TO COMPLETE POST-AUTHORISATION MEASURES FOR\r\n<THE CONDITIONAL MARKETING AUTHORISATION> <THE MARKETING AUTHORISATION UNDER EXCEPTIONAL CIRCUMSTANCES>']


dict_keys([])
Completed Heading Extraction For File
Metrics : 1: Heading Extraction,0.134 Min,0.236124 MB,2.97192 MB,50.2%

Starting Document Annotation For File :- Kalydeco_clean_ANNEX II.json
Error Found No Authorization Code Found In The Document Kalydeco_clean_ANNEX II.json
Completed Document Annotation
Metrics : 1: Document Annotation,0.0003 Min,0.007459 MB,0.164858 MB,50.2%

Starting Extracting Content Between Heading For File :- Kalydeco_clean_ANNEX II.json


2021-06-07 18:35:52,906 : Flow Logger HTML_8 : Completed Extracting Content Between Heading | H | CAP |  en | HTML | Kalydeco_clean.htm
2021-06-07 18:35:52,906 : Flow Logger HTML_8 : 1: Content Extraction,0.0011 Min,0.076706 MB,0.410876 MB,50.2%
 | H | CAP |  en | HTML | Kalydeco_clean.htm
2021-06-07 18:35:53,008 : XmlGeneration_1_3 : PMS/OMS Annotation Information Not Retrieved | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json
2021-06-07 18:35:53,008 : XmlGeneration_1_3 : Initiating XML Generation | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json


File being processed: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Kalydeco\en\2021-05-19T09-22-20Z\partitionedJSONs\Kalydeco_clean_ANNEX II.json
--------------------------------------------
Completed Extracting Content Between Heading
Metrics : 1: Content Extraction,0.0011 Min,0.076706 MB,0.410876 MB,50.2%

Already Exists


2021-06-07 18:35:53,169 : XmlGeneration_1_3 : Writing to File:Kalydeco_clean_ANNEX II.xml | H | CAP |  en | 1 | Kalydeco_clean_ANNEX II.json
2021-06-07 18:35:53,173 : Flow Logger HTML_8 : 1: Generate XML,0.0042 Min,0.542602 MB,1.104751 MB,49.9%
 | H | CAP |  en | HTML | Kalydeco_clean.htm


Metrics : 1: Generate XML,0.0042 Min,0.542602 MB,1.104751 MB,49.9%

b'<?xml version="1.0" encoding="UTF-8"?>\n<!-- This is a template for a FHIR resource, and needs items (marked with "${}") replacing to make a real instance -->\n<!-- The resulting instance is a Bundle of Bundles, each of which is a document (having a Composition, and supporting resources) -->\n<!-- 2020-02-22 -->\n<!-- This is for FHIR version R5 Preview 2 (May 2020) -->\n<Bundle xmlns="http://hl7.org/fhir">\n\t<type value="collection"/>\n\t<!-- Repeat at this level per document -->\n\t<entry>\n\t\t<fullUrl value="urn:uuid:bf6786dc-61e6-44f3-861a-9d9ae2ce2d40"/>\n\t\t<!-- Top level of each document is a also FHIR Bundle, of type "document"\n\t\t see http://hl7.org/fhir/documents.html, http://hl7.org/fhir/bundle.html\n\t \t All the other resources for this document are within this. -->\n\t\t<resource>\n            <Bundle>\n            \t<!-- When PUTing, some servers mandate an id here to match the existing id -->\n  

ConnectionError: HTTPSConnectionPool(host='ema-dap-epi-dev-fhir-apim.azure-api.netba6d7e9a73ed4facaa58fc983bf6db50', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016635120C40>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [63]:
convertCollectionToDataFrame(b)

Unnamed: 0,index,id,Procedure type,Display code,Name,parent_id,htmlText,htmlIndex,htmlId,SubSectionIndex,doc_parent_id,heading_id
0,272,8001,CAP,,PRODUKTRESUMÉ,,PRODUKTRESUME,26,fd4ca558-9b66-42af-9ff7-f10552349731,0,,1
1,274,8003,CAP,1.0,LÆGEMIDLETS NAVN,8001.0,1. LÆGEMIDLETS NAVN,30,511e62b5-ad08-4d7a-9a0a-fa6d20151fe1,0,8001.0,3
2,275,8004,CAP,2.0,KVALITATIV OG KVANTITATIV SAMMENSÆTNING,8001.0,2. KVALITATIV OG KVANTITATIV\r SAMMENSÆTNING,35,8d1fd64c-8666-4593-bab5-17bd3db009ba,0,8001.0,4
3,279,8008,CAP,3.0,LÆGEMIDDELFORM,8001.0,3. LÆGEMIDDELFORM,45,e3c7f7e0-4daf-488d-aa20-91b284048b8a,0,8001.0,8
4,280,8009,CAP,4.0,KLINISKE OPLYSNINGER,8001.0,4. KLINISKE OPLYSNINGER,51,72c55e94-05d5-483a-882a-3379acbb27e8,0,8001.0,9
5,281,8010,CAP,4.1,Terapeutiske indikationer,8009.0,4.1 Terapeutiske indikationer,53,b9557958-672a-4aa0-b525-0deaf84a1c06,0,8009.0,10
6,287,8016,CAP,4.3,Kontraindikationer,8009.0,4.3 Kontraindikationer,93,87e2af9a-327c-4da7-b2ee-b4800c14a3da,0,8009.0,16
7,288,8017,CAP,4.4,Særlige advarsler og forsigtighedsregler vedrørende brugen,8009.0,4.4 Særlige advarsler og\r forsigtighedsregler vedrørende brugen,100,ac68dc25-6f81-43ca-a6fb-11f3d4f24fdd,0,8009.0,17
8,291,8020,CAP,4.5,Interaktion med andre lægemidler og andre former for interaktion,8009.0,4.5 Interaktion med andre lægemidler og\r andre former for interaktion,135,1460d2e3-2243-4459-b732-2890ab8e85e3,0,8009.0,20
9,293,8022,CAP,4.6,"Fertilitet, graviditet og amning",8009.0,"4.6 Fertilitet, graviditet og amning",153,b1321040-f717-4adb-9add-6a09d94d61f9,0,8009.0,22
