In [1]:
import os
import zipfile
%load_ext autoreload

%autoreload 2

In [2]:
import tracemalloc
import psutil
import pprint
import pandas as pd
import uuid
import json
import os
import glob
import re
import sys
from bs4 import NavigableString, BeautifulSoup
from collections import defaultdict
import random
import string
import time

from utils.config import config
from utils.logger.logger import loggerCreator

# ePI Modules
from parse.rulebook.rulebook import StyleRulesDictionary

from parse.extractor.parser import parserExtractor
from match.matchDocument.matchDocument import MatchDocument
from documentAnnotation.documentAnnotation import DocumentAnnotation
from htmlDocTypePartitioner.partition import DocTypePartitioner
from extractContentBetweenHeadings.dataBetweenHeadingsExtractor import DataBetweenHeadingsExtractor
from fhirXmlGenerator.fhirXmlGenerator import FhirXmlGenerator
from fhirService.fhirService import FhirService
from utils.logger.matchLogger import MatchLogger
from languageInfo.documentTypeNames.documentTypeNames import DocumentTypeNames
from wordToHtmlConvertor.wordToHtmlConvertor import WordToHtmlConvertor


class FolderNotFoundError(Exception):
    pass

class Metrics:
    
    def __init__(self, logFileName, logger):
        self.logFileName = logFileName
        self.start()
        self.writer = open(self.logFileName, 'a')
        self.writer.write("StepName,Time,Current Memory,Peak Memory,Used Ram Percentage\n")
        self.finalPeak = 0
        self.finalTotalTime = 0
        self.finalUsedRamPerc = 0
        self.logger = logger
    
    def start(self):
        self.startTime = time.time()
        tracemalloc.start()
    
    def getMetric(self, msg):
        
        self.endTime = time.time()
        
        self.totalTime = round((self.endTime - self.startTime)/60,3)
        
        
        current, peak = tracemalloc.get_traced_memory()
        current = current / 10**6
        peak = peak / 10**6
        
        usedRamPerc = psutil.virtual_memory()[2]
        
        self.finalPeak = max(self.finalPeak, peak)
        self.finalUsedRamPerc = max(self.finalUsedRamPerc, usedRamPerc)

        self.finalTotalTime = self.finalTotalTime + self.totalTime
        self.finalTotalTime = round(self.finalTotalTime/60,3)
        
        outputString = f"{msg},{self.totalTime} Min,{current} MB,{peak} MB,{usedRamPerc}\n"
        
        self.logger.logFlowCheckpoint(f"{outputString}")
        
        print(f"Metrics : {outputString}")
        self.writer.write(outputString)
        tracemalloc.stop()
        tracemalloc.start()
        self.startTime = time.time()
    def end(self):
        
        current, peak = tracemalloc.get_traced_memory()
        current = current / 10**6
        outputString = f"Final Metrics,{self.finalTotalTime} Min,{current} MB,{self.finalPeak} MB,{self.finalUsedRamPerc}\n"
        print(f"Metrics : {outputString}")
        self.logger.logFlowCheckpoint(f"{outputString}")
        self.finalTotalTime.write(outputString)
        self.writer.close()
        tracemalloc.stop()
        
        


def convertToInt(x):
    try:
        return str(int(x))
    except:
        return x


def convertCollectionToDataFrame(collection):

    dfExtractedHier = pd.DataFrame(collection)
    dfExtractedHier['parent_id'] = dfExtractedHier['parent_id'].apply(
        lambda x: convertToInt(x))
    dfExtractedHier['id'] = dfExtractedHier['id'].apply(
        lambda x: convertToInt(x))

    return dfExtractedHier

def getRandomString(N):
    str_ = ''.join(random.choice(string.ascii_uppercase + string.digits
                                 + string.ascii_lowercase) for _ in range(N))
    return str_


def convertHtmlToJson(controlBasePath, basePath, domain, procedureType, languageCode, htmlDocName, fileNameQrd, fileNameLog):

    module_path = os.path.join(basePath)

    if "/" in basePath:
        pathSep = "/"
    else:
        pathSep = "\\"
    
    # Generate output folder path
    output_json_path = os.path.join(basePath, 'outputJSON')

    """
        Check if input folder exists, else throw exception
    """
    if(os.path.exists(module_path)):
        filenames = glob.glob(os.path.join(module_path, htmlDocName))

        # Create language specific folder in outputJSON folder if it doesn't exist
        if(not os.path.exists(output_json_path)):
            os.mkdir(output_json_path)
        logger = MatchLogger(f'Parser_{getRandomString(1)}', htmlDocName,
                             domain, procedureType, languageCode, "HTML", fileNameLog)

        styleLogger = MatchLogger(
            f'Style Dictionary_{getRandomString(1)}', htmlDocName, domain, procedureType, languageCode, "HTML", fileNameLog)

        styleRulesObj = StyleRulesDictionary(logger=styleLogger,
                                             controlBasePath=controlBasePath,
                                             language=languageCode,
                                             fileName=fileNameQrd,
                                             domain=domain,
                                             procedureType=procedureType
                                             )

        parserObj = parserExtractor(config, logger, styleRulesObj.styleRuleDict,
                                    styleRulesObj.styleFeatureKeyList,
                                    styleRulesObj.qrd_section_headings)

        for input_filename in filenames:
          # if(input_filename.find('Kalydeco II-86-PI-clean')!=-1):
            output_filename = os.path.join(output_json_path, htmlDocName)
            style_filepath =  output_filename.replace('.html','.txt')
            style_filepath =  style_filepath.replace('.txtl','.txt')
            style_filepath =  style_filepath.replace('.htm','.txt')
            print("-------------",style_filepath,"-----------------")

            output_filename = output_filename.replace('.html', '.json')
            output_filename = output_filename.replace('.htm', '.json')
            print(input_filename, output_filename)
            parserObj.createPIJsonFromHTML(input_filepath=input_filename,
                                           output_filepath=output_filename,
                                           style_filepath = style_filepath,
                                           img_base64_dict=parserObj.convertImgToBase64(input_filename)
                                           )
            
        return output_filename.split(pathSep)[-1], style_filepath
    else:
        try:    
            raise FolderNotFoundError(module_path + " not found")
        except:  
            logger.logFlowCheckpoint("Folder For Language Code Not Found In Input File")
            logger.logException("Folder For Language Code Not Found In Input File")
        raise FolderNotFoundError(module_path + " not found")
        return None


def splitJson(controlBasePath, basePath, domain, procedureType, languageCode, fileNameJson, fileNameQrd, fileNameLog):

    styleLogger = MatchLogger(
        f'Style Dictionary_{getRandomString(1)}', fileNameJson, domain, procedureType, languageCode, "Json", fileNameLog)

    styleRulesObj = StyleRulesDictionary(logger=styleLogger,
                                        controlBasePath=controlBasePath,
                                        language=languageCode,
                                        fileName=fileNameQrd,
                                        domain=domain,
                                        procedureType=procedureType
                                        )
    
    path_json = os.path.join(basePath,'outputJSON', fileNameJson)
    print("PathJson",path_json)
    partitionLogger = MatchLogger(
        f'Partition_{getRandomString(1)}', fileNameJson, domain, procedureType, languageCode, "Json", fileNameLog)

    partitioner = DocTypePartitioner(partitionLogger)

    partitionedJsonPaths = partitioner.partitionHtmls(
        styleRulesObj.qrd_section_headings, path_json)

    return partitionedJsonPaths


def extractAndValidateHeadings(controlBasePath,
                                basePath,
                                domain,
                                procedureType,
                                languageCode,
                                documentNumber,
                                fileNameDoc,
                                fileNameQrd,
                                fileNameMatchRuleBook,
                                fileNameDocumentTypeNames,
                                fileNameLog,
                                stopWordFilterLen=6,
                                isPackageLeaflet=False,
                                medName=None
                                ):

    if documentNumber == 0:
        topHeadingsConsidered = 4
        bottomHeadingsConsidered = 6
    elif documentNumber == 1:
        topHeadingsConsidered = 3
        bottomHeadingsConsidered = 5
    elif documentNumber == 2:
        topHeadingsConsidered = 5
        bottomHeadingsConsidered = 15
    else:
        topHeadingsConsidered = 5
        bottomHeadingsConsidered = 10

    print(f"Starting Heading Extraction For File :- {fileNameDoc}")
    logger = MatchLogger(f"Heading Extraction {fileNameDoc}_{getRandomString(1)}", fileNameDoc, domain, procedureType, languageCode, documentNumber, fileNameLog)
    logger.logFlowCheckpoint("Starting Heading Extraction")

    stopWordlanguage = DocumentTypeNames(
        controlBasePath=controlBasePath,
        fileNameDocumentTypeNames=fileNameDocumentTypeNames,
        languageCode=languageCode,
        domain=domain,
        procedureType=procedureType,
        documentNumber=documentNumber
        ).extractStopWordLanguage()

    matchDocObj = MatchDocument(
        logger,
        controlBasePath,
        basePath,
        domain,
        procedureType,
        languageCode,
        documentNumber,
        fileNameDoc,
        fileNameQrd,
        fileNameMatchRuleBook,
        fileNameDocumentTypeNames,
        topHeadingsConsidered,
        bottomHeadingsConsidered,
        stopWordFilterLen,
        stopWordlanguage,
        isPackageLeaflet,
        medName)
    df, coll, documentType = matchDocObj.matchHtmlHeaddingsWithQrd()

    return df, coll, documentType


def parseDocument(controlBasePath, basePath ,htmlDocName, fileNameQrd, fileNameMatchRuleBook, fileNameDocumentTypeNames, medName = None):
    
    
    if "/" in basePath:
        pathSep = "/"        
    else:
        pathSep = "\\"
    
    fileNameLog = os.path.join(basePath,'FinalLog.txt')

    pathComponents = basePath.split(pathSep)
    print(pathComponents, htmlDocName)
    timestamp = pathComponents[-1]
    languageCode =  pathComponents[-2]
    medName = pathComponents[-3]
    procedureType = pathComponents[-4]
    domain = pathComponents[-5]

    print(timestamp, languageCode, medName, procedureType, domain)
        
    flowLogger =  MatchLogger(f"Flow Logger HTML_{getRandomString(1)}", htmlDocName, domain, procedureType, languageCode, "HTML", fileNameLog)
    
    metrics = Metrics(os.path.join(basePath,'Metrics.csv'),flowLogger)
    
    
    flowLogger.logFlowCheckpoint("Starting HTML Conversion To Json")
    ###Convert Html to Json
    fileNameJson, stylesFilePath = convertHtmlToJson(controlBasePath, basePath, domain, procedureType, languageCode, htmlDocName, fileNameQrd, fileNameLog)
    
    print("stylePath:-",stylesFilePath)
    flowLogger.logFlowCheckpoint("Completed HTML Conversion To Json")
    metrics.getMetric("HTML Conversion To Json")

    flowLogger.logFlowCheckpoint("Starting Json Split")

    ###Split Uber Json to multiple Jsons for each category.
    partitionedJsonPaths = splitJson(controlBasePath, basePath, domain, procedureType, languageCode, fileNameJson, fileNameQrd, fileNameLog)
    
    partitionedJsonPaths = [ path.split(pathSep)[-1] for path in partitionedJsonPaths]
    flowLogger.logFlowCheckpoint(str(partitionedJsonPaths))
    
    flowLogger.logFlowCheckpoint("Completed Json Split")
    metrics.getMetric("Split Json")
    
    flowLogger.logFlowCheckpoint("Started Processing Partitioned Jsons")
    
    for index, fileNamePartitioned in enumerate(partitionedJsonPaths):
        #print("Index", index)
        #if index == 0:
        #    print("Asdddddddddddddddddddddddddd")
        #    continue
        flowLogger.logFlowCheckpoint(f"\n\n\n\n||||||||||||||||||||||||||||||||{str(index)} ||||| {str(fileNamePartitioned)}||||||||||||||||||||||||||||||||\n\n\n\n")
        
        if index == 3:
            stopWordFilterLen = 100
            isPackageLeaflet = True
        else:
            stopWordFilterLen = 6
            isPackageLeaflet = False
            
        df, coll, documentType = extractAndValidateHeadings(controlBasePath,
                                    basePath,
                                    domain,
                                    procedureType,
                                    languageCode,
                                    index,
                                    fileNamePartitioned,
                                    fileNameQrd,
                                    fileNameMatchRuleBook,
                                    fileNameDocumentTypeNames,
                                    fileNameLog,
                                    stopWordFilterLen=stopWordFilterLen,
                                    isPackageLeaflet=isPackageLeaflet,
                                    medName=medName)
        
        
        print(f"Completed Heading Extraction For File")
        flowLogger.logFlowCheckpoint("Completed Heading Extraction For File")
        metrics.getMetric(f"{index}: Heading Extraction")

        print(f"Starting Document Annotation For File :- {fileNamePartitioned}")        
        flowLogger.logFlowCheckpoint("Starting Document Annotation For File")
        documentAnnotationObj = DocumentAnnotation(fileNamePartitioned,'c20835db4b1b4e108828a8537ff41506','https://spor-sit.azure-api.net/pms/api/v2/',df,coll)
        try:
            pms_oms_annotation_data = documentAnnotationObj.processRegulatedAuthorizationForDoc()
            print(pms_oms_annotation_data)
        except:
            pms_oms_annotation_data = None
            print("Error Found")
            
        print(f"Completed Document Annotation")        
        flowLogger.logFlowCheckpoint("Completed Document Annotation")
        metrics.getMetric(f"{index}: Document Annotation")
        
        print(f"Starting Extracting Content Between Heading For File :- {fileNamePartitioned}")        
        flowLogger.logFlowCheckpoint("Starting Extracting Content Between Heading")
        
        extractContentlogger =  MatchLogger(f'ExtractContentBetween_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)
        extractorObj = DataBetweenHeadingsExtractor(extractContentlogger, basePath, coll)
        dfExtractedHierRR = extractorObj.extractContentBetweenHeadings(fileNamePartitioned)
        
        print(f"Completed Extracting Content Between Heading")        
        flowLogger.logFlowCheckpoint("Completed Extracting Content Between Heading")
        metrics.getMetric(f"{index}: Content Extraction")
        
        xmlLogger =  MatchLogger(f'XmlGeneration_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)
        fhirXmlGeneratorObj = FhirXmlGenerator(xmlLogger, controlBasePath, basePath, pms_oms_annotation_data, stylesFilePath, medName)
        fileNameXml = fileNamePartitioned.replace('.json','.xml')
        generatedXml = fhirXmlGeneratorObj.generateXml(dfExtractedHierRR, fileNameXml)
        
        metrics.getMetric(f"{index}: Generate XML")
        
        fhirServiceLogger =  MatchLogger(f'XML Submission Logger_{index}_{getRandomString(1)}', fileNamePartitioned, domain, procedureType, languageCode, index, fileNameLog)

        fhirServiceObj = FhirService(fhirServiceLogger, basePath, generatedXml)
        fhirServiceObj.submitFhirXml()
        
        metrics.getMetric(f"{index}: Submit FHIR Msg")
        
        print(f"Created XML File For :- {fileNamePartitioned}")      
        
        #return df,coll,dfExtractedHierRR
    
    
    flowLogger.logFlowCheckpoint("Completed Processing Partitioned Jsons")
    metrics.getMetric(f"{index}: Completed")
    metrics.end()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vipsharm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
wordToHtmlConvertorObj = WordToHtmlConvertor()
wordToHtmlConvertorObj.convertWordToHTML()

2021-05-19 00:50:50,658 : WordToHtmlLogger_M : Input file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\ABASAGLAR~H~CAP~en.docx | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx
2021-05-19 00:50:50,659 : WordToHtmlLogger_M : Output file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\ABASAGLAR\en\2021-05-18T19-20-50Z\ABASAGLAR_clean | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx


Word Files in folder:  ['ABASAGLAR~H~CAP~en.docx', 'Abilify Maintena~H~CAP~en.doc', 'ABILIFY~H~CAP~en.doc', 'Adakveo~H~CAP~en.docx', 'Adcetris~H~CAP~en.doc', '~$ilify Maintena~H~CAP~en.doc']
Input file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\ABASAGLAR~H~CAP~en.docx
Output file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\ABASAGLAR\en\2021-05-18T19-20-50Z\ABASAGLAR_clean


2021-05-19 00:51:09,654 : WordToHtmlLogger_M : Opened file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\ABASAGLAR~H~CAP~en.docx | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx


Opened file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\ABASAGLAR~H~CAP~en.docx


2021-05-19 00:51:11,212 : WordToHtmlLogger_M : Starting document cleaning process | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx



Checking table 20
The selection starts on page 105 of 106 (69.44999694824219/70.75)
The selection ends on page 105 of 106 (384.8500061035156/70.75)
The selection contains
* overlay images

Checking table 19
The selection starts on page 103 of 106 (579.2000122070312/70.75)
The selection ends on page 105 of 106 (56.70000076293945/70.75)
The selection contains
* overlay images

Checking table 18
The selection starts on page 102 of 105 (564.5499877929688/214.75)
The selection ends on page 103 of 105 (421.79998779296875/70.75)
The selection contains
* overlay images
* overlay shapes

Checking table 17
The selection starts on page 102 of 105 (56.70000076293945/70.75)
The selection ends on page 102 of 105 (475.95001220703125/70.75)
The selection contains
* overlay images

Checking table 16
The selection starts on page 100 of 105 (577.9000244140625/70.75)
The selection ends on page 101 of 105 (484.25/70.75)
The selection contains
* overlay images

Checking table 15
The selection starts on pag

2021-05-19 00:51:51,072 : WordToHtmlLogger_M : Completed document cleaning process | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx
2021-05-19 00:51:51,078 : WordToHtmlLogger_M : Preparing zip file | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx
2021-05-19 00:51:52,742 : WordToHtmlLogger_M : Zip file created: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\ABASAGLAR~H~CAP~en~2021-05-18T19-20-50Z.zip | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx
2021-05-19 00:51:52,746 : WordToHtmlLogger_M : Uploading to Azure Storage as blob:
	F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\ABASAGLAR~H~CAP~en~2021-05-18T19-20-50Z.zip | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx



Uploading File to  Azure Storage:
	F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\ABASAGLAR~H~CAP~en~2021-05-18T19-20-50Z.zip


2021-05-19 00:52:05,189 : WordToHtmlLogger_M : Uploaded F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\ABASAGLAR~H~CAP~en~2021-05-18T19-20-50Z.zipsuccessfully | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx
2021-05-19 00:52:05,190 : WordToHtmlLogger_M : Deleting input word file: ABASAGLAR~H~CAP~en.docx | H | CAP |  en | .docx | ABASAGLAR~H~CAP~en.docx
2021-05-19 00:52:05,217 : WordToHtmlLogger_2 : Input file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\Abilify Maintena~H~CAP~en.doc | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc
2021-05-19 00:52:05,218 : WordToHtmlLogger_2 : Output file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Abilify Maintena\en\2021-05-18T19-22-05Z\Abilify Maintena_clean | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc


Input file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\Abilify Maintena~H~CAP~en.doc
Output file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Abilify Maintena\en\2021-05-18T19-22-05Z\Abilify Maintena_clean


2021-05-19 00:54:18,892 : WordToHtmlLogger_2 : Opened file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\Abilify Maintena~H~CAP~en.doc | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc


Opened file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\data\Ingest\Abilify Maintena~H~CAP~en.doc


2021-05-19 00:54:20,499 : WordToHtmlLogger_2 : Starting document cleaning process | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc



Checking table 23
The selection starts on page 95 of 95 (619.3499755859375/70.75)
The selection ends on page 95 of 95 (737.2000122070312/70.75)
The selection contains
* inline images

Checking table 22
The selection starts on page 95 of 95 (303.29998779296875/70.75)
The selection ends on page 95 of 95 (480.3999938964844/70.75)
The selection contains
* inline images

Checking table 21
The selection starts on page 95 of 95 (107.05000305175781/70.75)
The selection ends on page 95 of 95 (227.4499969482422/70.75)
The selection contains
* inline images

Checking table 20
The selection starts on page 94 of 95 (653.0999755859375/70.75)
The selection ends on page 95 of 95 (56.70000076293945/70.75)
The selection contains
* inline images

Checking table 19
The selection starts on page 94 of 95 (494.45001220703125/70.75)
The selection ends on page 94 of 95 (627.5999755859375/70.75)
The selection contains
* inline images

Checking table 18
The selection starts on page 94 of 95 (353.0/70.75)
The se

2021-05-19 00:54:46,529 : WordToHtmlLogger_2 : Completed document cleaning process | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc
2021-05-19 00:54:46,532 : WordToHtmlLogger_2 : Preparing zip file | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc
2021-05-19 00:54:47,821 : WordToHtmlLogger_2 : Zip file created: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\Abilify Maintena~H~CAP~en~2021-05-18T19-22-05Z.zip | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc
2021-05-19 00:54:47,825 : WordToHtmlLogger_2 : Uploading to Azure Storage as blob:
	F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\Abilify Maintena~H~CAP~en~2021-05-18T19-22-05Z.zip | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc



Uploading File to  Azure Storage:
	F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\Abilify Maintena~H~CAP~en~2021-05-18T19-22-05Z.zip


2021-05-19 00:54:59,563 : WordToHtmlLogger_2 : Uploaded F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob\Abilify Maintena~H~CAP~en~2021-05-18T19-22-05Z.zipsuccessfully | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc
2021-05-19 00:54:59,564 : WordToHtmlLogger_2 : Deleting input word file: Abilify Maintena~H~CAP~en.doc | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc
2021-05-19 00:54:59,573 : WordToHtmlLogger_2 : Killing Word processes as exception was raised | H | CAP |  en | .doc | Abilify Maintena~H~CAP~en.doc


Exception raised
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE
Killing WINWORD.EXE


NoSuchProcess: psutil.NoSuchProcess process no longer exists (pid=34304)

In [3]:
# inputZipFolderPath = "F:\Projects\EMA\Repository\EMA EPI PoC\\function_code\\inputblob"
inputZipFolderPath = os.path.abspath(os.path.join('..'))
inputZipFolderPath = os.path.join(inputZipFolderPath, 'inputblob')
inputZipFileName = "Zynteglo~H~CAP~en~2021-05-19T07-52-54Z.zip"

In [4]:
fileNameQrd = 'qrd_canonical_model.csv'
fileNameMatchRuleBook = 'ruleDict.json'
fileNameDocumentTypeNames = 'documentTypeNames.json'
fsMountName = '/mounted'

info = inputZipFileName.split("~")

try:
    medName = info[0]
    domain = info[1]
    procedureType = info[2]
    languageCode = info[3]
    timestamp = info[4]
    timestamp = timestamp.replace(".zip","")

except Exception:
    raise f"Missing required info in the zip file name {inputZipFileName}"

if "\\" in os.getcwd():
    localEnv = True
    inputZipFolderPath = os.path.join(os.path.abspath(os.path.join('..')),inputZipFolderPath)
    outputFolderPath = os.path.join(os.path.abspath(os.path.join('..')), 'work', f"{domain}", f"{procedureType}", f"{medName}", f"{languageCode}", f"{timestamp}")
    controlFolderPath = os.path.join(os.path.abspath(os.path.join('..')),'control')
else:
    localEnv = False
    inputZipFolderPath = os.path.join(f'{fsMountName}',inputZipFolderPath)
    outputFolderPath = os.path.join(f'{fsMountName}', 'work', f"{domain}", f"{procedureType}", f"{medName}", f"{languageCode}", f"{timestamp}")
    controlFolderPath = os.path.join(f'{fsMountName}','control')


print(inputZipFileName, inputZipFolderPath, outputFolderPath, controlFolderPath)

mode = 0o666

if localEnv is True:
    inputZipFolderPath = inputZipFolderPath.replace("/","\\")
    outputFolderPath = outputFolderPath.replace("/","\\")
    controlFolderPath = controlFolderPath.replace("/","\\")

try:
    os.makedirs(inputZipFolderPath, mode)
    os.makedirs(outputFolderPath, mode)
    os.makedirs(controlFolderPath, mode)

except Exception:
    print("Already Present")
    
with zipfile.ZipFile(f'{inputZipFolderPath}/{inputZipFileName}',"r") as zip_ref:
        zip_ref.extractall(outputFolderPath)
    

_,_,fileNames = next(os.walk(outputFolderPath))
htmlFileName = [fileName for fileName in fileNames if ".htm" in fileName][0]

print(htmlFileName)



Zynteglo~H~CAP~en~2021-05-19T07-52-54Z.zip F:\Projects\EMA\Repository\EMA EPI PoC\function_code\inputblob F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z F:\Projects\EMA\Repository\EMA EPI PoC\function_code\control
Already Present
Zynteglo_clean.htm


In [30]:
parseDocument(controlFolderPath, outputFolderPath, htmlFileName, fileNameQrd, fileNameMatchRuleBook, fileNameDocumentTypeNames, medName)

2021-05-19 05:08:05,512 : Flow Logger HTML_Z : Starting HTML Conversion To Json | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-19 05:08:05,518 : Style Dictionary_Q : Reading style dictionary in file: rule_dictionary_en.json | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-19 05:08:05,551 : Style Dictionary_Q : Qrd Section Keys Retrieved For Style Dictionary: ANNEX I, ANNEX II, ANNEX III, B. PACKAGE LEAFLET | H | CAP |  en | HTML | Zynteglo_clean.htm


['F:', 'Projects', 'EMA', 'Repository', 'EMA EPI PoC', 'function_code', 'work', 'H', 'CAP', 'Zynteglo', 'en', '2021-05-19T07-52-54Z'] Zynteglo_clean.htm
2021-05-19T07-52-54Z en Zynteglo CAP H
------------- F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.txt -----------------
F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\Zynteglo_clean.htm F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.json


2021-05-19 05:08:05,989 : Parser_q : Style Information Stored In File: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.txt | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-19 05:08:05,989 : Parser_q : Style Information Stored In File: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.txt | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-19 05:08:05,989 : Parser_q : Style Information Stored In File: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.txt | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-19 05:08:08,181 : Parser_q : Writing to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.json | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-19 05:08:08,181 : Parser_q : Writing to 

stylePath:- F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.txt
Metrics : HTML Conversion To Json,0.047 Min,15.175288 MB,35.821356 MB,67.5

PathJson F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\outputJSON\Zynteglo_clean.json


2021-05-19 05:08:08,593 : Partition_U : Writing partition to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\partitionedJSONs\Zynteglo_clean_ANNEX III.json | H | CAP |  en | Json | Zynteglo_clean.json
2021-05-19 05:08:08,600 : Partition_U : Writing partition to file: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\partitionedJSONs\Zynteglo_clean_ PACKAGE LEAFLET.json | H | CAP |  en | Json | Zynteglo_clean.json
2021-05-19 05:08:08,610 : Flow Logger HTML_Z : ['Zynteglo_clean_SmPC.json', 'Zynteglo_clean_ANNEX II.json', 'Zynteglo_clean_ANNEX III.json', 'Zynteglo_clean_ PACKAGE LEAFLET.json'] | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-19 05:08:08,611 : Flow Logger HTML_Z : Completed Json Split | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-19 05:08:08,612 : Flow Logger HTML_Z : Started Processing Partitioned Jsons | H | CAP |  en | HTML | Zynteglo_clean.htm
2021-05-19 05:08

Metrics : Split Json,0.004 Min,0.296236 MB,8.739354 MB,67.4

Starting Heading Extraction For File :- Zynteglo_clean_SmPC.json
File being processed: F:\Projects\EMA\Repository\EMA EPI PoC\function_code\work\H\CAP\Zynteglo\en\2021-05-19T07-52-54Z\partitionedJSONs\Zynteglo_clean_SmPC.json
--------------------------------------------
SmPC


2021-05-19 05:08:08,977 : Heading Extraction Zynteglo_clean_SmPC.json_h : Started Extracting Heading | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json
2021-05-19 05:08:09,143 : Heading Extraction Zynteglo_clean_SmPC.json_h : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'SUMMARY OF PRODUCT CHARACTERISTICS' | Qrd txt :- 'SUMMARY OF PRODUCT CHARACTERISTICS' | Matched :- 'True'
2021-05-19 05:08:09,148 : Heading Extraction Zynteglo_clean_SmPC.json_h : Validation Passed As This The First Heading | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20001' | prevHeadingCurrId :- '' | prevHeadingFoundId :- ''
2021-05-19 05:08:09,304 : Heading Extraction Zynteglo_clean_SmPC.json_h : Match Passed : checkLowerCase|2.88|(99, 100, 99)|0.919| | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'medicinal product subject additional monitoring. allow quick identification new safety information. healthcare professionals asked report suspected adverse reactions


OriginalCheck

----------------------------------
RemovedByStyle
----------------------------------


 dispersion for infusion. ' | Qrd txt :- '6.6 Special precautions for disposal <and other handling>' | Matched :- 'False'0.62| | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'Zynteglo 1.2‑20 × 106 cells/mL
2021-05-19 05:08:09,724 : Heading Extraction Zynteglo_clean_SmPC.json_h : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- '2. QUALITATIVE AND QUANTITATIVE COMPOSITION' | Qrd txt :- '2. QUALITATIVE AND QUANTITATIVE COMPOSITION' | Matched :- 'True'
2021-05-19 05:08:09,733 : Heading Extraction Zynteglo_clean_SmPC.json_h : Validation Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20004' | prevHeadingCurrId :- '20003' | prevHeadingFoundId :- '20003'
2021-05-19 05:08:09,759 : Heading Extraction Zynteglo_clean_SmPC.json_h : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- '2.1 General description' | Qrd txt :- '2.1 General description' | Matched :- 'True'
2021-05-19 05:08:09,768 : Heading Extraction Zynteg


OriginalCheck



2021-05-19 05:08:10,190 : Heading Extraction Zynteglo_clean_SmPC.json_h : Match Passed : <=4|11.11|(95, 89, 95)|0.989| | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'Excipient with known effect' | Qrd txt :- 'Excipient(s) with known effect' | Matched :- 'True'
2021-05-19 05:08:10,202 : Heading Extraction Zynteglo_clean_SmPC.json_h : Validation Failed As Current H3 Heading Is Not Part Of Valid H3 Headings in Previous H2 | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20007' | prevHeadingCurrId :- '20006' | prevHeadingFoundId :- '20006'
2021-05-19 05:08:10,854 : Heading Extraction Zynteglo_clean_SmPC.json_h : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- '3. PHARMACEUTICAL FORM' | Qrd txt :- '3. PHARMACEUTICAL FORM' | Matched :- 'True'
2021-05-19 05:08:10,866 : Heading Extraction Zynteglo_clean_SmPC.json_h : Validation Flow Is Broken | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20008' | prevHeadingCurrId :- '' | 

2021-05-19 05:08:14,976 : Heading Extraction Zynteglo_clean_SmPC.json_h : Validation Flow Is Broken | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20031' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '20011'
2021-05-19 05:08:14,990 : Heading Extraction Zynteglo_clean_SmPC.json_h : Validation Failed As Current H3 Heading Is Not Part Of Valid H3 Headings in Previous H2 | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20031' | prevHeadingCurrId :- '20011' | prevHeadingFoundId :- '20011'
2021-05-19 05:08:15,009 : Heading Extraction Zynteglo_clean_SmPC.json_h : Match Passed | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | Doc txt :- 'Paediatric population' | Qrd txt :- 'Paediatric population' | Matched :- 'True'
2021-05-19 05:08:15,020 : Heading Extraction Zynteglo_clean_SmPC.json_h : Validation Flow Is Broken | H | CAP |  en | 0 | Zynteglo_clean_SmPC.json | currHeadId :- '20037' | prevHeadingCurrId :- '' | prevHeadingFoundId :- '20011'
2021-05-19 05:08:1

KeyboardInterrupt: 

In [20]:
a

Unnamed: 0,Bold,Classes,Element,HasBorder,ID,Indexed,IsHeadingType,IsListItem,IsPossibleHeading,Italics,ParentId,Styles,Text,Underlined,Uppercased,StringLength
0,True,['MsoNormal'],"<p align=""center"" class=""MsoNormal"" style=""margin-top:0in;margin-right:-.1pt;\r margin-bottom:0in;margin-left:27.0pt;text-align:center;text-indent:-27.0pt;\r line-height:normal""><b><span lang=""EN-...",False,320efb82-a156-45a3-afdd-0e3a7e54d6da,False,,False,True,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:-.1pt;\r\nmargin-bottom:0in;margin-left:27.0pt;text-align:center;text-indent:-27.0pt;\r\nline-height:normal,ANNEX\r II,False,True,8
1,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:-.1pt;margin-bottom:0in;\r margin-left:0in;line-height:normal""></p>",False,f79ed8ed-bfc4-4e8b-bd49-ede302291e8a,False,,False,False,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:-.1pt;margin-bottom:0in;\r\nmargin-left:0in;line-height:normal,,False,False,0
2,True,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""><b><span lang=""EN-GB"" style='font-family:""Times New ...",False,7b2ff4b3-f27a-47ca-96a9-7ae24cfdefab,True,L1,False,True,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,A. MANUFACTURER(S) OF\r THE BIOLOGICAL ACTIVE SUBSTANCE(S) AND MANUFACTURER(S) RESPONSIBLE FOR BATCH\r RELEASE,False,True,106
3,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""></p>",False,d320b69e-a3c5-4764-aa31-251a32af8a17,False,,False,False,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,,False,False,0
4,True,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""><b><span lang=""EN-GB"" style='font-family:""Times New ...",False,e67b4687-f1a0-4608-988a-bcbb07029721,True,L1,False,True,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,B. CONDITIONS OR\r RESTRICTIONS REGARDING SUPPLY AND USE,False,True,54
5,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""></p>",False,a9013513-966a-4638-818e-90def54e0698,False,,False,False,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,,False,False,0
6,True,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""><b><span lang=""EN-GB"" style='font-family:""Times New ...",False,f9007321-3055-4120-b8ed-79e72cb470ae,True,L1,False,True,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,C. OTHER CONDITIONS AND\r REQUIREMENTS OF THE MARKETING AUTHORISATION,False,True,67
7,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""></p>",False,1b70d864-ef01-4bad-91f3-54dcbbe56cef,False,,False,False,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,,False,False,0
8,True,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""><b><span lang=""EN-GB"" style='font-family:""Times New ...",False,71e9cc22-be86-4f43-b932-f89b799916ba,True,L1,False,True,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,D. CONDITIONS OR\r RESTRICTIONS WITH REGARD TO THE SAFE AND EFFECTIVE USE OF THE MEDICINAL PRODUCT,False,True,96
9,False,['MsoNormal'],"<p class=""MsoNormal"" style=""margin-top:0in;margin-right:70.8pt;margin-bottom:\r 0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal""></p>",False,d7fd1fa1-ea7c-4b20-8b1b-11cbcafde70c,False,,False,False,False,e337b19e-80aa-4ffa-ac3b-5d9fa03a7fe4,margin-top:0in;margin-right:70.8pt;margin-bottom:\r\n0in;margin-left:85.05pt;text-indent:-35.4pt;line-height:normal,,False,False,0


In [14]:
convertCollectionToDataFrame(b)

Unnamed: 0,index,id,Procedure type,Display code,Name,parent_id,htmlText,htmlIndex,htmlId,SubSectionIndex,doc_parent_id
0,680,20001,CAP,,SUMMARY OF PRODUCT CHARACTERISTICS,,SUMMARY\r OF PRODUCT CHARACTERISTICS,26,7b37353a-5e3e-400a-9acf-6b44224c82e4,0,
1,682,20003,CAP,1.0,NAME OF THE MEDICINAL PRODUCT,20001.0,1. NAME OF THE\r MEDICINAL PRODUCT,33,62ecb297-5186-4c91-8144-6540859f879b,0,20001.0
2,683,20004,CAP,2.0,QUALITATIVE AND QUANTITATIVE COMPOSITION,20001.0,2. QUALITATIVE AND\r QUANTITATIVE COMPOSITION,38,5003c211-0a0d-4c38-8258-71d672ed3498,0,20001.0
3,684,20005,CAP,2.1,General description,20004.0,2.1 General description,40,ae7ffbb8-396b-4813-95f5-9f3eb67bcfa3,0,20004.0
4,685,20006,CAP,2.2,Qualitative and quantitative composition,20004.0,2.2 Qualitative and\r quantitative composition,44,054b75a3-b7a0-4877-b8f4-65cd9f0b5bc7,0,20004.0
5,686,20007,CAP,,Excipient(s) with known effect,20006.0,Excipient with known effect,50,708fa882-ab8b-49b7-8745-a2757401ec2f,0,20006.0
6,687,20008,CAP,3.0,PHARMACEUTICAL FORM,20001.0,3. PHARMACEUTICAL\r FORM,57,cd25c04e-2fec-477f-a67d-b894d46f03c9,0,20001.0
7,688,20009,CAP,4.0,CLINICAL PARTICULARS,20001.0,4. CLINICAL\r PARTICULARS,64,12a07622-b5ed-483d-bda8-96814b599bc9,0,20001.0
8,689,20010,CAP,4.1,Therapeutic indications,20009.0,4.1 Therapeutic\r indication,66,4dcf34cc-26ef-4720-87fa-435048832ef7,0,20009.0
9,690,20011,CAP,4.2,Posology and method of administration,20009.0,4.2 Posology and\r method of administration,70,68b9ace8-fc3c-429f-92b8-e983e9d1e388,0,20009.0


In [28]:
dfCanonicalModel = pd.read_csv(f'{controlFolderPath}\\qrdTemplate\\{fileNameQrd}', encoding= 'utf-8')

colsofInterest  = ['id','domain','Procedure type', 'Document type', 'Language code',
'Display code', 'Name', 'parent_id', 'Mandatory','heading_id']

dfCanonicalModel = dfCanonicalModel[colsofInterest]
dfCanonicalModel['document_number'] = None

dfCanonicalModel

Unnamed: 0,id,domain,Procedure type,Document type,Language code,Display code,Name,parent_id,Mandatory,heading_id,document_number
0,1,H,CAP,SmPC,bg,,КРАТКА ХАРАКТЕРИСТИКА НА ПРОДУКТА,,True,1,
1,2,H,CAP,SmPC,bg,,qТози лекарствен продукт подлежи на допълнително наблюдение. Това ще позволи бързото установяване на нова информация относно безопасността. От медицинските специалисти се изисква да съобщават всяк...,1.0,False,2,
2,3,H,CAP,SmPC,bg,1,ИМЕ НА ЛЕКАРСТВЕНИЯ ПРОДУКТ,1.0,True,3,
3,4,H,CAP,SmPC,bg,2,КАЧЕСТВЕН И КОЛИЧЕСТВЕН СЪСТАВ,1.0,True,4,
4,5,H,CAP,SmPC,bg,2.1,Общо описание,4.0,False,5,
5,6,H,CAP,SmPC,bg,2.2,Качествен и количествен състав,4.0,False,6,
6,7,H,CAP,SmPC,bg,,Помощно(и) вещество(а) с известно действие,6.0,False,7,
7,8,H,CAP,SmPC,bg,3,ЛЕКАРСТВЕНА ФОРМА,1.0,True,8,
8,9,H,CAP,SmPC,bg,4,КЛИНИЧНИ ДАННИ,1.0,True,9,
9,10,H,CAP,SmPC,bg,4.1,Терапевтични показания,9.0,True,10,


In [22]:
dfCanonicalModel['']
display(dfCanonicalModel[(dfCanonicalModel['Document type'] == 'SmPC')][(dfCanonicalModel['Language code'] == 'en')])


  


Unnamed: 0,id,domain,Procedure type,Document type,Language code,Display code,Name,parent_id,Mandatory,heading_id,document_number
680,20001,H,CAP,SmPC,en,,SUMMARY OF PRODUCT CHARACTERISTICS,,True,1,0
681,20002,H,CAP,SmPC,en,,qThis medicinal product is subject to additional monitoring. This will allow quick identification of new safety information. Healthcare professionals are asked to report any suspected adverse reac...,20001.0,False,2,0
682,20003,H,CAP,SmPC,en,1.0,NAME OF THE MEDICINAL PRODUCT,20001.0,True,3,0
683,20004,H,CAP,SmPC,en,2.0,QUALITATIVE AND QUANTITATIVE COMPOSITION,20001.0,True,4,0
684,20005,H,CAP,SmPC,en,2.1,General description,20004.0,False,5,0
685,20006,H,CAP,SmPC,en,2.2,Qualitative and quantitative composition,20004.0,False,6,0
686,20007,H,CAP,SmPC,en,,Excipient(s) with known effect,20006.0,False,7,0
687,20008,H,CAP,SmPC,en,3.0,PHARMACEUTICAL FORM,20001.0,True,8,0
688,20009,H,CAP,SmPC,en,4.0,CLINICAL PARTICULARS,20001.0,True,9,0
689,20010,H,CAP,SmPC,en,4.1,Therapeutic indications,20009.0,True,10,0


In [29]:
dfCanonicalModel['document_number'] = self.documentNumber if dfCanonicalModel['Document type'] == 'SmPC' else None

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().