In [1]:
import unittest
import os
from parse.extractor.parser import parserExtractor
from parse.rulebook.rulebook import StyleRulesDictionary
from utils.logger.matchLogger import MatchLogger
from utils.config import config
from collections import defaultdict
from bs4 import NavigableString, BeautifulSoup
import uuid
import json
from scripts.jsonHandlingUtils import loadJSON_Convert_to_DF, mkdir, addjson
import pandas as pd
import random
import string


class TestParserExtractor(unittest.TestCase):
    
    
    def createHtmlDataframe(self, jsonFilePath):
        '''
        Create dataframe from the partitionedJson files.

        '''
        output_filename = os.path.join(jsonFilePath)
        print('File being processed: ' + output_filename)
        print("--------------------------------------------")
        # print(output_filename)
        with open(output_filename, encoding='utf-8') as f:
            json_html = json.load(f)

        dic_json = {}
        #print(json_html)
        #print(type(json_html))
        for i in json_html['data']:
            for j in i.keys():
                dic_json = addjson(dic_json, j, i[j])

        # print(loadJSON_Convert_to_DF(output_filename))
        df = pd.DataFrame(dic_json)
        # print(df.shape)
        # display(df.head(5))
        return df

    def getRandomString(self,N):
        str_ = ''.join(random.choice(string.ascii_uppercase + string.digits
                                     + string.ascii_lowercase) for _ in range(N))
        return str_

    def parseTestHtml(self):
        
        basePath = os.path.join(os.path.abspath(os.path.join('..')),"testWork")
        controlBasePath = os.path.join(os.path.abspath(os.path.join('..')),'control')
        htmlDocName = "testDoc.html"
        domain = "H"
        procedureType = "CAP"
        languageCode = "el"
        fileNameQrd = 'qrd_canonical_model.csv'
        output_json_path = os.path.join(basePath)
        fileNameLog = os.path.join(basePath,'FinalLog.txt')
        logger = MatchLogger(f'TestParser_{self.getRandomString(1)}', "testDoc.html",
                             "H", "CAP", "el", "HTML", fileNameLog)

        styleLogger = MatchLogger(
            f'Style Dictionary_{self.getRandomString(1)}', htmlDocName, domain, procedureType, languageCode, "HTML", fileNameLog)

        styleRulesObj = StyleRulesDictionary(logger=styleLogger,
                                             controlBasePath=controlBasePath,
                                             language=languageCode,
                                             fileName=fileNameQrd,
                                             domain=domain,
                                             procedureType=procedureType
                                             )
        
        parserObj = parserExtractor(config, logger, styleRulesObj.styleRuleDict,
                                    styleRulesObj.styleFeatureKeyList,
                                    styleRulesObj.qrd_section_headings)
        

        output_filename = os.path.join(output_json_path, htmlDocName)
        style_filepath =  output_filename.replace('.html','.txt')
        style_filepath =  style_filepath.replace('.txtl','.txt')
        style_filepath =  style_filepath.replace('.htm','.txt')
        print("-------------",style_filepath,"-----------------")

        output_filename = output_filename.replace('.html', '.json')
        output_filename = output_filename.replace('.htm', '.json')
        input_filename = os.path.join(basePath, htmlDocName)
        print(input_filename, output_filename)
        parserObj.createPIJsonFromHTML(input_filepath=input_filename,
                                       output_filepath=output_filename,
                                       style_filepath = style_filepath,
                                       img_base64_dict=parserObj.convertImgToBase64(input_filename)
                                       )
        
        return parserObj, output_filename, style_filepath
        #return self.assertIn('parserExtractor', str(type(parserObj)))
        
    def createNewFeatureObj(self, styleFeatureKeyList, defaultValue = None):
        featureDict = {}
        for key in styleFeatureKeyList: 
            featureDict[key] = defaultValue
        return featureDict
    
    def extractFeaturesFromOutput(self, dfRow):
        
        styleFeatureKeyList = ['Bold', 'Italics', 'Uppercased', 'Underlined', 'Indexed', 'IsListItem', 'HasBorder']
        
        
        finalDict = self.createNewFeatureObj(styleFeatureKeyList)
        #print("Init",finalDict)
        for key in styleFeatureKeyList:
            finalDict[key] = dfRow[key][0]
        
        return dict(finalDict)
    
    def testFeaturesForTestHtmlElement(self, text, resultFeatures):
        
        parserObj, output_filename, style_filepath = self.parseTestHtml()
        df = self.createHtmlDataframe(output_filename)

        final = self.extractFeaturesFromOutput(df[df['Text'].str.contains(text)])
        
        return self.assertEqual(final,resultFeatures)
    
    def testFeaturesForHtmlElement(self, htmlString, parentFeatures, resultFeatures):
        
        soup = BeautifulSoup(htmlString, "html.parser")

        parserObj, output_filename, style_filepath = self.parseTestHtml()
        
        styleFeatureKeyList = ['Bold', 'Italics', 'Underlined']
        tagDict = {'Bold':'b', 'Italics':'i', 'Underlined': 'u'}
        finalFeatues = tagDict
        
        styleDataReader = open(style_filepath, 'r')
        css_in_style = BeautifulSoup(styleDataReader, "html.parser")

        css_in_style = parserObj.cleanCssString(str(css_in_style))
        class_style_dict = parserObj.parseClassesInStyle(css_in_style)

        for feature in styleFeatureKeyList:
            finalFeatues[feature] = parserObj.checkAllChildrenForFeature(soup, parentFeatures , tagDict[feature], class_style_dict)
         
        return self.assertEqual(finalFeatues,resultFeatures)

In [3]:
# Heading paragraph with no inline b/i/u tags: the features inherited from
# the parent are expected to come back unchanged.
TestParserExtractor().testFeaturesForHtmlElement(
    htmlString='<p class="EMEAHeading1"><span lang="EL">1.&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </span><span lang="EN-GB">ONOMA</span><span lang="EL">Σ</span><span lang="EN-GB">IA</span><span lang="EN-GB"> </span><span lang="EN-GB">TOY</span><span lang="EL"> Φ</span><span lang="EN-GB">APMAKEYTIKOY</span><span lang="EL"> Π</span><span lang="EN-GB">PO</span><span lang="EL">Ϊ</span><span lang="EN-GB">ONTO</span><span lang="EL">Σ</span></p>',
    parentFeatures={'Bold': True, 'Italics': False, 'Underlined': False},
    resultFeatures={'Bold': True, 'Italics': False, 'Underlined': False},
)

2021-06-22 03:12:36,797 : Style Dictionary_r : Reading style dictionary in file: rule_dictionary_el.json | H | CAP |  el | HTML | testDoc.html
2021-06-22 03:12:36,833 : Style Dictionary_r : Qrd Section Keys Retrieved For Style Dictionary: ΠΕΡΙΛΗΨΗ ΤΩΝ ΧΑΡΑΚΤΗΡΙΣΤΙΚΩΝ ΤΟΥ ΠΡΟΪΟΝΤΟΣ, ΠΑΡΑΡΤΗΜΑ II, A. ΕΠΙΣΗΜΑΝΣΗ, B. ΦΥΛΛΟ ΟΔΗΓΙΩΝ ΧΡΗΣΗΣ | H | CAP |  el | HTML | testDoc.html


------------- D:\Projects\EMA\Repository\EMA EPI PoC\function_code\testWork\testDoc.txt -----------------
D:\Projects\EMA\Repository\EMA EPI PoC\function_code\testWork\testDoc.html D:\Projects\EMA\Repository\EMA EPI PoC\function_code\testWork\testDoc.json


2021-06-22 03:12:37,054 : TestParser_S : Style Information Stored In File: D:\Projects\EMA\Repository\EMA EPI PoC\function_code\testWork\testDoc.txt | H | CAP |  el | HTML | testDoc.html
2021-06-22 03:12:37,079 : TestParser_S : Writing to file: D:\Projects\EMA\Repository\EMA EPI PoC\function_code\testWork\testDoc.json | H | CAP |  el | HTML | testDoc.html


In [4]:
# Check the features of the parsed element of testDoc.html whose text
# contains the given word.
TestParserExtractor().testFeaturesForTestHtmlElement(
    text='ΧΑΡΑΚΤΗΡΙΣΤΙΚΩΝ',
    resultFeatures={
        'Bold': True,
        'Italics': False,
        'Uppercased': True,
        'Underlined': False,
        'Indexed': False,
        'IsListItem': False,
        'HasBorder': False,
    },
)

2021-06-22 03:12:39,196 : Style Dictionary_r : Reading style dictionary in file: rule_dictionary_el.json | H | CAP |  el | HTML | testDoc.html
2021-06-22 03:12:39,196 : Style Dictionary_r : Reading style dictionary in file: rule_dictionary_el.json | H | CAP |  el | HTML | testDoc.html
2021-06-22 03:12:39,233 : Style Dictionary_r : Qrd Section Keys Retrieved For Style Dictionary: ΠΕΡΙΛΗΨΗ ΤΩΝ ΧΑΡΑΚΤΗΡΙΣΤΙΚΩΝ ΤΟΥ ΠΡΟΪΟΝΤΟΣ, ΠΑΡΑΡΤΗΜΑ II, A. ΕΠΙΣΗΜΑΝΣΗ, B. ΦΥΛΛΟ ΟΔΗΓΙΩΝ ΧΡΗΣΗΣ | H | CAP |  el | HTML | testDoc.html
2021-06-22 03:12:39,233 : Style Dictionary_r : Qrd Section Keys Retrieved For Style Dictionary: ΠΕΡΙΛΗΨΗ ΤΩΝ ΧΑΡΑΚΤΗΡΙΣΤΙΚΩΝ ΤΟΥ ΠΡΟΪΟΝΤΟΣ, ΠΑΡΑΡΤΗΜΑ II, A. ΕΠΙΣΗΜΑΝΣΗ, B. ΦΥΛΛΟ ΟΔΗΓΙΩΝ ΧΡΗΣΗΣ | H | CAP |  el | HTML | testDoc.html


------------- D:\Projects\EMA\Repository\EMA EPI PoC\function_code\testWork\testDoc.txt -----------------
D:\Projects\EMA\Repository\EMA EPI PoC\function_code\testWork\testDoc.html D:\Projects\EMA\Repository\EMA EPI PoC\function_code\testWork\testDoc.json


2021-06-22 03:12:39,454 : TestParser_C : Style Information Stored In File: D:\Projects\EMA\Repository\EMA EPI PoC\function_code\testWork\testDoc.txt | H | CAP |  el | HTML | testDoc.html
2021-06-22 03:12:39,470 : TestParser_C : Writing to file: D:\Projects\EMA\Repository\EMA EPI PoC\function_code\testWork\testDoc.json | H | CAP |  el | HTML | testDoc.html


File being processed: D:\Projects\EMA\Repository\EMA EPI PoC\function_code\testWork\testDoc.json
--------------------------------------------
