## The File for generating HTML tags

In [1]:
import os; from io import BytesIO; import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import time
import re

from tesserocr import PyTessBaseAPI, RIL, iterate_level, PT, OEM, Justification

In [2]:
# Folder holding the input PDFs, relative to the notebook. The trailing separator matters:
# pdf2folderimages concatenates this string directly with the file name.
directory = "..\\datafiles\\"

In [9]:
def pdf2folderimages (dir_path, file):
    '''
    Convert a pdf file to png images (one per page) with ImageMagick's `convert`.
    @params:
        dir_path - directory path which contains the pdf files; it is concatenated
                   with `file` as-is, so it must end with a path separator :str
        file - pdf file name :str
    @return:
        Path of folder which contains images of input pdf file, or False on failure :str/bool
    '''
    import subprocess  # local import: only this function needs it

    pdf_path = dir_path + file
    if not os.path.isfile(pdf_path):
        # Nothing to convert; report failure before creating any folder.
        return False

    # splitext keeps dots inside the name ("a.b.pdf" -> "a.b") instead of cutting at the first one.
    newfolderpath = dir_path + os.path.splitext(file)[0]
    try:
        os.makedirs(newfolderpath, exist_ok=True)
        # Argument list (no shell): paths with spaces or shell metacharacters are passed verbatim.
        completed = subprocess.run(
            ["convert", "-units", "PixelsPerInch", "-density", "300",
             pdf_path, os.path.join(newfolderpath, "page.png")]
        )
    except OSError:
        # Folder could not be created or the `convert` executable could not be started.
        return False

    if completed.returncode != 0:
        return False
    return newfolderpath

In [None]:
# Convert one PDF to page images and measure how long the conversion takes.
start = time.time()
FOLDERpath = pdf2folderimages(directory, "doc129.pdf") #convert pdf file to images function call
tmp_time = time.time() - start  # elapsed wall-clock seconds
print(tmp_time)

In [3]:
# Average time per unit; 129 is presumably the page count of doc129.pdf — TODO confirm.
# NOTE(review): relies on tmp_time from a timing cell; run on a fresh kernel it raises NameError.
print("Average Time:\t\t", tmp_time/129)

NameError: name 'tmp_time' is not defined

In [5]:
# Directory handed to PyTessBaseAPI(path=...) inside generate_HTML (machine-specific absolute path).
pytestapi_path = "C:\\Tesseract\\Tesseract-OCR-v5\\tessdata"

### OCR SCAN
* Images to Text Conversion 
* Table Detection (table detection does happen in the Java code, but not at a fine-grained level)
     * TH, TR
* Background Detection
    * Specific Line Background
    * Word Level, Block Level & Line Level (in progress)
    
* Font Characteristics
    * Font Density (tried in the JAR, but it was inefficient, insufficient, and inaccurate)
        * Bold 
        * Light
    * Font Size
* Handwriting Detection (in progress)
    * Handwritten Model vs Digital Text Model
    * Evaluation of Metrics
    
     
      

In [9]:
def line_bgColorPreProcess(PILimage, tessAPI):
    '''
    Build an int16 RGB copy of the page in which every word box is overwritten with -1,
    so that only non-text (background) pixels keep their colour.
    @params:
        PILimage - Page Pillow image object (converted to RGB when it is in another mode)
        tessAPI - The Tesserocr api reference used to iterate over words: PyTessBaseAPI
    @return:
        NumpyArray of image with -1 on pixels where word was found
    '''
    rgb_page = PILimage if PILimage.mode == 'RGB' else PILimage.convert('RGB')
    # int16 rather than uint8 so that -1 can serve as the "text was here" marker.
    masked = np.array(rgb_page, dtype='int16')

    for word in iterate_level(tessAPI.GetIterator(), RIL.WORD):
        if not word:
            continue
        box = word.BoundingBox(RIL.WORD)
        if not box:
            continue
        left, top, right, bottom = box[0], box[1], box[2], box[3]
        masked[top:bottom, left:right, :] = -1
    return masked

In [1]:
def line_bg_detection(bbox, numBIN, NUMPYimage):
    '''
    Estimate the background colour inside a bounding box by sampling pixels that
    are not masked out (-1) and returning the most frequently sampled colour.
    @params:
        bbox - The bounding box (x1, y1, x2, y2) for which background color is to be extracted :tuple
        numBIN - The number of random samples (drawn with replacement) the decision is based on :int
        NUMPYimage - image with -1 on pixels where word was found :ndarray

    @returns:
        list of three strings [r, g, b] for the most common sampled colour (first seen
        wins on ties), or 0 (int) when bbox is empty, no unmasked pixel exists, or numBIN is 0
    '''
    if not bbox:
        return 0

    x1, y1, x2, y2 = bbox
    region = NUMPYimage[y1:y2, x1:x2]
    # Only the first channel is checked: the mask writes -1 into all three channels.
    rows, cols = np.where(region[:, :, 0] != -1)
    if rows.size == 0:
        return 0

    # Passing the population size (not a materialised range) draws the same indices
    # without building an array of every candidate pixel on each call.
    picks = np.random.choice(rows.size, numBIN)

    hist = {}
    for r, g, b in region[rows[picks], cols[picks]]:
        colour = (int(r), int(g), int(b))
        hist[colour] = hist.get(colour, 0) + 1

    if not hist:
        return 0
    # max() over a dict walks insertion order, so the first-seen colour wins ties.
    return [str(channel) for channel in max(hist, key=hist.get)]

In [7]:
def get_table_row_tags(api, bbox):
    '''
    Table tags generation code: wraps every non-empty text line that lies vertically
    inside the table bounding box in <tr> tags, all enclosed in one <table> element.
    @Params:
        api - The Tesserocr api reference to get iterator for textlines: PyTessBaseAPI
        bbox - table bounding box (x1, y1, x2, y2): tuple
    @returns:
        text with table tags appended : str
    '''
    level = RIL.TEXTLINE
    table_top, table_bottom = bbox[1], bbox[3]

    parts = ["<table>\n"]
    for rl in iterate_level(api.GetIterator(), level):
        if not rl:
            continue
        line_text = rl.GetUTF8Text(level)
        if not line_text.rstrip():  # skip empty lines
            continue
        _, c_y1, _, c_y2 = rl.BoundingBox(level)
        # NOTE(review): only the vertical extent is compared; a text line beside the
        # table at the same height would also be included.
        if c_y1 >= table_top and c_y2 <= table_bottom:
            # NOTE(review): OCR text is inserted unescaped; '<' or '&' in it will
            # produce malformed markup.
            parts.append("<tr>" + line_text + "\n</tr>\n")
        if c_y2 > table_bottom:
            # Stop at the first line ending below the table (assumes top-to-bottom order).
            break
    parts.append("</table>\n")
    return "".join(parts)

In [8]:
def sorted_nicely(l):
    """Return a new list sorted in natural order, so "page-2" comes before "page-10"."""
    def natural_key(name):
        # Split around digit runs and compare those runs as integers.
        pieces = re.split('([0-9]+)', name)
        return [int(piece) if piece.isdigit() else piece for piece in pieces]

    ordered = list(l)
    ordered.sort(key=natural_key)
    return ordered

In [10]:
def word_density_map(TessAPI):
    '''
    Collect every non-empty recognised word together with its bounding box and an
    ink-density score computed from the word's binary image.
    @params:
        TessAPI - The Tesserocr api reference to get iterator for words: PyTessBaseAPI
    @returns:
        dictionary containing word, density, bounding box and characteristic key arrays :dict
        (the lists are index-aligned; 'characteristic' starts as "" for every word)
    '''
    level = RIL.WORD
    result = {'word': [], 'density': [], 'bbox': [], 'characteristic': []}

    for r in iterate_level(TessAPI.GetIterator(), level):
        if not r:
            continue
        word_text = r.GetUTF8Text(level)
        if not word_text.rstrip():
            continue

        pixels = np.asarray(r.GetBinaryImage(level))

        result['word'].append(word_text)  # reuse the text fetched above instead of asking the OCR engine again
        result['bbox'].append(r.BoundingBox(level))
        # density = 1 - mean(pixel) / 255
        # NOTE(review): presumably assumes 0..255 pixel values with white = 255 — confirm what GetBinaryImage returns.
        result['density'].append(1 - (pixels.sum() / (pixels.size * 255)))
        result['characteristic'].append("")
    return result

In [11]:
def wordsinBlock(allWords, bbox):
    '''
    Select the words whose vertical extent lies inside the block given by bbox
    (with a 5 pixel tolerance above and below); horizontal position is not checked.
    @params:
        allWords - the dictionary of all words, returned by word_density_map function :dict
        bbox - block bounding box (x1, y1, x2, y2) :tuple
    @returns:
        dictionary containing word, density, bounding box and characteristic key arrays :dict
    '''
    margin = 5
    upper_limit = bbox[1] - margin
    lower_limit = bbox[3] + margin

    selected = {name: [] for name in allWords}

    for position, _ in enumerate(allWords['density']):
        _, word_top, _, word_bottom = allWords['bbox'][position]
        if word_top < upper_limit or word_bottom > lower_limit:
            continue
        for field in ('word', 'bbox', 'density', 'characteristic'):
            selected[field].append(allWords[field][position])
    return selected

In [12]:
def is_Bold(wordDenisty, pageWordsDensityList, percentageThreshold):
    '''
    Decide whether a word is bold by ranking its density against all densities of the page.
    @params:
        wordDenisty - density of the word under test :float
        pageWordsDensityList - list of densities from page :list
        percentageThreshold - fraction (0..1) of page words that must be strictly less dense :float
    @returns:
        True when the fraction of strictly less dense page words exceeds the threshold;
        False otherwise, including when the page list is empty :bool
    '''
    densities = np.asarray(pageWordsDensityList)
    if densities.size == 0:
        # No reference population to rank against (would otherwise divide by zero).
        return False

    percentageBelow = np.where(densities < wordDenisty)[0].size / densities.size
    return bool(percentageBelow > percentageThreshold)

In [13]:
def putBoldChar (BlockWords, allPageWords, BoldThreshold):
    '''
    Tag the words of a block as "bold" when is_Bold says so.
    Words that already carry a characteristic are left untouched.
    @params:
        BlockWords - words of one block, as returned by wordsinBlock (modified in place) :dict
        allPageWords - words of the whole page, as returned by word_density_map :dict
        BoldThreshold - threshold forwarded to is_Bold :float
    @returns:
        the same BlockWords dictionary, with 'characteristic' entries updated :dict
    '''
    for position, density in enumerate(BlockWords['density']):
        already_tagged = BlockWords['characteristic'][position]
        if not already_tagged and is_Bold(density, allPageWords['density'], BoldThreshold):
            BlockWords['characteristic'][position] = "bold"
    return BlockWords

In [14]:
def BlockWordsToText(wordsDictionaryWithChars):
    '''
    Join the words of a block into one string, wrapping bold words in <bold> tags.
    @params:
        wordsDictionaryWithChars - dictionary with index-aligned 'word' and
                                   'characteristic' lists :dict
    @returns:
        block text; bold words become " <bold> word </bold>", other words "word " :str
    '''
    pieces = []
    for index, word in enumerate(wordsDictionaryWithChars['word']):
        # Compare by value: `is "bold"` tests object identity and only works when
        # the interpreter happens to reuse the same string object.
        if wordsDictionaryWithChars['characteristic'][index] == "bold":
            pieces.append(" <bold> " + word + " </bold>")
        else:
            pieces.append(word + " ")
    # NOTE(review): words are inserted unescaped; '<' or '&' in OCR text will
    # produce malformed markup.
    return "".join(pieces)

In [15]:
def generate_HTML (FOLDERpath):
    '''
    Run OCR on every page image of a folder and assemble the tagged document text.

    Pages are processed in natural file-name order. Table blocks are emitted through
    get_table_row_tags; other text blocks become <text> elements, carrying a font_bg
    attribute when a background colour was detected and <bold> tags around bold words.
    Progress and per-page timing are printed.

    params:
    FOLDERpath --> path of directory which contains all the MR document images
                   (every entry in the folder is opened as an image)
    returns:
    HTML format string containing result of OCR scan
    '''
    BG_SAMPLE_COUNT = 20   # pixels sampled per block for background colour detection
    BOLD_PERCENTILE = 0.8  # a word is bold when denser than this fraction of page words
    non_text_blocks = [PT.UNKNOWN, PT.FLOWING_IMAGE, PT.HEADING_IMAGE, PT.PULLOUT_IMAGE,
                       PT.HORZ_LINE, PT.VERT_LINE, PT.NOISE]
    level = RIL.BLOCK  # Block based Values

    document_parts = ["<document>\n"]

    for pagecounter, filename in enumerate(sorted_nicely(os.listdir(FOLDERpath)), start=1):
        start = time.time()
        page_path = os.path.join(FOLDERpath, filename)
        page_parts = ["<page" + str(pagecounter) + ">\n"]

        # Context manager closes the image file once the page has been processed.
        with Image.open(page_path) as page:
            print("MR chart '", page_path, "' is under-process")

            with PyTessBaseAPI(path = pytestapi_path) as api:
                api.SetImage(page)
                api.Recognize()

                # Page-level preprocessing: text-masked image and per-word densities.
                NUMPYimage = line_bgColorPreProcess(page, api)
                WordsDensityDict = word_density_map(api)

                for r in iterate_level(api.GetIterator(), level):
                    if not r:
                        continue
                    block_type = r.BlockType()  # Type of that specific block

                    if block_type == PT.TABLE:
                        page_parts.append(get_table_row_tags(api, r.BoundingBox(level)))
                    elif block_type not in non_text_blocks:
                        block_bbox = r.BoundingBox(level)

                        # Background colour of the block
                        rgb_result = line_bg_detection(block_bbox, BG_SAMPLE_COUNT, NUMPYimage)

                        # Bold detection
                        wordsinThisBlock = wordsinBlock(WordsDensityDict, block_bbox)
                        wordsinThisBlock = putBoldChar(wordsinThisBlock, WordsDensityDict, BOLD_PERCENTILE)
                        blockText = BlockWordsToText(wordsinThisBlock)

                        if rgb_result:
                            page_parts.append(
                                "<text font_bg = '" + rgb_result[0] + "," + rgb_result[1] + ","
                                + rgb_result[2] + "'>\n" + blockText + "</text>\n"
                            )
                        else:
                            page_parts.append("<text>\n" + blockText + "</text>\n")

        page_parts.append("</page" + str(pagecounter) + ">\n")
        document_parts.extend(page_parts)
        print("Page Time:\t\t", time.time() - start)

    document_parts.append("</document>\n")
    return "".join(document_parts)

In [16]:
# Folder of page images to run through generate_HTML (overrides any value set by an earlier cell).
FOLDERpath = '..\\datafiles\\doc22'

In [17]:
# Run the OCR-to-markup pipeline on the selected folder and time the whole document.
COST = time.time()  # start timestamp in seconds
GeneratedText = generate_HTML(FOLDERpath)
tmp_time = time.time() - COST  # elapsed wall-clock seconds
print("Time on Document:\t", tmp_time)

MR chart ' ..\datafiles\doc22\doc22-0.png ' is under-process
Page 1 of 11 
 <bold> mae </bold> <bold> cet </bold> <bold> #: </bold>From Visit On: 04/11/16  <bold> Office </bold> <bold> Visit </bold>
 <bold> 4/11/2016 </bold> <bold> Pe </bold> <bold> | </bold> <bold> MRN: </bold> <bold> 1388862 </bold>
 <bold> Reason </bold>for Visit  <bold> Heme/Onc </bold> <bold> Care </bold>
21.38 kg/m2 94% 
 <bold> Progress </bold> <bold> Notes </bold>Sherin Fetouh,  <bold> MD </bold> <bold> at </bold>04/11/16 1040 Status: Signed Date of Service: 4/11/2016 
 <bold> Subjective: </bold>
 <bold> Past </bold> <bold> History: </bold>He has chemotherapy induced polyneuropathy due  <bold> to </bold>oxalipatin and hand and foot syndrome due  <bold> to </bold>
capecitabine.He received in addition epirubicin. He  <bold> was </bold>diagnosed with adenocarcinoma of  <bold> GE </bold>junction, T3N2 with removal of 30+ nodes and Roux  <bold> en </bold>Y surgery March 2013 Mayo Clinic. They diagnosed the hand  <bo

  "Palette images with Transparency expressed in bytes should be "


Page 6 of 11 
&/FASCIA 20 SQ CM/< [11044] 
Family  <bold> Status </bold>**None** 
Spanish Origin 
 <bold> PL </bold>(MR # 1388862) Printed at 6/6/17 12:43 PM Page 6 of 11 
Page Time:		 2.898977756500244
MR chart ' ..\datafiles\doc22\doc22-6.png ' is under-process
Page 7 of 11 
Hospitalizations  <bold> 7 </bold> <bold> as </bold> <bold> of </bold> <bold> 4/11/2016 </bold>Past Hospitalizations? 
 <bold> Additional </bold>History 
History Last reviewed in  <bold> this </bold>visit by  <bold> Erica </bold> <bold> Newton </bold> <bold> on </bold>4/11/2016  <bold> at </bold>10:44  <bold> AM </bold>Sections Reviewed Tobacco 
History Last  <bold> Reviewed </bold>by  <bold> D'Mesha </bold>Jefferson,  <bold> MA </bold> <bold> on </bold> <bold> 5/10/2016 </bold> <bold> at </bold>10:34  <bold> AM </bold>
Tobacco  <bold> use </bold>
Last reviewed 5/10/2016 
Visit Diagnoses  <bold> and </bold>Associated Orders 
Neuropathy due  <bold> to </bold>drugs (HCC)  <bold> - </bold>Primary ICD-9-CM: 357.6 ICD

In [15]:
# Average time per unit; 52 is presumably a page count from an earlier run — TODO confirm and derive it from the data.
print("Average time:\t\t", tmp_time/52)

Average time:		 2.667533966211172


In [20]:
# Dump the complete generated markup for visual inspection.
print(GeneratedText)

<document>
<page1>
<text font_bg = '255,255,255'>
Page 1 of 11

</text>
<text font_bg = '255,255,255'>
mae
cet #:

From Visit On: 04/11/16
Office Visit

</text>
<text font_bg = '244,250,255'>
4/11/2016 Pe | MRN: 1388862

</text>
<text font_bg = '255,255,255'>
Reason for Visit
Heme/Onc Care

</text>
<table>
<tr>New Patient


</tr>
<tr>Reason for Visit History

</tr>
<tr>AVS Reports


</tr>
<tr>Date/Time Report Action User


</tr>
<tr>4/11/2016 12:09 After Visit Summary Printed Michelle Norris


</tr>
<tr>PM


</tr>
<tr>Most recent update: 4/11/2016 10:44 AM by


</tr>
<tr>Vital Signs Erica Newton


</tr>
<tr>BP Pulse Temp(Src) Resp Ht Wt


</tr>
<tr>428/102 98 36.9 °C (98.4 °F) 20 185.4 cm (72.99") 73.483 kg (162 Ib)


</tr>
<tr>mmHg (Oral)


</tr>
<tr>BMI SpO2


</tr>
</table>
<text font_bg = '255,255,255'>
21.38 kg/m2 94%

</text>
<text font_bg = '255,255,255'>
Progress Notes
Sherin Fetouh, MD at 04/11/16 1040
Status: Signed
Date of Service: 4/11/2016

</text>
<text>
Subjective:

</te

In [18]:
# Persist the generated markup. UTF-8 is set explicitly because OCR text can contain
# non-ASCII characters (e.g. the degree sign) that a platform default encoding may reject.
with open('doc22-bold1.xml', 'w', encoding='utf-8') as f:
    f.write(GeneratedText)

In [3]:
# Scratch list used by the cells below to try out `del` on a list element.
drr = [1,2,3,5, 6, 7]

In [None]:
def change (dc):
    '''Unfinished scratch stub: no behaviour has been defined for this function yet.'''
    # TODO: implement or delete; `pass` keeps the cell syntactically valid.
    pass

In [4]:
# Remove the element at index 3 in place; re-running this cell deletes another element.
del drr[3]

In [5]:
# Display the list after the deletion.
drr

[1, 2, 3, 6, 7]