## The File for generating HTML tags

In [1]:
from wand.image import Image as wi
import os; from io import BytesIO; import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import time

from tesserocr import PyTessBaseAPI, RIL, iterate_level, PT, OEM

In [2]:
# Input PDF to analyse; the path is relative to the notebook's working directory.
filepath = "../datafiles/dfile.pdf"

In [2]:
# Tesseract "tessdata" directory passed as PyTessBaseAPI(path=...) by the OCR cells below.
# NOTE(review): absolute, machine-specific Windows path - adjust it for the local install.
pytestapi_path = "C:\\Tesseract\\Tesseract-OCR-v5\\tessdata"

### pdf to Images List

In [5]:
def pdf2ImagesList(filepath, resolution=800):
    '''
    Rasterise every page of a PDF into an RGB PIL image.

    Params:
        filepath - path of the PDF file to read
        resolution - rasterisation resolution handed to Wand (default 800,
                     the value this notebook has always used)

    Returns:
        list of PIL images in 'RGB' mode, one per page, in page order
    '''
    pageImages = []   #should be a class prop

    # Wand images wrap native ImageMagick resources. The `with` blocks release
    # them deterministically instead of leaving them to the garbage collector.
    with wi(filename = filepath, resolution=resolution, background = 'white') as pdf:
        with pdf.convert("jpeg") as pdfImages:
            for img in pdfImages.sequence:
                with wi(img) as page:
                    blob = page.make_blob('jpeg')

                # `blob` is plain bytes, so the PIL image built from it stays
                # valid after the Wand page above has been closed.
                PIL_img = Image.open(BytesIO(blob))
                if PIL_img.mode != 'RGB':
                    PIL_img = PIL_img.convert('RGB')
                pageImages.append(PIL_img)

    return pageImages

In [6]:
# Rasterise the input PDF once; the cells below reuse this list of per-page PIL images.
filepageImages = pdf2ImagesList(filepath)

In [7]:
len(filepageImages)

4

### Noise calculation [per page]

In [47]:
def get_noise_val_Image(imagePIL, NoiseThreshold_ppage, BinaryThreshold = 195,
                        tessdata_path = "C:\\Tesseract\\Tesseract-OCR-v3\\tessdata"):
    '''
    Estimate how much of a page is dark once the recognised text is ignored.

    Every text line found by Tesseract is painted white on a grayscale copy
    of the page. Whatever is still darker than the binarisation threshold is
    counted as noise.

    Params:
        imagePIL - PIL image object
        NoiseThreshold_ppage - %age noise threshold for an image range [0, 1]
        BinaryThreshold - per-pixel gray level; values above it count as
                          background (default 195)
        tessdata_path - tessdata directory handed to PyTessBaseAPI. The
                        default is the path this function has always used.
                        NOTE(review): it differs from the v5 directory used
                        by the other OCR cells - confirm which is intended.

    Returns:
        ratio of dark pixels and total pixels
        boolean page acceptance result based on noise ratio
            (True when ratio > NoiseThreshold_ppage, i.e. the page is noisy)
        binary image with 0 representing noise pixels
    '''
    grayPIL = imagePIL.convert('L')
    imageNUMPY = np.array(grayPIL, dtype='uint8')

    with PyTessBaseAPI(path = tessdata_path) as api:
        api.SetImage(imagePIL)
        api.Recognize()
        level = RIL.TEXTLINE
        for r in iterate_level(api.GetIterator(), level):
            box = r.BoundingBox(level)
            if not box:
                # No box at this position (e.g. a page without text):
                # nothing to whiten, and unpacking would raise TypeError.
                continue
            x1, y1, x2, y2 = box
            imageNUMPY[y1:y2, x1:x2] = 255   # text is not noise

    # 1 = background (brighter than the threshold), 0 = dark / noise pixel
    bin_ = np.where(imageNUMPY > BinaryThreshold, 1, 0)

    pixels_num = bin_.shape[0]*bin_.shape[1]
    ratio = (pixels_num - bin_.sum())/pixels_num

    return ratio, ratio > NoiseThreshold_ppage, bin_ #returns the ratio, rejection result, binary_Image

In [48]:
def get_noise_params_PDF(PDFImages, NoiseThreshold_ppage  = 0.50):
    '''
    Flag the noisy pages of a document and report how many there are.

    Params:
        PDFImages - sequence of page images, each one accepted by
                    get_noise_val_Image
        NoiseThreshold_ppage - per-page noise threshold in [0, 1]; a page
                               whose dark-pixel ratio exceeds it is noisy

    Returns:
        fraction of pages flagged as noisy, in [0, 1] (0.0 for an empty
            document)
        list with one 0/1 flag per page, 1 meaning the page is noisy
    '''
    PAGESCOUNT = len(PDFImages)

    FORstart = time.time()

    # Element [1] of the per-page result is the rejection flag (True = noisy).
    pageNoiseFlag = [   #should be a class prop
        1 if get_noise_val_Image(pageImage, NoiseThreshold_ppage)[1] else 0
        for pageImage in PDFImages
    ]

    print("FOR EndTime taken", time.time() - FORstart)

    if PAGESCOUNT == 0:
        # Nothing to rate: report "no noisy pages" instead of dividing by zero.
        return 0.0, pageNoiseFlag

    return   sum(pageNoiseFlag)/PAGESCOUNT, pageNoiseFlag #returns the %age of pages that are noisy [0, 1]

In [68]:
# noiseratio: fraction of pages flagged as noisy; noise_bin: per-page 0/1 flags (1 = noisy).
noiseratio, noise_bin = get_noise_params_PDF(filepageImages)

FOR EndTime taken 17.110139846801758


In [69]:
print("Ratio:\t", noiseratio, "\t\tList:\t", noise_bin)

Ratio:	 0.0 		List:	 [0, 0, 0, 0]


### OCR SCAN
* Images to Text Conversion 
* Table Detection (table detection does happen in the Java code, but not at a fine-grained level)
     * TH , TR , TD
* Background Detection
    * Specific Line Background
    * Word Level, Block Level & Line Level (Under Progress)
    
* Font Characteristics
    * Font Density (tried in the JAR, but it was inefficient, insufficient and inaccurate)
        * Bold 
        * Light
    * Font Size
* Handwriting Detection (in progress)
    * Handwritten Model vs Digital Text Model
    * Evaluation of Metrics
    
     
      

In [3]:
def line_bg_detection(bbox, numBIN, NUMPYimage):
    '''
    Guess the background colour inside a bounding box by random sampling.

    Params:
        bbox - (x1, y1, x2, y2) box to inspect; a falsy value yields 0
        numBIN - number of pixels to sample (with replacement)
        NUMPYimage - rows x cols x 3 array in which pixels to be ignored
                     carry -1 in channel 0

    Returns:
        the most frequently sampled colour as a list of three strings
        [R, G, B], or 0 when no usable pixel is available
    '''
    if not bbox:
        return 0

    left, top, right, bottom = bbox

    # Coordinates (relative to the box) of every pixel that is not masked out.
    rows, cols = np.where(NUMPYimage[top:bottom, left:right, 0] != -1)
    if rows.size == 0:
        return 0

    # Draw numBIN random candidates; unused slots keep the -1 sentinel.
    samples = np.full((numBIN, 3), -1, dtype = 'int16')
    picks = np.random.choice(range(rows.shape[0]), numBIN)
    for slot, pick in enumerate(picks):
        samples[slot] = NUMPYimage[top + rows[pick], left + cols[pick], :]

    # Histogram of the sampled colours, keyed by "R_G_B".
    votes = {}
    for red, green, blue in samples:
        if red == -1:
            continue
        key = "_".join((str(red), str(green), str(blue)))
        votes[key] = votes.get(key, 0) + 1

    if not votes:
        return 0

    # On a tie the colour that was sampled first wins.
    winner = max(votes, key=votes.get)
    return winner.split("_")  ## RGB value in list

In [19]:
def get_table_row_tags(api, bbox, buffer = 15):
    '''
    Wrap the text lines that fall inside a table block in <table>/<tr> tags.

    The page is walked line by line. A line becomes a row when its vertical
    extent lies within the block's extent, widened by `buffer` pixels on both
    sides. Iteration stops at the first non-empty line that ends below that
    band. Only the vertical position is tested.

    Params:
        api - PyTessBaseAPI instance on which Recognize() has already run
        bbox - (x1, y1, x2, y2) bounding box of the table block
        buffer - vertical tolerance in pixels (default 15)

    Returns:
        string "<table>\\n ... </table>\\n" with one "<tr>...</tr>" per line

    NOTE(review): the OCR text is inserted verbatim, so characters such as
    '<' or '&' end up unescaped in the markup. The assembled markup is also
    printed, which looks like leftover debugging output.
    '''
    level = RIL.TEXTLINE
    top_limit = bbox[1] - buffer
    bottom_limit = bbox[3] + buffer

    rows = []
    for rl in iterate_level(api.GetIterator(), level):
        if not rl:
            continue

        text = rl.GetUTF8Text(level)
        if not text.rstrip():
            continue   # whitespace-only line

        line_box = rl.BoundingBox(level)
        if not line_box:
            continue   # no geometry available for this line

        c_y1, c_y2 = line_box[1], line_box[3]
        if c_y1 >= top_limit and c_y2 <= bottom_limit:
            rows.append("<tr>" + text + "\n</tr>\n")
        if c_y2 > bottom_limit:
            break      # past the table

    lines = "<table>\n" + "".join(rows) + "</table>\n"
    print(lines)
    return lines

In [52]:
filepageImages[2].show()

In [26]:
# Single pre-rendered page image, consumed by the one-page OCR cell below.
IMAGEPATH = '../datafiles/PDT-3.png'
page = Image.open(IMAGEPATH)

In [27]:
# RGB version of the page: converted when needed, otherwise an independent copy.
newpage = page.copy() if page.mode == 'RGB' else page.convert('RGB')

In [28]:
# Build the tagged document for the single image held in `page`:
#   <document><page1> ...<text>/<table> blocks... </page1></document>
FileTextBuffer = ""
FileTextBuffer += "<document>\n"

pagecounter  = 1
# (the page loop is disabled here: this cell processes exactly one image)

print(pagecounter)
thisPageText = ""
thisPageText += "<page" +str(pagecounter)+">\n"

with PyTessBaseAPI(path = pytestapi_path) as api:
    api.SetImage(page)
    api.Recognize()
    PILimage = page.copy()  # NOTE(review): not used anywhere else in this cell
    # int16 (instead of uint8) so that -1 can be stored as a "masked" marker.
    NUMPYimage = np.array(newpage, dtype='int16')

    ## Preprocessing for background detection --START
    ri = api.GetIterator()
    level = RIL.WORD
    # Walk the page word by word.
    for r in iterate_level(ri, level):
        if r:
            bbox = r.BoundingBox(level)
            if bbox: ## Mask the word's pixels with -1
                # line_bg_detection skips pixels whose first channel is -1,
                # so only non-text pixels take part in the colour vote.
                NUMPYimage[bbox[1]:bbox[3], bbox[0]:bbox[2], :] = -1
    ## Preprocessing for background detection --END

    ri = api.GetIterator()
    level = RIL.BLOCK # second pass: one iteration per layout block

    # Tables are reported as their own block type, so they can be handled separately.
    for r in iterate_level(ri, level):
        if r:
            block_type = r.BlockType() # layout type of this block
            # Echo the raw block text to the cell output.
            print(r.GetUTF8Text(level))
            if (block_type == PT.TABLE):
                # Show the binarised table crop (opens an external image viewer).
                img = r.GetBinaryImage(level)
                img.show()
                # Re-walk the text lines inside the table box and emit one <tr> per line.
                text_rows = get_table_row_tags(api, r.BoundingBox(level))
                thisPageText += text_rows
            elif(block_type not in [PT.UNKNOWN, PT.FLOWING_IMAGE, PT.HEADING_IMAGE, PT.PULLOUT_IMAGE, PT.HORZ_LINE, PT.VERT_LINE, PT.NOISE]):
                ### Background colour algorithm -START; params: (bbox, numBIN, NumpyImage)
                # Samples 20 unmasked pixels of the block; returns [R, G, B] strings or 0.
                rgb_result = line_bg_detection(r.BoundingBox(level), 20, NUMPYimage)
                ### Background colour algorithm -END
                if (rgb_result):
                    thisPageText += "<text font_bg = '" + rgb_result[0] + "," + rgb_result[1] + "," + rgb_result[2] +"'>\n" + r.GetUTF8Text(level) + "</text>\n"
                else:
                    thisPageText += "<text>\n" + r.GetUTF8Text(level) + "</text>\n"

# Close the page element, then the document.
FileTextBuffer += thisPageText + "</page" +str(pagecounter)+">\n"
pagecounter += 1

FileTextBuffer += "</document>\n"

1
PAMC PROVIDENCE ALASKA MEDICAL CENTER

3200 Providence Dr MRN: 01275812
Anchorage AK 99508-4615 PO

Adm: 12/7/2016, D/C: 12/18/2016


Procedure Notes (continued)


Procedures by Aris M Sophocles, MD at 12/7/2016 12:24 (continued)


Ventricle Size Findings Global function EF
(decrease)

Right normal none none 45-55%
moderate, Anterior

Left normal none wall severe 35-45%
hypokinesis


<table>
<tr>Ventricle Size Findings Global function EF

</tr>
<tr>(decrease)


</tr>
<tr>Right normal none none 45-55%

</tr>
<tr>moderate, Anterior


</tr>
<tr>Left normal none wall severe 35-45%

</tr>
<tr>hypokinesis


</tr>
</table>

Other Findings: Regional wall motion abnormalities


Post-Intervention Follow-up Study


Aorta Findings: No aorta changes


Global Regional
Ventricular | decreased, (worsened Anterior and apical
function apical hypokinesis) hypokinesis
Type Normal Repair Residual
function? insufficiency
Valve native no unchanged | unchanged


<table>
<tr>Global Regional

</tr>
<tr>Ventri

In [29]:
print(FileTextBuffer)

<document>
<page1>
<text>
PAMC PROVIDENCE ALASKA MEDICAL CENTER

3200 Providence Dr MRN: 01275812
Anchorage AK 99508-4615 PO

Adm: 12/7/2016, D/C: 12/18/2016

</text>
<text>
Procedure Notes (continued)

</text>
<text>
Procedures by Aris M Sophocles, MD at 12/7/2016 12:24 (continued)

</text>
<table>
<tr>Ventricle Size Findings Global function EF

</tr>
<tr>(decrease)


</tr>
<tr>Right normal none none 45-55%

</tr>
<tr>moderate, Anterior


</tr>
<tr>Left normal none wall severe 35-45%

</tr>
<tr>hypokinesis


</tr>
</table>
<text>
Other Findings: Regional wall motion abnormalities

</text>
<text>
Post-Intervention Follow-up Study

</text>
<text>
Aorta Findings: No aorta changes

</text>
<table>
<tr>Global Regional

</tr>
<tr>Ventricular | decreased, (worsened Anterior and apical

</tr>
<tr>function apical hypokinesis) hypokinesis

</tr>
<tr>Type Normal Repair Residual

</tr>
<tr>function? insufficiency

</tr>
<tr>Valve native no unchanged | unchanged


</tr>
</table>
<text>
Comments: G

In [90]:
# Write the tagged document held in FileTextBuffer to disk as UTF-8 text.
with open("./output-DOCT.xml", 'w+',encoding="utf-8") as f:
    f.write(FileTextBuffer)

In [92]:
# Build the tagged document for every page of the PDF:
#   <document><page1>...</page1><page2>...</page2>...</document>
FileTextBuffer = ""
FileTextBuffer += "<document>\n"

pagecounter  = 1
# `pdfPage` (not `page`) so the image loaded for the single-page cell is not overwritten.
for pdfPage in filepageImages:
    print(pagecounter)
    thisPageText = "<page" + str(pagecounter) + ">\n"

    with PyTessBaseAPI(path = pytestapi_path) as api:
        api.SetImage(pdfPage)
        api.Recognize()
        # int16 (instead of uint8) so that -1 can be stored as a "masked" marker.
        NUMPYimage = np.array(pdfPage, dtype='int16')

        ## Preprocessing for background detection --START
        ri = api.GetIterator()
        level = RIL.WORD
        # Walk the page word by word and mask every word box with -1.
        # line_bg_detection skips pixels whose first channel is -1, so only
        # non-text pixels take part in the colour vote.
        for r in iterate_level(ri, level):
            if r:
                bbox = r.BoundingBox(level)
                if bbox:
                    NUMPYimage[bbox[1]:bbox[3], bbox[0]:bbox[2], :] = -1
        ## Preprocessing for background detection --END

        ri = api.GetIterator()
        level = RIL.BLOCK # second pass: one iteration per layout block

        for r in iterate_level(ri, level):
            if r:
                block_type = r.BlockType() # layout type of this block
                block_text = r.GetUTF8Text(level)

                if (block_type == PT.TABLE) or (block_type == PT.FLOWING_IMAGE):
                    # This branch used to end in a bare undefined name and
                    # raised NameError on the first table block. Emit the
                    # table rows instead, as the single-page cell does.
                    # NOTE(review): FLOWING_IMAGE is kept in this branch
                    # because the original condition grouped it with TABLE -
                    # confirm that image blocks should be read as tables.
                    table_bbox = r.BoundingBox(level)
                    if table_bbox:
                        thisPageText += get_table_row_tags(api, table_bbox)
                elif((block_type != PT.UNKNOWN) and (block_text.rstrip() != "")):
                    ### Background colour algorithm -START; params: (bbox, numBIN, NumpyImage)
                    rgb_result = line_bg_detection(r.BoundingBox(level), 20, NUMPYimage)
                    ### Background colour algorithm -END
                    if (rgb_result):
                        thisPageText += "<text font_bg = '" + rgb_result[0] + "," + rgb_result[1] + "," + rgb_result[2] +"'>\n" + block_text + "</text>\n"
                    else:
                        thisPageText += "<text>\n" + block_text + "</text>\n"
    FileTextBuffer += thisPageText + "</page" +str(pagecounter)+">\n"
    pagecounter += 1

# Close the root element so the buffer is a complete document.
FileTextBuffer += "</document>\n"

1
2
3
4


In [None]:
##