## The File for generating HTML tags

In [1]:
from wand.image import Image as wi
import os; from io import BytesIO; import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import time
import re
import pickle as pk
from nltk.tokenize import word_tokenize

from tesserocr import PyTessBaseAPI, RIL, iterate_level, PT, OEM

In [2]:
# Load the reference word collection that get_words_dict_counts tests words against.
# NOTE(review): pickle.load can execute arbitrary code -- only load a trusted file.
# NOTE(review): a comment further down describes this object as a list; if so, every
# `in` test is a linear scan and a set would presumably be much faster -- confirm the type.
with open("complete_dict.pkl", 'rb') as f:
    dictionary = pk.load(f)

In [3]:
# Directory handed to PyTessBaseAPI as its `path` argument in get_dirt_from_page.
# NOTE(review): hard-coded absolute Windows path -- adjust for the local Tesseract install.
pytestapi_path = "C:\\Tesseract\\Tesseract-OCR-v5\\tessdata"

In [4]:
# Sanity check: number of entries in the loaded dictionary object.
len(dictionary)

629526

In [5]:
def sorted_nicely(l):
    """Return the items of ``l`` as a list sorted in natural ("human") order.

    Every item is split into alternating text and digit runs; digit runs are
    compared as integers, so ``"doc-2"`` sorts before ``"doc-10"``.
    """
    def natural_key(item):
        parts = []
        for chunk in re.split('([0-9]+)', item):
            parts.append(int(chunk) if chunk.isdigit() else chunk)
        return parts

    return sorted(l, key=natural_key)

In [6]:
def word_tokenizer(text):
    """Return every maximal run of lowercase ASCII letters found in ``text``.

    Any character outside ``a``-``z`` (digits, punctuation, whitespace,
    uppercase letters) acts as a separator and is dropped.
    """
    return [match.group() for match in re.finditer("[a-z]+", text)]

In [7]:
def get_words_dict_counts(tess_word):
    '''
    Count the tokens of a text and how many of them are missing from the dictionary.

    params:
    tess_word --> text to analyse; it is split into tokens with word_tokenizer
    returns:
    - number of tokens found
    - number of tokens that are not in the global `dictionary`
    - the tokens as one string, each token followed by a newline
    '''
    tokens = word_tokenizer(tess_word)

    # `dictionary` is the module-level word collection loaded from the pickle file.
    unknown = sum(1 for token in tokens if token not in dictionary)
    joined = "".join(token + "\n" for token in tokens)

    return len(tokens), unknown, joined

In [8]:
def get_dirt_from_page(filepath):
    '''
    Run OCR on one page image and check its words against the dictionary.

    params:
    filepath --> path of the page image to process
    returns:
    - number of words on the page that are not in the dictionary
    - total number of words on the page
    - the words of the page as one string, one word per line
    '''
    tmpTotal_words = 0
    tmpNot_in_dict = 0
    page_words = []  # per-word chunks, joined once at the end
    DICT_time = 0

    # Opening the image through a context manager closes the file handle
    # once recognition is finished instead of leaving it open.
    with Image.open(filepath) as page:  # read image in PIL format
        print("MR chart '", filepath, "' is under-process")

        with PyTessBaseAPI(path = pytestapi_path) as api:
            api.SetImage(page)
            api.Recognize()
            ri = api.GetIterator()
            level = RIL.WORD

            # Word by word iterator
            for r in iterate_level(ri, level):
                if r:
                    s = time.time()
                    # Fetch the recognised text once and reuse it for both the
                    # emptiness check and the dictionary lookup.
                    tmp = r.GetUTF8Text(level)
                    if tmp.rstrip():
                        w_c, wd_c, w = get_words_dict_counts(tmp.lower())
                        tmpTotal_words += w_c
                        tmpNot_in_dict += wd_c
                        page_words.append(w)
                    DICT_time += time.time() - s

    print("Dictionary check time:\t", DICT_time)
    return tmpNot_in_dict, tmpTotal_words, "".join(page_words)

In [9]:
def generate_dirt_params(FOLDERpath):
    '''
    Process every page image of a document folder and collect dictionary statistics.

    Files are visited in natural order (see sorted_nicely) and numbered from 1.

    params:
    FOLDERpath --> path of the directory which contains all the MR document images
    returns:
    - list with, per page, the number of words not in the dictionary
    - list with, per page, the total number of words
    - the words of all pages as one string, wrapped in <document>/<pageN> tags
    - ratio of dirt on the overall file (0.0 when no words were found at all)
    - list of dirt ratio per page (0.0 for a page without words)
    '''
    page_chunks = ["<document>\n"]
    total_words = []
    not_in_dict = []

    for pagecounter, filename in enumerate(sorted_nicely(os.listdir(FOLDERpath)), start=1):
        print(pagecounter)
        start = time.time()
        # os.path.join picks the right separator for the current platform.
        tmpNot_in_dict, tmpTotal_words, tmpWords = get_dirt_from_page(os.path.join(FOLDERpath, filename))
        print("Page Time:\t\t", time.time() - start)

        page_chunks.append("<page" + str(pagecounter) + ">\n")
        page_chunks.append(tmpWords)
        # Closing tag uses a forward slash, matching "</document>".
        page_chunks.append("</page" + str(pagecounter) + ">\n")

        total_words.append(tmpTotal_words)
        not_in_dict.append(tmpNot_in_dict)

    page_chunks.append("</document>\n")

    # A page (or a whole folder) without any recognised word would otherwise
    # raise ZeroDivisionError; report it as having no dirt.
    ratio_ppage = [n / c if c else 0.0 for n, c in zip(not_in_dict, total_words)]
    all_words = sum(total_words)
    overall_ratio = sum(not_in_dict) / all_words if all_words else 0.0

    return not_in_dict, total_words, "".join(page_chunks), overall_ratio, ratio_ppage

In [10]:
# Folder holding the page images of the document to analyse (one image file per page).
FOLDERpath = '..\\datafiles\\doc22'

In [12]:
# Time one full pass over the document folder.
COST = time.time()
# Returned in order: per-page not-in-dictionary counts, per-page word counts,
# the tagged word listing, the overall ratio, and the per-page ratios.
not_, total, docWords, ratio, ppage_ratio = generate_dirt_params(FOLDERpath)
tmp_time = time.time() - COST  # elapsed wall-clock seconds for the whole document
print("Time on Document:\t", tmp_time)

1
MR chart ' ..\datafiles\doc22\doc22-0.png ' is under-process
Dictionary check time:	 1.2586917877197266
Page Time:		 4.748321056365967
2
MR chart ' ..\datafiles\doc22\doc22-1.png ' is under-process
Dictionary check time:	 0.38700079917907715
Page Time:		 2.266535997390747
3
MR chart ' ..\datafiles\doc22\doc22-2.png ' is under-process
Dictionary check time:	 1.223982810974121
Page Time:		 4.62907862663269
4
MR chart ' ..\datafiles\doc22\doc22-3.png ' is under-process
Dictionary check time:	 0.7300629615783691
Page Time:		 3.6441729068756104
5
MR chart ' ..\datafiles\doc22\doc22-4.png ' is under-process
Dictionary check time:	 0.7539982795715332
Page Time:		 3.4940624237060547
6
MR chart ' ..\datafiles\doc22\doc22-5.png ' is under-process
Dictionary check time:	 0.5434832572937012
Page Time:		 2.8138577938079834
7
MR chart ' ..\datafiles\doc22\doc22-6.png ' is under-process
Dictionary check time:	 0.4548206329345703
Page Time:		 2.0543839931488037
8
MR chart ' ..\datafiles\doc22\doc22-

In [15]:
# Average over the pages that were actually processed (one entry per page in
# `total`) instead of a hard-coded page count.
print("Average time:\t\t", tmp_time/len(total))

Average time:		 2.667533966211172


In [45]:
# NOTE(review): GeneratedText is not assigned in any cell visible here -- presumably it
# comes from a cell that was removed or lives elsewhere; confirm before a fresh kernel run.
print(GeneratedText)

<document>
<page1>
<text font_bg = '255,255,255'>
Page 1 of 11

</text>
<text font_bg = '255,255,255'>
mae
cet #:

From Visit On: 04/11/16
Office Visit

</text>
<text font_bg = '244,250,255'>
4/11/2016 Pe | MRN: 1388862

</text>
<text font_bg = '255,255,255'>
Reason for Visit
Heme/Onc Care

</text>
<table>
<tr>New Patient


</tr>
<tr>Reason for Visit History

</tr>
<tr>AVS Reports


</tr>
<tr>Date/Time Report Action User


</tr>
<tr>4/11/2016 12:09 After Visit Summary Printed Michelle Norris


</tr>
<tr>PM


</tr>
<tr>Most recent update: 4/11/2016 10:44 AM by


</tr>
<tr>Vital Signs Erica Newton


</tr>
<tr>BP Pulse Temp(Src) Resp Ht Wt


</tr>
<tr>428/102 98 36.9 °C (98.4 °F) 20 185.4 cm (72.99") 73.483 kg (162 Ib)


</tr>
<tr>mmHg (Oral)


</tr>
<tr>BMI SpO2


</tr>
</table>
<text font_bg = '255,255,255'>
21.38 kg/m2 94%

</text>
<text font_bg = '255,255,255'>
Progress Notes
Sherin Fetouh, MD at 04/11/16 1040
Status: Signed
Date of Service: 4/11/2016

</text>
<text>
Subjective:

</te

In [46]:
with open('doc22-text8.xml', 'w+') as f:
    f.write(GeneratedText)

In [17]:
# Save the tagged per-page word listing returned by generate_dirt_params (docWords).
with open('doc22-words-3.xml', 'w+') as f:
    f.write(docWords)