## The File for generating HTML tags

In [0]:
from wand.image import Image as wi
import os; from io import BytesIO; import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import time
import re
import pickle as pk
from nltk.tokenize import word_tokenize

from tesserocr import PyTessBaseAPI, RIL, iterate_level, PT, OEM

In [0]:
# Load the reference vocabulary that the in-dictionary checks below run against.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
with open("complete_dict.pkl", mode='rb') as dict_file:
    dictionary = pk.load(dict_file)

In [0]:
# Sanity check: number of entries in the loaded dictionary.
len(dictionary)

629526

In [0]:
# Folder handed to PyTessBaseAPI(path=...) when pages are OCR'd.
# NOTE(review): machine-specific absolute Windows path; adjust to the local install.
pytestapi_path = r"C:\Tesseract\Tesseract-OCR-v5\tessdata"

In [0]:
def sorted_nicely(l):
    """Return the items of ``l`` sorted in natural ("human") order.

    Every string is split into alternating non-digit and digit runs; the digit
    runs are compared as integers, so ``"page2"`` comes before ``"page10"``.
    """
    def natural_key(name):
        # re.split with a capturing group keeps the digit runs in the result.
        parts = []
        for chunk in re.split('([0-9]+)', name):
            parts.append(int(chunk) if chunk.isdigit() else chunk)
        return parts

    return sorted(l, key=natural_key)

In [0]:
def word_tokenizer(text):
    """Return every maximal run of lowercase ASCII letters found in ``text``.

    Any character outside ``a-z`` (digits, punctuation, whitespace, uppercase
    letters) acts as a separator and is dropped.
    """
    return [match.group() for match in re.finditer("[a-z]+", text)]

In [0]:
def get_words_dict_counts(tess_word):
    '''
    Count the tokens in a piece of text and how many of them are unknown.

    params:
    tess_word --> text to analyse; it is split with word_tokenizer
    returns:
    - number of tokens found
    - number of tokens that are not in the global `dictionary`
    - the tokens as one string, each token terminated by a newline
    '''
    tokens = word_tokenizer(tess_word)

    # `dictionary` is the module-level word collection loaded from the pickle file.
    unknown = sum(1 for token in tokens if token not in dictionary)
    joined = "".join(token + "\n" for token in tokens)

    return len(tokens), unknown, joined

In [0]:
def get_dirt_from_page(filepath):
    '''
    Run OCR on one page image and count the words missing from the dictionary.

    params:
    filepath --> path of the page image to process
    returns:
    - number of words on the page that are not in the dictionary
    - total number of words on the page
    - the words found on the page as one string, one word per line
    '''
    tmpTotal_words = 0
    tmpNot_in_dict = 0
    page_words = []
    DICT_time = 0

    # Open the image in a `with` block so its file handle is closed once the
    # page has been processed (the image was previously never closed).
    with Image.open(filepath) as page:
        print("MR chart '", filepath, "' is under-process")

        with PyTessBaseAPI(path = pytestapi_path) as api:
            api.SetImage(page)
            api.Recognize()
            ri = api.GetIterator()
            level = RIL.WORD

            # Word by word iterator
            for r in iterate_level(ri, level):
                if not r:
                    continue
                s = time.time()
                # Fetch the recognised text once and reuse it for the check below.
                tmp = r.GetUTF8Text(level)
                if tmp.rstrip():
                    w_c, wd_c, w = get_words_dict_counts(tmp.lower())
                    tmpTotal_words += w_c
                    tmpNot_in_dict += wd_c
                    page_words.append(w)
                # Accumulated time covers text retrieval plus the dictionary lookup.
                DICT_time += time.time() - s

    print("Dictionary check time:\t", DICT_time)
    return tmpNot_in_dict, tmpTotal_words, "".join(page_words)

In [0]:
def generate_dirt_params (FOLDERpath):
    '''
    Process every page image in a folder and collect the "dirt" statistics,
    i.e. how many recognised words are not in the dictionary.

    params:
    FOLDERpath --> path of directory which contains all the MR document images
    returns:
    - list with the number of words not in the dictionary, one entry per page
    - list with the total number of words, one entry per page
    - the words of all pages as one string, wrapped in <document> / <pageN> tags
    - ratio of dirt on the overall file (0.0 when no words were found at all)
    - list of dirt ratio per page (0.0 for a page without any words)
    '''
    doc_parts = ["<document>\n"]
    total_words = []
    not_in_dict = []

    # Pages are visited in natural order so that page 2 comes before page 10.
    for pagecounter, filename in enumerate(sorted_nicely(os.listdir(FOLDERpath)), start=1):
        print(pagecounter)
        start = time.time()
        # os.path.join picks the right separator instead of a hard-coded "\\".
        tmpNot_in_dict, tmpTotal_words, tmpWords = get_dirt_from_page(os.path.join(FOLDERpath, filename))
        print("Page Time:\t\t", time.time() - start)

        doc_parts.append("<page" + str(pagecounter) + ">\n")
        doc_parts.append(tmpWords)
        # The closing tag used to be written as "<\page", which is both an invalid
        # escape sequence and inconsistent with the "</document>" closing tag.
        doc_parts.append("</page" + str(pagecounter) + ">\n")

        total_words.append(tmpTotal_words)
        not_in_dict.append(tmpNot_in_dict)

    doc_parts.append("</document>\n")
    WordsofPages = "".join(doc_parts)

    # A page (or a whole document) without recognised words has no dirt;
    # report 0.0 instead of raising ZeroDivisionError.
    ratio_ppage = [n / c if c else 0.0 for n, c in zip(not_in_dict, total_words)]
    words_in_doc = sum(total_words)
    overall_ratio = sum(not_in_dict) / words_in_doc if words_in_doc else 0.0

    return not_in_dict, total_words, WordsofPages, overall_ratio, ratio_ppage

In [0]:
# Folder holding the page images of the document to analyse.
FOLDERpath = r'..\datafiles\doc22'

In [0]:
# Time a full run over the document: every page in FOLDERpath is OCR'd and checked.
COST = time.time()
# Unpacked in order: per-page counts of words not in the dictionary, per-page word
# counts, the tagged word dump, the overall ratio, and the per-page ratios.
not_, total, docWords, ratio, ppage_ratio = generate_dirt_params(FOLDERpath)
tmp_time = time.time() - COST
print("Time on Document:\t", tmp_time)

1
MR chart ' ..\datafiles\doc22\doc22-0.png ' is under-process
Dictionary check time:	 1.2311134338378906
Page Time:		 4.5507612228393555
2
MR chart ' ..\datafiles\doc22\doc22-1.png ' is under-process
Dictionary check time:	 0.43228983879089355
Page Time:		 2.5583908557891846
3
MR chart ' ..\datafiles\doc22\doc22-2.png ' is under-process
Dictionary check time:	 1.1835675239562988
Page Time:		 5.013810634613037
4
MR chart ' ..\datafiles\doc22\doc22-3.png ' is under-process
Dictionary check time:	 0.7110967636108398
Page Time:		 3.5064847469329834
5
MR chart ' ..\datafiles\doc22\doc22-4.png ' is under-process
Dictionary check time:	 0.7306394577026367
Page Time:		 3.406902313232422
6
MR chart ' ..\datafiles\doc22\doc22-5.png ' is under-process
Dictionary check time:	 0.5166163444519043
Page Time:		 2.4881813526153564
7
MR chart ' ..\datafiles\doc22\doc22-6.png ' is under-process
Dictionary check time:	 0.44617629051208496
Page Time:		 2.0833702087402344
8
MR chart ' ..\datafiles\doc22\do

In [0]:
# Average per page: divide by the number of pages actually processed rather than
# a hard-coded page count; max(..., 1) avoids dividing by zero for an empty folder.
print("Average time:\t\t", tmp_time / max(len(total), 1))

Average time:		 3.051985903219743
