In [None]:
from PIL import Image
from pprint import pprint
import re
import pyocr
import pyocr.builders
import Levenshtein as lev

In [3]:
# Ask pyocr which OCR back-ends it can find on this machine.
tools = pyocr.get_available_tools()
# Fail with an explanation instead of a bare AssertionError when nothing is
# installed; every OCR call further down goes through `tool`.
assert len(tools) > 0, "pyocr found no usable OCR tool (is Tesseract installed?)"
# Use the first tool in the returned list for the rest of the notebook.
tool = tools[0]
print(tool)

<module 'pyocr.tesseract' from '/usr/local/lib/python3.5/dist-packages/pyocr/tesseract.py'>


In [None]:
# Available sample images (file name - date, kind):
# 'IMG_20180710_170326.jpg' - 10.07.2018 general
# 'IMG_20180710_175215.jpg' - 09.07.2018 general
# 'IMG_20180714_132310.jpg' - 13.07.2018 additional

In [82]:
# Date written as DD.MM.YYYY. Raw string: `\d` and `\.` are not valid Python
# string escapes, so a plain literal triggers an invalid-escape warning on
# newer interpreters. The pattern value itself is unchanged.
DATE_REGEX = r'(\d{2}\.\d{2}\.\d{4})'
# Text that clean_signature() compares each OCR line against.
SIGNATURE_TEXT = 'UZM DR. Naile Fevziye MISIRLIOĞLU'
# A line is dropped as the signature when its Levenshtein distance to
# SIGNATURE_TEXT is at most this value.
SIGNATURE_TEXT_THRESHOLD = 4
# Tokens that remove_junk_words() strips from a line (never from position 0,
# and never when the token parses as a float).
JUNK_WORDS = ['L', 'H', '#', '%', u'Miliyon/μL', 'g/dL', 'Bin/μL', 'mEq/L']
# A word counts as junk when its Levenshtein distance to any JUNK_WORDS entry
# is at most this value.
JUNK_WORD_THRESHOLD = 2
# NOTE(review): not referenced by any code visible in this part of the notebook.
MARGIN = 10

In [87]:
def get_lines_bounding_boxes(image, lang="tur"):
    """Run line-level OCR on ``image`` with the notebook's global ``tool``.

    Args:
        image: Object with a ``convert`` method (a PIL image in this
            notebook); it is converted to mode ``'L'`` before OCR.
        lang: Language code handed to the OCR tool. Defaults to ``"tur"``,
            the value that used to be hard-coded, so existing calls behave
            the same.

    Returns:
        The result of ``tool.image_to_string`` with a ``LineBoxBuilder``.
        The rest of the notebook slices it and reads ``.content`` and
        ``.word_boxes`` on each item.
    """
    grayscale = image.convert('L')
    return tool.image_to_string(
        grayscale,
        lang=lang,
        builder=pyocr.builders.LineBoxBuilder()
    )

def is_float(text):
    """Tell whether ``float(text)`` succeeds.

    Returns True when the conversion works and False when it raises
    ``ValueError``; any other exception propagates to the caller.
    """
    try:
        float(text)
    except ValueError:
        return False
    else:
        return True

def test_for_regex(boxes, regex):
    for box in boxes:
        m = re.match(regex, box.content)
        if m:
            return box
        
def get_date(boxes):
    """Return the first box in ``boxes`` matching ``DATE_REGEX``, else None."""
    date_box = test_for_regex(boxes, DATE_REGEX)
    return date_box
  
def find_line_with_text(text, lines):
    """Locate the first line that has a word exactly equal to ``text``.

    Returns an ``(index, line)`` tuple for the first line in ``lines`` where
    some entry of ``line.word_boxes`` has ``content == text``. Falls through
    and returns None when no line qualifies.
    """
    for position, candidate in enumerate(lines):
        if any(box.content == text for box in candidate.word_boxes):
            return position, candidate

def clean_empty_lines(lines):
    """Lazily yield the lines whose ``content`` is not the empty string."""
    for candidate in lines:
        if candidate.content == '':
            continue
        yield candidate
            
def clean_signature(lines):
    """Lazily yield the lines that do not look like the signature.

    A line is skipped when the Levenshtein distance between its ``content``
    and ``SIGNATURE_TEXT`` is at most ``SIGNATURE_TEXT_THRESHOLD``.
    """
    for candidate in lines:
        distance = lev.distance(candidate.content, SIGNATURE_TEXT)
        if distance <= SIGNATURE_TEXT_THRESHOLD:
            continue
        yield candidate
            
def to_list_of_strings(lines):
    """Lazily turn each line into the list of its word boxes' ``content``."""
    for entry in lines:
        words = []
        for box in entry.word_boxes:
            words.append(box.content)
        yield words
        
def remove_junk_words(line):
    """Return ``line`` (a list of words) with junk tokens filtered out.

    A word is dropped when it is within ``JUNK_WORD_THRESHOLD`` Levenshtein
    edits of any entry in ``JUNK_WORDS``, is not the first word of the line,
    and does not parse as a float. The first word and numeric words are
    therefore always kept.
    """
    def looks_like_junk(position, token):
        # Same test order as before: distance first, then position, then float.
        return any(
            lev.distance(token, junk) <= JUNK_WORD_THRESHOLD
            and position > 0
            and not is_float(token)
            for junk in JUNK_WORDS
        )

    return [
        token
        for position, token in enumerate(line)
        if not looks_like_junk(position, token)
    ]

def join_all_except_last(line):
    """Collapse a list of words into ``[label]`` or ``[label, value]``.

    If the last word parses as a float, every word before it is joined with
    single spaces into the label and the last word is kept as the value.
    Otherwise all words are joined into one label and there is no value.

    An empty ``line`` used to raise IndexError on ``line[-1]``. It now yields
    ``['']``, which is what the no-value branch already produces for a line
    with nothing to join.

    Args:
        line: List of word strings.

    Returns:
        A new list with one or two items; ``line`` itself is not modified.
    """
    # Check emptiness first so line[-1] is never evaluated on an empty list.
    if line and is_float(line[-1]):
        return [' '.join(line[:-1]), line[-1]]
    return [' '.join(line)]

def add_zero_values(line):
    """Pad ``line`` with a float ``0.0`` when it has fewer than two items.

    The list is modified in place and the same list object is returned.
    """
    if len(line) >= 2:
        return line
    line.append(0.0)
    return line

In [92]:
# Open one of the sample photos and OCR it into line boxes.
image = Image.open('IMG_20180710_170326.jpg')
lines = get_lines_bounding_boxes(image)

In [93]:
# Find the line containing the word 'Onay' and take the first word on it that
# matches DATE_REGEX.
# NOTE: find_line_with_text returns None when nothing matches, in which case
# this unpacking raises TypeError.
_, line = find_line_with_text('Onay', lines)
date = get_date(line.word_boxes)

# `date` is a word box (or None if no word matched); print its text.
print(date.content)

10.07.2018


In [94]:
# Everything after the line containing the word 'TESTLER' is processed as the
# table of values.
index, line = find_line_with_text('TESTLER', lines)
blood_values_lines = lines[index + 1:]
# Lazy generator stages: drop empty lines, drop the line close to
# SIGNATURE_TEXT, then reduce each line box to a list of word strings.
blood_values_lines = clean_empty_lines(blood_values_lines)
blood_values_lines = clean_signature(blood_values_lines)
blood_values_lines = to_list_of_strings(blood_values_lines)
# Per-line cleanup, materialized as lists: strip junk tokens, join the label
# words while keeping a trailing numeric word separate, and append 0.0 to
# rows that ended up with fewer than two items.
blood_values_lines = [remove_junk_words(line) for line in blood_values_lines]
blood_values_lines = [join_all_except_last(line) for line in blood_values_lines]
blood_values_lines = [add_zero_values(line) for line in blood_values_lines]
# Last expression of the cell: display the resulting rows.
list(blood_values_lines)

[['RBC—Alyuvarlar', '2.99'],
 ['Hâlâ—Hemoglobin', '8.8'],
 ['_li_lÇTıljematokrit', '24.8'],
 ['_MÇX', '82.9'],
 ['MÇFL', '29.4'],
 ['MCHC', '35.5'],
 ['RDW—SD', '37.2'],
 ['RDW', '12.3'],
 ['WBC-Akyuvarlar', '5.55'],
 ['Nötrofil', '84.1'],
 ['Lenfosit', '13.7'],
 ['Monosit 1_3', 0.0],
 ['Eozinofil', '0.0'],
 ['Bazofil', '0.9'],
 ['Nötrofil', '4.67'],
 ['Lenfosit', '0.76'],
 ['Monosit', '0.07'],
 ['Eozinofil', '0.00'],
 ['Bazofil', '0.05'],
 ['PLT-Trombosit', '98'],
 ['MPV', '10.2'],
 ['PDW', '12.0'],
 ['PCT', '0.10']]