In [85]:
# Setting up by importing all the relevant libraries
import os
import cv2
import numpy as np
import pandas as pd
from scipy import misc
import pytesseract
from PIL import Image
import csv

In [10]:
# Setting parameters
# Directory that holds the sample form images. Set the SAMPLE_FORMS_DIR
# environment variable to point the notebook at another location; when it is
# unset the original local path is used, so existing runs behave as before.
# NOTE: the name `dir` shadows the Python builtin but is kept because the
# other cells of this notebook reference it.
dir = os.environ.get(
    'SAMPLE_FORMS_DIR',
    r'C:\Users\ankuarora\Desktop\Client\2017_05_CogEx\2017_07_R2Implementation\Sprint 7\1CreateTrainData\SampleForms',
)

In [53]:
# Load every supported image in the directory as a list of
# (filename, grayscale array) tuples.
# Only ".jpeg" files are enabled at the moment; other candidate extensions
# considered so far are ".jpg", ".gif" and ".png".
imgs = []
valid_images = [".jpeg"]
# Sort the listing so images are processed in a reproducible order
# (os.listdir returns entries in arbitrary order).
for fname in sorted(os.listdir(dir)):
    ext = os.path.splitext(fname)[1]
    if ext.lower() not in valid_images:
        continue
    gray = cv2.imread(os.path.join(dir, fname), cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None instead of raising;
    # skip such files here so later processing never receives None.
    if gray is None:
        print("Skipping unreadable image '" + fname + "'")
        continue
    imgs.append((fname, gray))

In [61]:
# Columns of the initial feature table, one row per extracted slice:
# the slice's own size, its position counted from the top and from the bottom,
# the sizes of the previous and next slices, and the OCR text.
feature_list = [
    'width',
    'height',
    'seq_from_top',
    'seq_from_bottom',
    'width_prv',
    'height_prv',
    'width_nxt',
    'height_nxt',
    'text',
]

In [83]:
# Loop through all images and slice the documents into text lines.
# Also, create the initial feature space (one row per slice, indexed by the
# slice's file name).

# Tunable constants of the line-segmentation heuristics.
CROP_MARGIN = 10        # white margin (px) kept around the page content
BLUR_KERNEL = (91, 9)   # (width, height) of the box blur that smears text into bands
INK_LEVEL = 128         # pixel values below this count as ink
SLICE_PAD = 2           # padding (px) added around every text-line slice


def _crop_to_content(binary, margin=CROP_MARGIN):
    """Crop a binarised page to its non-white pixels plus ``margin`` pixels.

    Returns the cropped view, or None when the page has no non-white pixel.
    """
    ink_rows, ink_cols = np.where(binary < 255)
    if ink_rows.size == 0:
        return None
    top = max(int(ink_rows.min()) - margin, 0)
    bottom = min(int(ink_rows.max()) + 1 + margin, binary.shape[0])
    left = max(int(ink_cols.min()) - margin, 0)
    right = min(int(ink_cols.max()) + 1 + margin, binary.shape[1])
    return binary[top:bottom, left:right]


def _find_text_rows(page):
    """Return inclusive ``[start_row, end_row]`` pairs, one per text band.

    The page is blurred horizontally, rows whose mean brightness is at or
    below the median are marked as text rows, small gaps between marked rows
    are filled in, and the remaining runs of consecutive rows become bands.
    """
    if page.shape[0] == 0 or page.shape[1] == 0:
        return []
    blur = cv2.blur(page, BLUR_KERNEL)
    row_brightness = np.mean(blur, axis=1) / 255
    text_rows = np.where(row_brightness <= np.percentile(row_brightness, 50))[0]
    if text_rows.size == 0:
        return []

    # Fill gaps that are no larger than the median line spacing so that rows
    # belonging to the same section are merged into one band.
    steps = np.diff(text_rows)
    gap_at = np.where(steps > 1)[0]
    if gap_at.size:
        # steps - 1 is the number of unmarked rows inside each gap.
        linspace_limit = np.median(steps[gap_at] - 1)
        # The comparison uses the row distance (not distance - 1), exactly as
        # the heuristic was originally written.
        fillers = [np.arange(text_rows[k] + 1, text_rows[k + 1])
                   for k in gap_at if steps[k] <= linspace_limit]
        text_rows = np.sort(np.concatenate([text_rows] + fillers))

    # Split into runs of consecutive rows. Every gap is considered, including
    # one directly in front of the very last row.
    gap_at = np.where(np.diff(text_rows) > 1)[0]
    starts = np.concatenate(([text_rows[0]], text_rows[gap_at + 1]))
    ends = np.concatenate((text_rows[gap_at], [text_rows[-1]]))
    return [[int(s), int(e)] for s, e in zip(starts, ends)]


def _find_text_boxes(page, row_ranges):
    """Return ``(y0, y1, x0, x1)`` inclusive boxes for the given row ranges.

    x0/x1 are the first and last columns that contain ink inside the band.
    Bands without any ink are dropped.
    """
    boxes = []
    for y0, y1 in row_ranges:
        # y1 is inclusive, hence the +1; this also keeps one-row bands non-empty.
        band = page[y0:y1 + 1, :]
        ink_cols = np.where(band.min(axis=0) < INK_LEVEL)[0]
        if ink_cols.size == 0:
            continue
        boxes.append((y0, y1, int(ink_cols[0]), int(ink_cols[-1])))
    return boxes


# Slices are written to a "slices" sub-folder of the input directory.
slices_dir = os.path.join(dir, "slices")
os.makedirs(slices_dir, exist_ok=True)

# Rows are collected in plain lists and turned into a DataFrame once at the
# end, instead of growing the DataFrame one row at a time.
records = []
slice_names = []
for cnt, (img_name, gray) in enumerate(imgs, start=1):
    print (str(cnt) + ". Parsing '" + img_name + "' ...")
    if gray is None:
        print ("   image could not be read, skipping")
        continue

    # Preprocess the image (Otsu binarisation)
    image = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    # Remove the white borders, if any
    image_sliced = _crop_to_content(image)
    if image_sliced is None:
        print ("   no content found, skipping")
        continue
    page_h, page_w = image_sliced.shape[:2]

    # Locate the text lines: vertical extent first, then horizontal extent
    boxes = _find_text_boxes(image_sliced, _find_text_rows(image_sliced))

    # Relative size of every slice (including its padding), used both for the
    # slice itself and as the prev/next features of its neighbours.
    widths = [(x1 - x0 + 2 * SLICE_PAD) / page_w for (_, _, x0, x1) in boxes]
    heights = [(y1 - y0 + 2 * SLICE_PAD) / page_h for (y0, y1, _, _) in boxes]

    # Slice the document based on the coordinates and perform OCR.
    # Also, create basic features for use in learning.
    stem, ext = os.path.splitext(img_name)
    last = len(boxes) - 1
    for i, (y0, y1, x0, x1) in enumerate(boxes):
        # Clamp against the cropped page, which is the array being sliced.
        slc = image_sliced[max(y0 - SLICE_PAD, 0):min(y1 + SLICE_PAD, page_h),
                           max(x0 - SLICE_PAD, 0):min(x1 + SLICE_PAD, page_w)]
        filename = stem + '_slice' + str(i + 1) + ext
        slice_path = os.path.join(slices_dir, filename)
        # scipy.misc.imsave no longer exists in current SciPy; save with PIL.
        Image.fromarray(slc).save(slice_path)
        with Image.open(slice_path) as slice_img:
            ocr_txt = pytesseract.image_to_string(slice_img)
        records.append([widths[i],
                        heights[i],
                        i + 1,
                        len(boxes) - i,
                        widths[i - 1] if i else 0,
                        heights[i - 1] if i else 0,
                        widths[i + 1] if i < last else 0,
                        heights[i + 1] if i < last else 0,
                        ocr_txt.lower().replace('\n', ' ')])
        slice_names.append(filename)

features = pd.DataFrame(records, columns=feature_list,
                        index=pd.Index(slice_names, name="filename"))

1. Parsing 'College-Recommendation-Letter-Sample.jpeg' ...


In [75]:
# Display the feature table for a quick visual check of the extracted values
features

Unnamed: 0_level_0,width,height,seq_from_top,seq_from_bottom,width_prv,height_prv,width_nxt,height_nxt,text
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
College-Recommendation-Letter-Sample_slice1.jpeg,0.347921,0.023504,1.0,16.0,0.0,0.0,0.956236,0.036325,
College-Recommendation-Letter-Sample_slice2.jpeg,0.956236,0.036325,2.0,15.0,0.347921,0.023504,0.958425,0.070513,"i am pleased to recommend anita school, who ha..."
College-Recommendation-Letter-Sample_slice3.jpeg,0.958425,0.070513,3.0,14.0,0.956236,0.036325,0.956236,0.036325,in my math class for the past three years. dur...
College-Recommendation-Letter-Sample_slice4.jpeg,0.956236,0.036325,4.0,13.0,0.958425,0.070513,0.95186,0.036325,anita is a rare type of stident who combines e...
College-Recommendation-Letter-Sample_slice5.jpeg,0.95186,0.036325,5.0,12.0,0.956236,0.036325,0.954048,0.070513,ability with a wilinaness and eacemess to leam...
College-Recommendation-Letter-Sample_slice6.jpeg,0.954048,0.070513,6.0,11.0,0.95186,0.036325,0.954048,0.036325,help her peers with dificult mathematical conc...
College-Recommendation-Letter-Sample_slice7.jpeg,0.954048,0.036325,7.0,10.0,0.954048,0.070513,0.954048,0.036325,avare of her natural ability in the area of ma...
College-Recommendation-Letter-Sample_slice8.jpeg,0.954048,0.036325,8.0,9.0,0.954048,0.036325,0.956236,0.036325,constantly challencing herself she is part of ...
College-Recommendation-Letter-Sample_slice9.jpeg,0.956236,0.036325,9.0,8.0,0.954048,0.036325,0.956236,0.036325,"team, which - challenges: local - students in"
College-Recommendation-Letter-Sample_slice10.jpeg,0.956236,0.036325,10.0,7.0,0.956236,0.036325,0.956236,0.036325,mathematical competitions. she is also active ...


In [71]:
# Add an empty 'type' column to the feature table.
# NOTE(review): presumably the label to be filled in during manual annotation — confirm.
features['type'] = ""

In [87]:
# NOTE(review): interactive IPython help lookup (trailing `?`) left over from
# development. It only shows the to_csv docstring, is not valid plain Python,
# and can be deleted before the notebook is shared.
pd.DataFrame.to_csv?

In [84]:
# Dump the feature table as CSV (written to the current working directory)
# for manual annotation
features.to_csv('features_raw.csv')

# The file written above is a CSV, so the confirmation message says so.
print ("Slice list & features csv created successfully")

Slice list & features json created successfully
