In [119]:
import cv2
import os
import pytesseract as pyt
from pathlib import Path
import re
import numpy as np
import imutils
import math
from matplotlib import pyplot as plt
from PIL import Image
import pytesseract


def filenum(fname):
    return int(fname[3:len(fname) - 4])

def get_thresh_files(directory):
    dirpath = directory+'/images/form_20/'
    
    files = [f for f in os.listdir(dirpath) if (os.path.isfile(os.path.join(dirpath, f)) and 'thr' in f)]
    #files.sort()
    files.sort(key=filenum)
    for x in range(0, len(files)):
        files[x] = dirpath + files[x]
    return files

def get_files(directory):
    dirpath = directory+'/images/form_20/'
    
    files = [f for f in os.listdir(dirpath) if (os.path.isfile(os.path.join(dirpath, f)) and 'Img' in f)]
    #files.sort()
    files.sort(key=filenum)
    for x in range(0, len(files)):
        files[x] = dirpath + files[x]
    return files

def pre_process_images(directory):
    f_names = get_files(directory)
    
    for i in range(0, len(f_names)):                                #len(f_names)):
        img = cv2.imread(f_names[i], cv2.IMREAD_GRAYSCALE) #Read as grayscale one channel only for easier processing
        ret,thresh1 = cv2.threshold(img, 180, 255, cv2.THRESH_BINARY) #high threshold to get more data
        inv = 255 - thresh1
        x, y, h, w = 80, 30, 700, 1100              #change these values if image usable boundaries change
        inv = inv[y:y+h, x:x+w] 
        thresh1 = thresh1[y:y+h, x:x+w]
        
        hr_len = inv.shape[1] // 50   
        hr_ker = cv2.getStructuringElement(cv2.MORPH_RECT, (hr_len, 1))
        hr_open = cv2.morphologyEx(inv, cv2.MORPH_OPEN, hr_ker)
        
        vl_ker = cv2.getStructuringElement(cv2.MORPH_RECT, (1, hr_len))
        vl_open = cv2.morphologyEx(inv, cv2.MORPH_OPEN, vl_ker)
        
        boxes = cv2.add(hr_open, vl_open)
        contours, hierarchy = cv2.findContours(boxes, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
        cnt_img = np.zeros(boxes.shape)
        
        angles = list()
        for cntr in contours:
            rect = cv2.minAreaRect(cntr)
            angles.append(int(math.floor(rect[2])))
            box = cv2.boxPoints(rect)
            box = np.int0(box)
            cv2.drawContours(cnt_img, [box], 0, 255, 2)
        
        rot_angle = max(set(angles), key = angles.count)
        if(rot_angle == 0):
            rot_angle = -90
        cnt_img = imutils.rotate(cnt_img, 90 + rot_angle)
        cnt_img = cnt_img.astype('uint8') 
        orig_img_rot = imutils.rotate(thresh1, 90 + rot_angle)
        join = cv2.subtract(orig_img_rot, cnt_img)
        cv2.imwrite(f_names[i].replace('Img', 'thj'), orig_img_rot)
        cv2.imwrite(f_names[i].replace('Img', 'cnt'), cnt_img)
        cv2.imwrite(f_names[i].replace('Img', 'thr'), join)
        
        h, w = cnt_img.shape
        
        hor_mat = np.zeros((1, w), dtype=np.uint8)
        ver_mat = np.zeros((h, 1), dtype=np.uint8)
        
        hor_img = Image.fromarray(hor_mat)
        ver_img = Image.fromarray(ver_mat)
        hor_img.save(directory+'/images/comp/hor_vc.png')
        ver_img.save(directory+'/images/comp/ver_vc.png')
        
        #os.remove(f_names[i])

def find_boxes(f_name, directory):
    
    hor = cv2.imread(directory+'/images/comp/hor_vc.png', cv2.IMREAD_GRAYSCALE)
    ver = cv2.imread(directory+'/images/comp/ver_vc.png', cv2.IMREAD_GRAYSCALE)
    neighborhood = 10
    img = cv2.imread(f_name, cv2.IMREAD_GRAYSCALE) #Read as grayscale one channel only for easier processing
    h, w = img.shape
    x_val = list()
    y_val = list()
    x_rows = list()
    y_cols = list()
    
    x_windows = [25, 170, 30]
    y_windows = [25, 75]
    
    for row in range(0, h):
        vl = np.zeros((1, w), dtype=np.uint8)
        vl = img[row:row+1, 0:w]
        diff = cv2.subtract(vl, hor)
        x_val.append(cv2.countNonZero(diff))
    
    tmp = x_val[0:x_windows[0]]
    min_x_idx = tmp.index(min(tmp))
    x_rows.append(min_x_idx)
    
    tmp = x_val[min_x_idx+3:min_x_idx+x_windows[1]]
    min_x_idx += tmp.index(min(tmp)) + 3
    x_rows.append(min_x_idx)
    
    row = min_x_idx + 1
    while row < h:
        tmp = x_val[row:row+x_windows[2]]
        min_x_idx += tmp.index(min(tmp)) + 1
        if(min(tmp) < 200 and min_x_idx - x_rows[-1] > 2):
            x_rows.append(min_x_idx)
        row = min_x_idx + 1            #don't look at the next 5 rows 
    
    for idx in range(2, len(x_rows)):
        if(x_rows[idx+1] - x_rows[idx] > 30):
            x_rows.insert(idx+1, x_rows[idx] + 25)
    
    print(x_rows)
    
    for col in range(0, w):
        vl = np.zeros((h, 1), dtype=np.uint8)
        vl = img[0:h, col:col+1]
        diff = cv2.subtract(vl, ver)
        y_val.append(cv2.countNonZero(diff))
        
    tmp = y_val[0:y_windows[0]]
    min_y_idx = tmp.index(min(tmp))
    y_cols.append(min_y_idx)
    
    col = min_y_idx + 1
    while col < w:
        if(y_val[col] < 200 and col - y_cols[-1] > 5):
            y_cols.append(col)
        col += 1
    
    print(y_cols)
    
    box = list()
        
    for row in range(0, len(x_rows)-1):
        for col in range(0, len(y_cols)-1):
            box.append([y_cols[col], x_rows[row], y_cols[col+1], x_rows[row+1]])
    
    #print(box)
    return box
        
    
if __name__ == '__main__':
    cwd = os.getcwd()
    pre_process_images(str(Path(cwd).parent.parent))
    f_names = get_thresh_files(str(Path(cwd).parent.parent))
    for i in range(1, 2):          #len(f_names)
        boxes = find_boxes(f_names[i], str(Path(cwd).parent.parent))
        values = list()
        img = cv2.imread(f_names[i], cv2.IMREAD_GRAYSCALE)
        im = Image.open(f_names[i])
        index = 1
        for index in range(0, len(boxes)):
            box = boxes[index]
            img_crop = img[box[1]:box[3], box[0]:box[2]]
            h, w = img_crop.shape
            for row in range(0, h):
                for col in range(0, 3):
                    img_crop[row, col] = 255
                    img_crop[row, w-col-1] = 255
            for col in range(0, w):
                for row in range(0, 3):
                    img_crop[row, col] = 255
                    img_crop[h-row-1, col] = 255
                    
            for row in range(0, h):
                for col in range(0, w):
                    if(img_crop[row, col] < 225):
                        img_crop[row, col] = 0
                    else:
                        img_crop[row, col] = 255
            im_crop = Image.fromarray(obj=img_crop)
            im_crop = im_crop.resize([x * 3 for x in im_crop.size], Image.NEAREST)
            im_crop.save(str(Path(cwd).parent.parent)+ '/images/tmp/box' + str(index) + '.png')
            if(index > 16):
                texts = pytesseract.image_to_string(im_crop,
                        lang='eng',
                        config='--psm 7 -c tessedit_char_whitelist=0123456789'
                        )
            else:
                im_crop = im_crop.rotate(-90, Image.NEAREST, expand = 1) 
                texts = pytesseract.image_to_string(im_crop,
                        lang='eng',
                        config='--psm 7'
                        )
            #print(str(index)+ ' : ' +texts)
            if(texts == ''):
                texts = 'NULL'
            values.append(texts.replace('|', '').strip())
            index += 1
            
        index = 0
        while index < len(values):
            print(','.join(values[index:index+16]))
            index += 16

[23, 185, 205, 230, 256, 281, 306, 331, 356, 381, 406, 431, 456, 481, 506, 532, 557, 582, 607, 632, 658]
[24, 90, 158, 226, 281, 337, 394, 461, 518, 573, 629, 702, 771, 823, 878, 954, 1039]
"Serta! Noo ot,iat,Fula Devi,Mokim Uddin,aan teishna ae,- ViayRam FE,Nic. Kannan E,Ramchandra Rajwar E,Shankar Prasad  :,Safwan Kumar  :,Satandra Mochi,Total of Valid Vote,No, of rejected votes,NOTA,Tota},ve stantusdenee
.,NULL,NULL,NULL,5,6,7,NULL,NULL,740,1,12,1,NULL,35,16
42,27,9,2,7,4,313,33,320,49,4,392,NULL,47,NULL,69
33,357,440,42,2,4,233,54,23,55,30,733,NULL,68,851,8
44,53,133,12,18,16,4,18,NULL,58,6,349,NULL,19,365,NULL
145,24,97,4,NULL,2,23,2,8,63,6,307,NULL,19,326,0
316,117,26,5,NULL,9,231,35,10,7,5,440,NULL,37,477,0
470,1,62,NULL,4,5,171,18,9,1,3,358,NULL,9,367,NULL
18,56,56,1,NULL,3,135,5,4,73,9,342,0,16,358,0
19,98,412,9,11,15,230,430,43,26,NULL,561,0,50,611,0
120,57,36,5,2,4,52,NULL,1,402,3,266,0,3,269,0
121,83,7,0,NULL,3,92,43,5,79,5,252,NULL,17,269,0
1372,94,37,5,NULL,5,347,33,49,41