In [6]:
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
from pdf2image import convert_from_path

import glob
import os
import shutil

def pdf_to_img(dir, filepath):
    # Get file name
    # fname = os.path.split(filepath)[1]

    # Create a temp folder to store extracted images
    try: 
        os.mkdir(dir[0] + '/temp')
    except OSError as e:
        print("Error: %s" % (e.strerror))

    # Convert PDF pages into images            
    images = convert_from_path(filepath)
    for i in range(len(images)):
        images[i].save(dir[0] + '/' + 'temp/' + dir[1] + '_' + str(i).zfill(4) +'.jpg', 'JPEG')
    
    return glob.glob(dir[0] + '/' + 'temp/*.jpg')
def remove_temp_folder(dir):
    try:
        shutil.rmtree(dir + '/temp')
    except OSError as e:
        print("Error : %s" % (e.strerror))
def sort_contours(cnts, method="left-to-right"):
    # initialize the reverse flag and sort index
    reverse = False
    i = 0
    # handle if we need to sort in reverse
    if method == "right-to-left" or method == "bottom-to-top":
        reverse = True
    # handle if we are sorting against the y-coordinate rather than
    # the x-coordinate of the bounding box
    if method == "top-to-bottom" or method == "bottom-to-top":
        i = 1
    # construct the list of bounding boxes and sort them from top to
    # bottom
    boundingBoxes = [cv2.boundingRect(c) for c in cnts]
    (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
    key=lambda b:b[1][i], reverse=reverse))
    # return the list of sorted contours and bounding boxes
    return (cnts, boundingBoxes)
def detect_lines(img):
    #thresholding the image to a binary image
    thresh,img_bin = cv2.threshold(img,128,255,cv2.THRESH_BINARY |cv2.THRESH_OTSU)
    #inverting the image 
    img_bin = 255-img_bin
    # Length(width) of kernel as 100th of total width
    kernel_len = np.array(img).shape[1]//100
    # Defining a vertical kernel to detect all vertical lines of image 
    ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
    # Defining a horizontal kernel to detect all horizontal lines of image
    hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
    # A kernel of 2x2
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    #Use vertical kernel to detect and save the vertical lines in a jpg
    image_1 = cv2.erode(img_bin, ver_kernel, iterations=3)
    vertical_lines = cv2.dilate(image_1, ver_kernel, iterations=3)
    #Use horizontal kernel to detect and save the horizontal lines in a jpg
    image_2 = cv2.erode(img_bin, hor_kernel, iterations=3)
    horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=3)
    # Combine horizontal and vertical lines in a new third image, with both having same weight.
    img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    #Eroding and thesholding the image
    img_vh = cv2.erode(~img_vh, kernel, iterations=2)
    thresh, img_vh = cv2.threshold(img_vh,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    bitxor = cv2.bitwise_xor(img,img_vh)
    bitnot = cv2.bitwise_not(bitxor)
    # Detect contours for following box detection
    contours, hierarchy = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)  
    # Sort all the contours by top to bottom.
    contours, boundingBoxes = sort_contours(contours, method='top-to-bottom')
    return bitnot, contours, boundingBoxes
def get_cells(img, contours, boundingBoxes):
    #Creating a list of heights for all detected boxes
    heights = [boundingBoxes[i][3] for i in range(len(boundingBoxes))]
    #Get mean of heights
    mean = np.mean(heights)
    #Create list box to store all boxes in  
    box = []
    # Get position (x,y), width and height for every contour and show the contour on image
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        if (w<1000 and h<500):
            image = cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
            box.append([x,y,w,h])
    #Creating two lists to define row and column in which cell is located
    row=[]
    column=[]
    j=0
    #Sorting the boxes to their respective row and column
    for i in range(len(box)):
        if(i==0):
            column.append(box[i])
            previous=box[i]
        else:
            if(box[i][1]<=previous[1]+mean/2):
                column.append(box[i])
                previous=box[i]
                if(i==len(box)-1):
                    row.append(column)
            else:
                row.append(column)
                column=[]
                previous = box[i]
                column.append(box[i])
    #calculating maximum number of cells
    countcol = 0
    for i in range(len(row)):
        countcol = len(row[i])
        if countcol > countcol:
            countcol = countcol

    #Retrieving the center of each column
    center = [int(row[i][j][0]+row[i][j][2]/2) for j in range(len(row[i])) if row[0]]
    center=np.array(center)
    center.sort()
    #Regarding the distance to the columns center, the boxes are arranged in respective order
    finalboxes = []
    for i in range(len(row)):
        lis=[]
        for k in range(countcol):
            lis.append([])
        for j in range(len(row[i])):
            diff = abs(center-(row[i][j][0]+row[i][j][2]/4))
            minimum = min(diff)
            indexing = list(diff).index(minimum)
            lis[indexing].append(row[i][j])
        finalboxes.append(lis)
    return row, countcol, finalboxes
def extract_text(dir, bitnot, row, countcol, finalboxes):
    #from every single image-based cell/box the strings are extracted via pytesseract and stored in a list
    outer=[]
    for i in range(len(finalboxes)):
        for j in range(len(finalboxes[i])):
            inner=''
            if(len(finalboxes[i][j])==0):
                outer.append(' ')
            else:
                for k in range(len(finalboxes[i][j])):
                    y,x,w,h = finalboxes[i][j][k][0],finalboxes[i][j][k][1], finalboxes[i][j][k][2],finalboxes[i][j][k][3]
                    finalimg = bitnot[x:x+h, y:y+w]
                    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))
                    border = cv2.copyMakeBorder(finalimg,2,2,2,2,   cv2.BORDER_CONSTANT,value=[255,255])
                    resizing = cv2.resize(border, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
                    dilation = cv2.dilate(resizing, kernel,iterations=1)
                    erosion = cv2.erode(dilation, kernel,iterations=1)

                    out = pytesseract.image_to_string(erosion)
                    if(len(out)==0):
                        out = pytesseract.image_to_string(erosion, config='--psm 3')
                    inner = inner +" "+ out.replace('\x0c', '')
                outer.append(inner)
    print(outer)

    #Creating a dataframe of the generated OCR list
    arr = np.array(outer)
    dataframe = pd.DataFrame(arr.reshape(len(row),countcol))
    print(dataframe)
    data = dataframe.style.set_properties(align="left")
    #Converting it in a excel-file
    data.to_excel(dir[0] + '/output.xlsx', encoding = 'utf-8')

    
def main_convert(lspaths):    


    for pdffile in lspaths:
        dir = os.path.split(pdffile)
        print(dir)
        images = pdf_to_img(dir, pdffile)
        
        for img in images:
            #read your file
            img = cv2.imread(img,0)
            img.shape
            bitnot, contours, boundingBoxes = detect_lines(img)
            row, countcol, finalboxes       = get_cells(img, contours, boundingBoxes)
            outer                           = extract_text(dir, bitnot, row, countcol, finalboxes)



        remove_temp_folder(dir)





if __name__== "__main__":
    path = glob.glob('/home/anvnt/Documents/PDF-SaoKeNH/*.pdf')
    main_convert(path)


('/home/anvnt/Documents/PDF-SaoKeNH', '63000T9short.pdf')
Error: File exists
[' Ngay\nDate\n 03/09/2020\n', ' Ghi chi\nRemark\n S506TPKNLIEM\nMA_GD:105476311|k194050663,\nHoc phi,\nHK05/2019-2020[195EN02@1@156\nHK05/2019-2020\n2050000][BPMENT-DP-556310]\n', ' Ng\nDr\n ', ' Co\nCr\n 2,050,000\n', ' Sé6 du\n\nBalance\n 389,081,240\n', ' S6 tham chiéu\nRef. no\n VBA\n', ' 03/09/2020\n 04/09/2020\n', ' C19607069 DAI HOANG NGUYEN\nHOC LAI MON TRIET HOC\n MA_GD:105535254|K194040489,\nHoc\nphiHK01,HK01/2020-2021[1624182:\nHK01/2020-2021\n50000][1623687@4@1623687@Phit\nHK01/2020-2021\n4[BPMENT-FT-615050]\n', '  ', ' 2,200,000\n 95,000\n', ' 391,281,240\n 391,376,240\n', ' 64210TT201025858\n ', ' 04/09/2020\n 04/09/2020\n', ' MB(091028)(K194070943, Nguyen\nMinh Thu, Nop hoc phi HK\nHE-19-20)\n 590STBOBTDMY\nMA_GD:105552499|K17503KTLO71\nHoc phi,\nHK02/2019-2020[1927DS05@1@14\nHK02/2019-2020\n750000][BPMENT-DP-632192]\n', '  ', ' 822,000\n 750,000\n', ' 392,198,240\n 392,948,240\n', ' 2020090409

TypeError: can only concatenate tuple (not "str") to tuple