In [163]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

In [164]:
# Load the sample page. Forward slashes keep this relative path valid on every
# OS; the previous backslash form only resolved on Windows.
img = cv2.imread("test_images/test5.png")
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so fail here rather than deep inside the first cv2 call that uses img.
if img is None:
    raise FileNotFoundError("could not read image: test_images/test5.png")

In [165]:
def img2gray(img):
    """Convert a BGR image to a single-channel grayscale image.

    Parameters
    ----------
    img : image handed unchanged to ``cv2.cvtColor`` with ``cv2.COLOR_BGR2GRAY``.

    Returns
    -------
    The grayscale image returned by OpenCV.
    """
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

In [166]:
def findHorizontalLines(img, min_line_width=70):
    """Build a mask of the long horizontal strokes found in ``img``.

    Steps: grayscale conversion, inverted Otsu threshold, one 3x3 dilation,
    morphological opening with a ``min_line_width`` x 1 rectangle, then one
    3x3 erosion.

    Parameters
    ----------
    img : image accepted by ``img2gray`` (a BGR image).
    min_line_width : int, optional
        Width in pixels of the one-pixel-high rectangle used for the opening.
        Defaults to 70, the value that used to be hard-coded.

    Returns
    -------
    The eroded single-channel mask. The threshold step maps pixels to 0 or 255.
    """
    gray = img2gray(img)

    # Inverted Otsu threshold: pixels at or below the automatically chosen
    # level become 255, everything brighter becomes 0.
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # One 3x3 kernel serves both the dilation before the opening and the
    # erosion after it.
    kernel = np.ones((3, 3), np.uint8)
    dilate = cv2.dilate(thresh, kernel, iterations=1)

    # Structuring element describing the shape to keep: a horizontal run that
    # is min_line_width pixels wide and 1 pixel high.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (min_line_width, 1))

    # Opening keeps only the regions that can contain the whole rectangle.
    lineLocations = cv2.morphologyEx(dilate, cv2.MORPH_OPEN, horizontal_kernel, iterations=1)

    return cv2.erode(lineLocations, kernel, iterations=1)

In [167]:
def dataframe(img):
    """Summarise the horizontal-line mask of ``img`` row by row.

    Parameters
    ----------
    img : image forwarded to ``findHorizontalLines``.

    Returns
    -------
    (df, lineLocations)
        ``lineLocations`` is the mask divided by 255 and cast to ``uint8``.
        ``df`` has the columns ``rowLoc`` (row number) and ``LineLength``
        (sum of that mask row) and keeps only rows whose sum is positive;
        its index still carries the original row numbers.
    """
    scaled_mask = findHorizontalLines(img) / 255
    lineLocations = scaled_mask.astype(np.uint8)

    # One entry per image row; reset_index turns the row number into a column.
    per_row = pd.DataFrame(lineLocations.sum(axis=1)).reset_index()
    per_row.columns = ['rowLoc', 'LineLength']

    has_line = per_row['LineLength'] > 0
    return per_row[has_line], lineLocations

In [169]:
def col_st(img):
    """Return the first marked column of every row that contains a line.

    Parameters
    ----------
    img : image forwarded to ``dataframe``.

    Returns
    -------
    list of int
        For each row label in the frame returned by ``dataframe``, the lowest
        column index whose mask value equals 1. A row without any such column
        contributes nothing to the list.
    """
    df, lineLocations = dataframe(img)

    first_columns = []
    for row_label in df.index:
        # Column indices where this mask row is set, in ascending order.
        marked = np.flatnonzero(lineLocations[row_label] == 1)
        if marked.size:
            first_columns.append(int(marked[0]))
    return first_columns

In [171]:
def mask_n_crop(img):
    """Crop a grayscale band around every detected horizontal-line row.

    For each row reported by ``dataframe`` the crop spans from 20 rows above
    the line to 5 rows below it, and from the first marked column of that row
    to ``first column + LineLength - 1`` (exclusive).

    Parameters
    ----------
    img : image forwarded to ``img2gray`` and ``dataframe``.

    Returns
    -------
    list
        One crop per row in the frame. The crops are slices of a single
        grayscale array, so they share memory wherever they overlap.
    """
    img_gray = img2gray(img)
    df, linelocations = dataframe(img)

    list_cropped = []
    # Read the two columns separately and convert to plain ints: pulling a
    # whole row out with iloc mixes the signed row number with the unsigned
    # mask sum, which may be upcast to float and is then unusable as a slice
    # bound.
    for row, length in zip(df['rowLoc'], df['LineLength']):
        row = int(row)
        length = int(length)

        # The mask only holds 0/1 and this row has a positive sum, so argmax
        # is the first marked column. Looking it up here avoids running the
        # whole detection pipeline a second time through col_st.
        col_start = int(np.argmax(linelocations[row]))
        col_end = col_start + length

        # Clamp at the top edge: a negative start would wrap around to the
        # bottom of the image and give an empty or wrong crop.
        top = max(row - 20, 0)
        list_cropped.append(img_gray[top:row + 5, col_start:col_end - 1])
    return list_cropped

In [172]:
def remove_line(img):
    """Erase horizontal lines from every crop produced by ``mask_n_crop``.

    Each non-empty crop is thresholded (inverted Otsu), its horizontal runs
    are found with a 25x1 opening and painted white, the result is closed
    vertically with a 1x6 kernel, and finally every pixel is snapped to 0 or
    255.

    Parameters
    ----------
    img : image forwarded to ``mask_n_crop``.

    Returns
    -------
    list
        One binarised ``uint8`` array per non-empty crop, in crop order.
    """
    list_cropped_imgs = mask_n_crop(img)

    # Both kernels are constant, so build them once instead of per crop.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))
    repair_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 6))

    lines_removed = []
    for crop in list_cropped_imgs:
        # Skip crops with zero height or zero width; OpenCV cannot threshold
        # an empty array.
        if crop.size == 0:
            continue

        # The crops are slices of one shared grayscale array. Draw on a
        # private copy so erasing lines here cannot alter overlapping crops.
        gray = crop.copy()

        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

        # Detect the horizontal runs and paint them white.
        detected_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
        cnts = cv2.findContours(detected_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        # findContours returns 2 values on some OpenCV versions and 3 on
        # others; the contour list is the first or the second item.
        cnts = cnts[0] if len(cnts) == 2 else cnts[1]
        for c in cnts:
            cv2.drawContours(gray, [c], -1, (255, 255, 255), 2)

        # Repair: vertical closing on the inverted image, then invert back.
        result = 255 - cv2.morphologyEx(255 - gray, cv2.MORPH_CLOSE, repair_kernel, iterations=1)

        # Make the image binary: every pixel becomes 0 or 255.
        binary = 255 * (np.round(result / 255).astype(np.uint8))

        lines_removed.append(binary)

    return lines_removed

In [194]:
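# Debug preview of one cropped region. `contour` is defined in the next cell,
# so that cell must be run before this one is uncommented.
# cv2.imshow("img",contour(img)[6][4])
# cv2.waitKey(0)
# cv2.destroyAllWindows()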

In [181]:
def contour(img, scale_percent=200):
    """Cut every line image from ``remove_line`` into enlarged sub-images.

    Each line image is thresholded (inverted Otsu) and dilated with a 5x4
    rectangle; the bounding box of every external contour is cropped from the
    line image and resized with cubic interpolation.

    Parameters
    ----------
    img : image forwarded to ``remove_line``.
    scale_percent : int, optional
        Size of each returned crop as a percentage of its bounding box.
        Defaults to 200, the value that used to be hard-coded.

    Returns
    -------
    list of list
        One inner list per line image, holding the resized crops in the order
        ``cv2.findContours`` reported their contours.
    """
    lines_removed = remove_line(img)

    # Constant for every line image, so build it once.
    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 4))

    list_contours = []
    for line_img in lines_removed:
        thresh1 = cv2.threshold(line_img, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)[1]
        dilation = cv2.dilate(thresh1, rect_kernel, iterations=1)

        # findContours returns 2 values on some OpenCV versions and 3 on
        # others; pick the contour list the same way remove_line does.
        found = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        contours = found[0] if len(found) == 2 else found[1]

        word_imgs = []
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)

            # cv2.resize returns a new array, so slicing the line image
            # directly is safe and needs no defensive copy.
            region = line_img[y:y + h, x:x + w]

            width = int(region.shape[1] * scale_percent / 100)
            height = int(region.shape[0] * scale_percent / 100)

            resized = cv2.resize(region, (width, height), interpolation=cv2.INTER_CUBIC)
            word_imgs.append(resized)
        list_contours.append(word_imgs)

    return list_contours

In [195]:
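# Earlier draft of `fun`, kept commented out for reference only; the active
# definition is in the following cell.
# def fun(line_img):
#     line = ''
#     if line_img.shape[0] == 0:
#         return 0
#     words = contour(line_img)
#     key = len(words)
#     di = {key:words}
#     while key>0:
#         line += service(words[key]) + " "
#         key-=1
#     return line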

In [206]:
def fun(img):
    """Turn every line found in ``img`` into one space-separated string.

    Parameters
    ----------
    img : image forwarded to ``contour``.

    Returns
    -------
    list of str
        One string per inner list returned by ``contour``. Each string is the
        concatenation of ``service(crop) + " "`` over that list's crops, taken
        from the last crop to the first, so a non-empty line ends with a space.

    Notes
    -----
    ``service`` is not defined in the visible cells; it has to return
    something that can be concatenated with ``" "``.
    """
    lines_text = []
    for word_imgs in contour(img):
        # Crops are visited back to front, presumably because findContours
        # lists them in reverse reading order. TODO confirm.
        pieces = [service(word_img) + " " for word_img in reversed(word_imgs)]
        lines_text.append(''.join(pieces))
    return lines_text