In [None]:
# TODO: Read files: image (TDET + POCR + VOCR), pdf, excel, docx, text
# TODO: Combine TDET + POCR + VOCR -> VDOCR
# TODO: Put VDOCR into gradio app

In [None]:
from pkg.VDOCR.TDET.TDET import Process_TDET
from pkg.VDOCR.POCR.POCR import Process_POCR
from pkg.VDOCR.VOCR.VOCR import Process_VOCR
import pkg.UTILS.UTILS as UTILS
import cv2

In [None]:
def VDOCR_bbox_in_bbox_ratio(bb1, bb2):
    """Return the fraction of ``bb1``'s area that lies inside ``bb2``.

    Args:
        bb1: The inside bbox, indexed as ``(x1, y1, x2, y2)``.
        bb2: The outside bbox, indexed as ``(x1, y1, x2, y2)``.

    Returns:
        ``1.0`` when ``bb1`` is fully contained in ``bb2`` (edges may touch),
        ``0.0`` when the boxes share no area (including a zero-area ``bb1``
        that is not fully contained), otherwise ``overlap_area / area(bb1)``.
    """
    # Containment is tested first so that a degenerate box lying entirely
    # inside bb2 still counts as fully inside.
    if bb1[0] >= bb2[0] and bb1[2] <= bb2[2] and bb1[1] >= bb2[1] and bb1[3] <= bb2[3]:
        return 1.0
    overlap_w = min(bb1[2], bb2[2]) - max(bb1[0], bb2[0])
    overlap_h = min(bb1[3], bb2[3]) - max(bb1[1], bb2[1])
    # No shared area: return before dividing, so a zero-area bb1 can never
    # trigger a division by zero.
    if overlap_w <= 0 or overlap_h <= 0:
        return 0.0
    # A positive overlap in both axes implies bb1 itself has positive area.
    bb1_area = (bb1[2] - bb1[0]) * (bb1[3] - bb1[1])
    return (overlap_w * overlap_h) / bb1_area
def VDOCR_get_bbox_cut_from_overlap(bb1, bb2):
    """Clip the text bbox ``bb1`` to the table cell bbox ``bb2``.

    Both boxes are indexed as ``(x1, y1, x2, y2)``. The result is the
    intersection rectangle as a 4-tuple; no check is made that the two boxes
    actually overlap.
    """
    left = max(bb1[0], bb2[0])
    top = max(bb1[1], bb2[1])
    right = min(bb1[2], bb2[2])
    bottom = min(bb1[3], bb2[3])
    return (left, top, right, bottom)
def VDOCR_bboxes_2_rowclusters(texts_bboxes, max_distance=5):
    """Group text boxes into rows and order them for reading.

    Boxes are clustered on their top coordinate (index 1): after sorting by
    that value, a new cluster starts wherever two consecutive values differ by
    more than ``max_distance``.

    Args:
        texts_bboxes: Sequence of boxes indexed as ``(x1, y1, x2, y2)``.
        max_distance: Largest gap between consecutive sorted top coordinates
            that still keeps two boxes in the same row cluster.

    Returns:
        A list of clusters ordered top-to-bottom; each cluster is a list of
        the original boxes ordered left-to-right. Empty input of any sequence
        type yields ``[]``.
    """
    import numpy as np

    # len() instead of `== []` so empty tuples / arrays are handled too.
    if len(texts_bboxes) == 0:
        return []

    tops = np.asarray([bbox[1] for bbox in texts_bboxes])
    # Stable sort keeps boxes with equal top coordinates in input order,
    # which makes the output deterministic.
    order = np.argsort(tops, kind="stable")
    split_points = np.where(np.diff(tops[order]) > max_distance)[0] + 1
    index_groups = np.split(order, split_points)

    # Within a row: left-to-right by x1.
    clusters = [
        sorted((texts_bboxes[idx] for idx in group), key=lambda bbox: bbox[0])
        for group in index_groups
    ]
    # Across rows: top-to-bottom by the y1 of each row's first box.
    clusters.sort(key=lambda cluster: cluster[0][1])
    return clusters
def VDOCR_get_bg_color(img_ocv, groups=25):
    """Estimate the background colour of an image as a neutral grey triple.

    The image is reduced to a single grey channel, its values are histogrammed
    into ``groups`` equal bins over ``[0, 256)``, and the centre of the most
    populated bin is returned for all three channels.

    Args:
        img_ocv: Image array. 2-D (already grey), single-channel, 3-channel
            (treated as BGR) and 4-channel (treated as BGRA) layouts are
            accepted.
        groups: Number of histogram bins.

    Returns:
        A 3-tuple of identical ints. On any failure a warning is printed and
        ``(250, 250, 250)`` is returned (best-effort behaviour).
    """
    import numpy as np
    import cv2
    try:
        # COLOR_BGR2GRAY only accepts 3-channel input; without these branches
        # grey or BGRA images always fell through to the hard-coded fallback.
        if img_ocv.ndim == 2:
            gray = img_ocv
        elif img_ocv.shape[2] == 1:
            gray = img_ocv[:, :, 0]
        elif img_ocv.shape[2] == 4:
            gray = cv2.cvtColor(img_ocv, cv2.COLOR_BGRA2GRAY)
        else:
            gray = cv2.cvtColor(img_ocv, cv2.COLOR_BGR2GRAY)
        hist = cv2.calcHist([gray], [0], None, [groups], [0, 256])
        dominant_group = int(np.argmax(hist))
        # Centre of the dominant bin, mapped back to the 0-255 scale.
        background_brightness = int((dominant_group + 0.5) * (256 / groups))
        return (background_brightness,) * 3
    except Exception as error:
        print(f"⚠️ VDOCR > VDOCR_get_bg_color > Error: {error}")
        return (250, 250, 250)
def VDOCR_add_blank_margin(img_ocv, margin_ratio=0.1):
    """Surround an image with a constant-colour border on all four sides.

    The border thickness is ``ceil(height * margin_ratio)`` pixels. It is
    derived from the image height only and applied equally to top, bottom,
    left and right. The fill colour is whatever ``VDOCR_get_bg_color`` returns
    for the same image.
    """
    import cv2
    import math
    height, _width = img_ocv.shape[:2]
    border = math.ceil(height * margin_ratio)
    fill_color = VDOCR_get_bg_color(img_ocv)
    return cv2.copyMakeBorder(
        img_ocv,
        border,  # top
        border,  # bottom
        border,  # left
        border,  # right
        cv2.BORDER_CONSTANT,
        value=fill_color,
    )

def Process_VOCR_with_blank_margin(img_ocv):
    """Run ``Process_VOCR`` on ``img_ocv`` after padding it via ``VDOCR_add_blank_margin``."""
    padded = VDOCR_add_blank_margin(img_ocv)
    return Process_VOCR(padded)

In [None]:
# Load the test document image and preprocess it.
# cv2.imread returns None (it does not raise) when the file is missing or
# cannot be decoded, so fail here with a clear message instead of letting a
# later cell crash on a None image.
img_path = "_test/img_4.jpg"
img_ocv = cv2.imread(img_path)
if img_ocv is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")
img_ocv = UTILS.preprocess_document_image(img_ocv)

In [None]:
# Table detection on the loaded image. Later cells treat the result as a list
# of tables, each a list of cell dicts read via the keys 'bbox', 'row_id',
# 'col_id', 'row_span' and 'col_span'.
tables = Process_TDET(img_ocv)
# Text-region detection on the same image. Later cells unpack each entry as an
# (x1, y1, x2, y2) box and use it to slice the image.
# NOTE(review): presumably integer pixel coordinates; confirm against Process_POCR.
texts_bboxes = Process_POCR(img_ocv)

In [None]:
# Give every table cell a fresh, empty list of text boxes.
for tbl in tables:
    for cell in tbl:
        cell['text_bboxes'] = []

# Hand each detected text box to every cell that covers more than 25% of it
# (clipped to that cell); boxes matching no cell are kept as non-table text.
texts_bboxes_nontable = []
for text_bbox in texts_bboxes:
    matched_any_cell = False
    for tbl in tables:
        for cell in tbl:
            cell_bbox = cell['bbox']
            coverage = VDOCR_bbox_in_bbox_ratio(text_bbox, cell_bbox)
            if coverage > 0.25:
                matched_any_cell = True
                clipped_bbox = VDOCR_get_bbox_cut_from_overlap(text_bbox, cell_bbox)
                cell['text_bboxes'].append(clipped_bbox)
    if not matched_any_cell:
        texts_bboxes_nontable.append(text_bbox)

# Arrange the boxes of each cell into reading-order row clusters.
for tbl in tables:
    for cell in tbl:
        cell['rowclusters'] = VDOCR_bboxes_2_rowclusters(cell['text_bboxes'])

# Same arrangement for the text outside any table.
rowclusters_nontable = VDOCR_bboxes_2_rowclusters(texts_bboxes_nontable)

In [None]:
# OCR every table cell and render each table as pipe-delimited text rows.
# Each entry of ocr_tables holds the rendered "text" and the table's top
# coordinate "y1" (smallest cell bbox y1), used later for vertical ordering.
ocr_tables = []
for tbl in tables:
    if len(tbl) == 0:
        # A table without cells has no grid to size and no bbox to order it by;
        # max()/min() below would raise on it.
        continue
    # Size the grid from start index + span, not just the largest start index:
    # a trailing row/column that exists only as part of a span would otherwise
    # be out of range when the spanned text is written below.
    n_rows = max(cell['row_id'] + max(cell['row_span'], 1) for cell in tbl)
    n_cols = max(cell['col_id'] + max(cell['col_span'], 1) for cell in tbl)
    tbl_text = [[[] for _ in range(n_cols)] for _ in range(n_rows)]
    for cell in tbl:
        # Recognise each text crop of the cell, in reading order.
        cell_text = [
            Process_VOCR_with_blank_margin(img_ocv[y1:y2, x1:x2])
            for rowcluster in cell['rowclusters']
            for x1, y1, x2, y2 in rowcluster
        ]
        # A spanning cell repeats its text in every grid position it covers.
        for row_offset in range(cell['row_span']):
            for col_offset in range(cell['col_span']):
                tbl_text[cell['row_id'] + row_offset][cell['col_id'] + col_offset] = cell_text
    ocr_tables.append({
        "text": "\n".join("| " + " | ".join(" ".join(col) for col in row) + " |" for row in tbl_text),
        "y1": min(cell['bbox'][1] for cell in tbl),
    })

In [None]:
# OCR every text box outside the tables, in row-cluster order. Each entry keeps
# the recognised "text" and the box's top coordinate "y1" for later ordering.
ocr_nontables = [
    {
        "text": Process_VOCR_with_blank_margin(img_ocv[y1:y2, x1:x2]),
        "y1": y1,
    }
    for rowcluster in rowclusters_nontable
    for x1, y1, x2, y2 in rowcluster
]

In [None]:
# Merge table and non-table results, order them top-to-bottom by "y1", and print the text.
ocr_all = ocr_tables + ocr_nontables
ocr_all.sort(key=lambda e: e['y1'])
ocr_text = "\n".join(e['text'] for e in ocr_all)
print(ocr_text)

# ---------------------------------------------------------------------------- Just to visualize
# Draw every text box with its "<row cluster index>-<index in row>" label:
# (0, 0, 255) for boxes inside table cells, (255, 0, 0) for boxes outside tables.
img_tmp = img_ocv.copy()
overlays = [(cell['rowclusters'], (0, 0, 255)) for tbl in tables for cell in tbl]
overlays.append((rowclusters_nontable, (255, 0, 0)))
for rowclusters, color in overlays:
    for i1, rowcluster in enumerate(rowclusters):
        for i2, (x1, y1, x2, y2) in enumerate(rowcluster):
            cv2.rectangle(img_tmp, (x1, y1), (x2, y2), color, 2)
            cv2.putText(img_tmp, f"{i1}-{i2}", (x2, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
UTILS.show_ocv(img_tmp)
# ----------------------------------------------------------------------------