### TODO

* Detect the text on the actual image.
* Detect the text on the white background image.
* Check if there is size difference.
* Make the bounding boxes the same size by expanding the smaller one.
* Then find the mapping function according to the actual image canvas.

In [92]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

import os
from utils import *

from paddleocr import PaddleOCR,draw_ocr

from PIL import Image, ImageFont
import numpy as np
import cv2
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from matplotlib.colors import hsv_to_rgb, rgb_to_hsv

from difflib import SequenceMatcher
# Paddleocr supports Chinese, English, French, German, Korean and Japanese.
# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan`
# to switch the language model in order.
def compose_paragraphs(text_bboxes, text_palettes, y_tolerance=30):
    '''
    Compose detected text lines into paragraphs.

    Lines are visited in order. Line ``i + 1`` joins the paragraph of line
    ``i`` when both lines have identical color palettes and the vertical
    distance between their left-top corners is at most the height of line
    ``i`` plus ``y_tolerance``. Otherwise line ``i + 1`` opens a new
    paragraph.

    Args:
        text_bboxes: Boxes given as four ``[x, y]`` corners ordered
            left top, right top, right bottom, left bottom.
        text_palettes: One color palette per box, in the same order.
        y_tolerance: Extra vertical slack, in the unit of the box
            coordinates, added to the line height. Defaults to 30.

    Returns:
        Grouped indices of the detected text elements, for example
        ``[[0, 1], [2]]``. An empty list when there is nothing to group.

    The number of lines that are grouped is printed, as before.
    '''
    # Only indices that exist in both inputs can be compared, so a length
    # mismatch must not run past the shorter list.
    num_text_boxes = min(len(text_bboxes), len(text_palettes))
    print(num_text_boxes)
    if num_text_boxes == 0:
        # There is no element 0 to put into a first paragraph.
        return []

    composed_text_idxs = [[0]]
    for i in range(num_text_boxes - 1):
        if (
            np.array_equal(text_palettes[i], text_palettes[i + 1])
            and _lines_are_close(text_bboxes[i], text_bboxes[i + 1], y_tolerance)
        ):
            composed_text_idxs[-1].append(i + 1)
        else:
            composed_text_idxs.append([i + 1])
    return composed_text_idxs


def _lines_are_close(upper_bbox, lower_bbox, y_tolerance):
    '''Tell whether two boxes are vertically close enough to share a paragraph.'''
    # Corner 0 is left top and corner 3 is left bottom, so the difference
    # of their y values is the height of the upper line.
    upper_height = abs(upper_bbox[0][1] - upper_bbox[3][1])
    return abs(upper_bbox[0][1] - lower_bbox[0][1]) <= upper_height + y_tolerance

def merge_bounding_boxes(composed_text_idxs, bboxes):
    '''
    Merge the boxes of every paragraph into one enclosing box.

    openCV --> x: left-to-right, y: top-to-bottom
    bbox coordinates --> [[256.0, 1105.0], [1027.0, 1105.0], [1027.0, 1142.0], [256.0, 1142.0]]
                     --> left top, right top, right bottom, left bottom

    Args:
        composed_text_idxs: Groups of indices into ``bboxes``, one group
            per paragraph.
        bboxes: Boxes given as four ``[x, y]`` corners.

    Returns:
        One box per group, in group order. A group with several members
        yields the axis-aligned box around all of them; a group with one
        member yields that member's box unchanged.

    TODO: Also return color palettes for each merged box.
    '''
    biggest_borders = []
    for idxs in composed_text_idxs:
        if len(idxs) > 1:
            # Take the extremes from the boxes themselves. Fixed start
            # values would clip coordinates outside the assumed range.
            per_box_min = [np.min(bboxes[idx], axis=0) for idx in idxs]
            per_box_max = [np.max(bboxes[idx], axis=0) for idx in idxs]
            smallest_x, smallest_y = np.min(per_box_min, axis=0)
            biggest_x, biggest_y = np.max(per_box_max, axis=0)

            biggest_borders.append([
                [smallest_x, smallest_y],
                [biggest_x, smallest_y],
                [biggest_x, biggest_y],
                [smallest_x, biggest_y],
            ])
        else:
            biggest_borders.append(bboxes[idxs[0]])
    return biggest_borders

def extract_text_bbox(ocr, img_path, preview_image_path):
    '''
    Detect the text lines of an image and estimate their colors.

    Text is extracted with PaddleOCR. Every detected box is cropped from
    the image and the colors inside the crop are clustered with KMeans.
    Near-white clusters are left out of the palette because white is the
    background of every text image.

    DONE: Try to combine very close lines as paragraph bbox.
    If the distance between two bboxes is smaller than the bbox height and
    the color is the same, we can group them as paragraphs.

    TODO: Cut images automatically from the sides by a margin.
    When constructing bounding boxes, add these margins back to the coordinates.
    Sometimes texts are so small that the model cannot detect them.

    Args:
        ocr: OCR engine exposing ``ocr(img_path, cls=True)``.
        img_path: Path to the text image.
        preview_image_path: Path to the preview image. It is no longer
            read: the template matching it served never contributed to
            the returned values. The parameter stays so that existing
            callers keep working.

    Returns:
        ``(palettes, dominants, boxes, texts)``. ``palettes`` holds the
        non-white cluster colors per text, ``dominants`` the most frequent
        of those colors per text, ``boxes`` the OCR boxes and ``texts`` the
        recognized strings. Colors are in the channel order of
        ``cv2.imread``. All four lists are empty when no text is detected.
    '''
    # Parameters for KMeans.
    n_colors = 3

    ocr_lines = ocr.ocr(img_path, cls=True)[0]
    # The OCR result for an image without text may be None or empty.
    if not ocr_lines:
        return [], [], [], []

    boxes = [line[0] for line in ocr_lines]
    texts = [line[1][0] for line in ocr_lines]
    image = cv2.imread(img_path)

    # Loop-invariant KMeans settings.
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, .1)
    flags = cv2.KMEANS_RANDOM_CENTERS

    palettes = []
    dominants = []

    # Run KMeans for each text object
    for bbox in boxes:
        # Crop the text area between the left-top and right-bottom corners.
        left, top = int(bbox[0][0]), int(bbox[0][1])
        right, bottom = int(bbox[2][0]), int(bbox[2][1])
        cropped_image = image[top:bottom, left:right]

        # Apply KMeans to the text area
        pixels = np.float32(cropped_image.reshape(-1, 3))
        _, labels, palette = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags)
        palette = np.asarray(palette, dtype=np.int64)

        # Count the pixels of every cluster by its label, so the counts
        # stay tied to the palette index whatever gets filtered out below.
        present, present_counts = np.unique(labels, return_counts=True)
        pixel_count = dict(zip(present.tolist(), present_counts.tolist()))

        # Do not add white to the palette since it is the same background in every pic.
        kept = [
            i for i, color in enumerate(palette)
            if not all(252 < channel < 256 for channel in color)
        ]
        if not kept:
            # Every cluster is near-white: keep them all instead of
            # failing on an empty palette.
            kept = list(range(len(palette)))

        palette_wo_white = [palette[i] for i in kept]
        dominant = palette[max(kept, key=lambda i: pixel_count.get(i, 0))]
        palettes.append(palette_wo_white)
        dominants.append(dominant)

    return palettes, dominants, boxes, texts
        
def similar(a, b):
    '''Return the difflib similarity ratio of ``a`` and ``b``.'''
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def extract_text_directly(ocr, img_path, white_bg_texts, min_similarity=0.85):
    '''
    Find known texts on an image and return the boxes where they appear.

    The image is read with OCR. Spaces are removed from both the detected
    and the expected strings before they are compared with ``similar``.

    Args:
        ocr: OCR engine exposing ``ocr(img_path, cls=True)``.
        img_path: Path to the image to search.
        white_bg_texts: Expected strings, e.g. the texts detected on the
            white background image.
        min_similarity: A detected line matches an expected string when
            their similarity ratio is strictly greater than this value.
            Defaults to 0.85.

    Returns:
        The OCR boxes of the matching lines, ordered by expected string
        first and detection order second. A detected line is listed once
        per expected string it matches. Empty when no text is detected.
    '''
    ocr_lines = ocr.ocr(img_path, cls=True)[0]
    # The OCR result for an image without text may be None or empty.
    if not ocr_lines:
        return []

    boxes = [line[0] for line in ocr_lines]
    texts = [line[1][0].replace(" ", "") for line in ocr_lines]
    white_bg_texts = [elem.replace(" ", "") for elem in white_bg_texts]

    new_boxes = []
    for elem in white_bg_texts:
        for box, text in zip(boxes, texts):
            if similar(elem, text) > min_similarity:
                new_boxes.append(box)

    return new_boxes

def extract_decor_elements(decoration_path, preview_path):
        '''
            Find the colored shapes of a decoration image.

            The pixels of the decoration image are clustered into `num_colors`
            colors with KMeans. Near-white cluster centers are dropped. For every
            remaining color an HSV range mask is built, and the bounding rectangle
            of every external contour in that mask is collected.

            Input: decoration_path --> image the shapes are searched in
                   preview_path    --> image used by the template matching step only

            Return: colors --> [h, s, v] of every kept cluster center
                               (h scaled by 180, s and v scaled by 255)
                    bboxes --> one box per contour, as
                               left top, right top, right bottom, left bottom

            NOTE(review): the returned `bboxes` is the list built BEFORE NMS.
            `new_bboxes` (after NMS) and `correct_bboxes` (after template matching)
            are computed but never returned -- confirm which list callers should get.
        '''
        # Determine the number of dominant colors
        num_colors = 6
        
        # Load the image
        image = cv2.imread(decoration_path)
        preview_image = cv2.imread(preview_path)
        
        # Convert the image to the RGB color space
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image2 = image.copy()
        
        # Reshape the image to a 2D array of pixels
        pixels = image.reshape(-1, 3)
        
        # Apply K-means clustering with the determined number of colors
        # No random_state is passed, so the clusters may differ between runs.
        kmeans = KMeans(n_clusters=num_colors)
        kmeans.fit(pixels)
        
        # Get the RGB values of the dominant colors
        colors = kmeans.cluster_centers_.astype(int)
        print("Num of colors: ", len(colors))
        
        # Convert the colors to the HSV color space
        hsv_colors = [] 

        for i, color in enumerate(colors):
            x, y, z = color
            # Skip near-white centers (every channel between 253 and 255).
            if not (252 < x < 256 and 252 < y < 256 and 252 < z < 256):
                # rgb_to_hsv gets channels scaled to 0..1; the result is rescaled
                # by 180 / 255 / 255 to be comparable with the cv2 HSV image below.
                x, y, z = rgb_to_hsv([x/255, y/255, z/255])
                hsv_colors.append([x*180, y*255, z*255])
        # Convert the image to the HSV color space
        hsv_image = cv2.cvtColor(image2, cv2.COLOR_RGB2HSV)
        
        # Create masks for each dominant color
        masks = []
        hsv_colors = np.asarray(hsv_colors, dtype=np.int32)
        
        # `colors` is rebuilt here: from now on it holds the HSV triplets that
        # are returned, no longer the RGB cluster centers.
        colors = []
        for i in range(len(hsv_colors)):
            
            h, s, v = hsv_colors[i, :]
            # Asymmetric window around the color: 10/50/50 below, 10/255/255 above.
            lower_color = hsv_colors[i, :] - np.array([10, 50, 50])
            upper_color = hsv_colors[i, :] + np.array([10, 255, 255])
            mask = cv2.inRange(hsv_image, lower_color, upper_color)
            colors.append([h,s,v])
            masks.append(mask)
        
        # Find contours in each mask
        contours = []
        for mask in masks:
            contours_color, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
            contours.append(contours_color)
        
        # Collect the bounding box of every contour of every color.
        # `image_with_boxes` is only referenced by the commented-out drawing call.
        image_with_boxes = image.copy()
        bboxes = []
        for i, contour_color in enumerate(contours):
            for contour in contour_color:
                x, y, w, h = cv2.boundingRect(contour)
                # left top, right top, right bottom, left bottom
                bboxes.append([[x,y], [x+w, y], [x+w, y+h], [x,y+h]])

        # NMS is not defined in this part of the file -- presumably it comes from
        # `from utils import *`; TODO confirm what it returns.
        new_bboxes = NMS(np.asarray(bboxes))
        correct_bboxes = []
        border_size=1
        for bbox in new_bboxes:
            [[x,y], [z, y], [z, t], [x, t]] = bbox
            #cv2.rectangle(image_with_boxes, (x, y), (z, t), (0, 255, 0), 2)
            cropped_image = image[y:t, x:z]
            # Pad the crop with a white frame of `border_size` pixels on every side.
            cropped_image = cv2.copyMakeBorder(
                cropped_image,
                top=border_size,
                bottom=border_size,
                left=border_size,
                right=border_size,
                borderType=cv2.BORDER_CONSTANT,
                value=[255, 255, 255]
            )

            # Locate the padded crop on the preview image; with TM_SQDIFF_NORMED
            # the best match is the minimum of the result.
            method = cv2.TM_SQDIFF_NORMED
            result = cv2.matchTemplate(cropped_image, preview_image, method)
            mn,_,mnLoc,_ = cv2.minMaxLoc(result)
            MPx,MPy = mnLoc
            trows,tcols = cropped_image.shape[:2]
            bbox = [[MPx+1,MPy-1], [MPx+tcols+1,MPy-1], [MPx+tcols+1, MPy+trows-1], [MPx+1, MPy+trows-1]]
            correct_bboxes.append(bbox)
        
        # NOTE(review): `correct_bboxes` is discarded; the pre-NMS boxes are returned.
        return colors, bboxes

def map_decoration_coordinates(design_text_coordinate, text_coordinate, decoration_coordinates, prev_size, text_size):
    '''
    Translate decoration boxes by the offset between two text anchors.

    The offset is the first corner of ``text_coordinate`` minus the first
    corner of ``design_text_coordinate``. It is subtracted from every
    point of every decoration box.

    box layout --> [[256.0, 1105.0], [1027.0, 1105.0], [1027.0, 1142.0], [256.0, 1142.0]]
               --> left top, right top, right bottom, left bottom

    Args:
        design_text_coordinate: Box of a text on the design image.
        text_coordinate: Box of the same text on the text image.
        decoration_coordinates: Boxes to translate.
        prev_size: Pair describing the preview image size; only unpacked
            and printed.
        text_size: Pair describing the text image size; only unpacked
            and printed.

    Returns:
        The translated boxes, in the order of ``decoration_coordinates``.
        Debug values are printed along the way.
    '''
    # Both sizes are unpacked and printed, but take no part in the mapping.
    _prev_a, _prev_b = prev_size
    print(prev_size, text_size)
    _text_a, _text_b = text_size

    design_x, design_y = design_text_coordinate[0]
    anchor_x, anchor_y = text_coordinate[0]
    for label, value in (("TEXT X", anchor_x), ("DESIGN X", design_x)):
        print(label)
        print(value)

    shift_x = anchor_x - design_x
    shift_y = anchor_y - design_y

    def _shifted(point):
        # Log the x value before it is moved.
        print("ELEM X")
        print(point[0])
        return [point[0] - shift_x, point[1] - shift_y]

    return [
        [_shifted(point) for point in polygon]
        for polygon in decoration_coordinates
    ]

# Create the OCR engine with the angle classifier enabled and the English
# language model. This needs to run only once: it downloads the model and
# loads it into memory.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

[2023/07/05 15:33:04] ppocr DEBUG: Namespace(alpha=1.0, benchmark=False, beta=1.0, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/Users/busraasan/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/Users/busraasan/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mode='fast', e2e_pgnet_score_thresh=0.

In [93]:
# Paths of the layers that belong to one design.
text = '../destijl_dataset/04_text/0007.png'
preview = '../destijl_dataset/00_preview/0007.png'
decoration = '../destijl_dataset/03_decoration/0007.png'

# Load every image once. image2 and image3 are separate canvases of the
# preview, so that debug drawings do not end up on top of each other.
image = cv2.imread(preview)
image_text = cv2.imread(text)
if image is None or image_text is None:
    raise FileNotFoundError(f"Could not read {preview!r} or {text!r}")
image2 = image.copy()
image3 = image.copy()

# DETECT TEXT ON TEXT IMAGE
text_palettes, text_dominants, text_bboxes, texts = extract_text_bbox(ocr, text, preview)
if not text_bboxes:
    raise ValueError(f"No text detected on {text!r}")
composed_text_idxs = compose_paragraphs(text_bboxes, text_palettes)
merged_bboxes = merge_bounding_boxes(composed_text_idxs, text_bboxes)

# DETECT TEXT ON REAL DESIGN
text_bboxes1 = extract_text_directly(ocr, preview, texts)
if not text_bboxes1:
    # The first matched box anchors the mapping below, so it must exist.
    raise ValueError(f"None of the texts were found on {preview!r}")
composed_text_idxs1 = compose_paragraphs(text_bboxes1, text_palettes)
merged_bboxes1 = merge_bounding_boxes(composed_text_idxs1, text_bboxes1)
decoration_hsv_palettes, decoration_bboxes = extract_decor_elements(decoration, preview)

# shape[:2] is (rows, columns) of each image.
new_coordinates = map_decoration_coordinates(text_bboxes1[0], text_bboxes[0], decoration_bboxes, image.shape[:2], image_text.shape[:2])

print(new_coordinates)
# for bbox in text_bboxes1:
#     x, y = bbox[0][0], bbox[0][1]
#     z, t = bbox[2][0], bbox[2][1]
#     cv2.rectangle(image, (int(x), int(y)), (int(z), int(t)), (0, 255, 0), 2)
    
# cv2.imwrite('result_prev.jpg', image)

# for bbox in text_bboxes:
#     x, y = bbox[0][0], bbox[0][1]
#     z, t = bbox[2][0], bbox[2][1]
#     cv2.rectangle(image2, (int(x), int(y)), (int(z), int(t)), (0, 255, 0), 2)
    
# cv2.imwrite('result_text.jpg', image2)

# Draw the mapped decoration boxes on the preview.
for bbox in new_coordinates:
    x, y = bbox[0][0], bbox[0][1]
    z, t = bbox[2][0], bbox[2][1]
    cv2.rectangle(image3, (int(x), int(y)), (int(z), int(t)), (0, 255, 0), 2)
    
cv2.imwrite('result_dec.jpg', image3)

[2023/07/05 15:33:07] ppocr DEBUG: dt_boxes num : 7, elapse : 0.9603531360626221
[2023/07/05 15:33:07] ppocr DEBUG: cls num  : 7, elapse : 0.08154010772705078
[2023/07/05 15:33:09] ppocr DEBUG: rec_res num  : 7, elapse : 1.3612589836120605
7
[2023/07/05 15:33:10] ppocr DEBUG: dt_boxes num : 9, elapse : 0.4151937961578369
[2023/07/05 15:33:11] ppocr DEBUG: cls num  : 9, elapse : 0.0915839672088623
[2023/07/05 15:33:12] ppocr DEBUG: rec_res num  : 9, elapse : 1.6123747825622559
7
Num of colors:  6
(1902, 1068) (3188, 3596)
TEXT X
989.0
DESIGN X
124.0
ELEM X
1821
ELEM X
1910
ELEM X
1910
ELEM X
1821
ELEM X
895
ELEM X
1071
ELEM X
1071
ELEM X
895
ELEM X
1748
ELEM X
1910
ELEM X
1910
ELEM X
1748
ELEM X
1366
ELEM X
1425
ELEM X
1425
ELEM X
1366
ELEM X
1366
ELEM X
1425
ELEM X
1425
ELEM X
1366
ELEM X
855
ELEM X
931
ELEM X
931
ELEM X
855
ELEM X
855
ELEM X
931
ELEM X
931
ELEM X
855
ELEM X
1913
ELEM X
1935
ELEM X
1935
ELEM X
1913
ELEM X
855
ELEM X
923
ELEM X
923
ELEM X
855
ELEM X
1877
ELEM X
1935
ELE

True