In [None]:
!pip install python-doctr -q
!pip install mplcursors -q

In [None]:
# download an image with text
!wget https://github.com/opencv/opencv/blob/master/samples/data/imageTextN.png?raw=true -O sample.jpg
#!wget https://www.robots.ox.ac.uk/~vgg/software/textspot/text.png -O sample2.jpg

In [None]:
import doctr
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from doctr.utils.visualization import visualize_page
import numpy as np
import cv2
from google.colab.patches import cv2_imshow
import matplotlib.pyplot as plt

In [None]:
cv2_imshow(cv2.imread("sample.jpg"))

In [None]:
# Load the sample as a docTR document; doc[0] is the page image that is
# handed to visualize_page further down.
doc = DocumentFile.from_images("sample.jpg")

In [None]:
# Build the end-to-end OCR predictor with pretrained weights.
# det_arch picks the text detector (exposed later as model.det_predictor),
# reco_arch picks the text recognizer (exposed later as model.reco_predictor).
model = ocr_predictor(
    det_arch="db_resnet50",
    reco_arch="crnn_vgg16_bn",
    pretrained=True
)

In [None]:
# Switch the predictor to evaluation mode before inference
# (presumably torch.nn.Module.eval, applied to all sub-modules — TODO confirm).
model.eval()

In [None]:
# Run the full OCR pipeline on the document; result.pages[i] holds the output for page i.
result = model(doc)

In [None]:
print(result.pages[0].render())

In [None]:
result.pages[0].export()

In [None]:
# Visualize the first page: the exported predictions are drawn on top of
# the original page image doc[0].
visualize_page(result.pages[0].export(), doc[0])

In [None]:
# The same pipeline in detail, step by step

In [None]:
import torch
import torchvision
import math
import matplotlib.pyplot as plt

In [None]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Pull the underlying detection network out of the predictor and move it to
# the selected device; its feat_extractor / fpn / prob_head are called by hand below.
detector = model.det_predictor.model.to(device)

In [None]:
# Read the sample with OpenCV as a NumPy array for the manual pipeline below
# (it is converted with torch.from_numpy in detector_preprocess and cropped later).
image = cv2.imread("sample.jpg")

In [None]:
# detect preprocessing
def functional_resize(img, size=(1024,1024), preserve_aspect_ratio=True, symmetric_pad=True):
    """Resize a tensor image to ``size``, optionally letterboxing it.

    Args:
        img: tensor whose last two dimensions are (height, width).
        size: target (height, width).
        preserve_aspect_ratio: when True and the aspect ratios differ, the
            image is scaled to fit inside ``size`` and the rest is zero-padded;
            otherwise the image is stretched to exactly ``size``.
        symmetric_pad: when True the padding is split between both sides
            (the left/top side gets the larger half when the amount is odd);
            when False all padding goes to the right/bottom.

    Returns:
        (resized_img, pad) where ``pad`` is (left, right, top, bottom), i.e.
        the order expected by ``torch.nn.functional.pad``; it is
        (0, 0, 0, 0) when the image was simply stretched.
    """
    tv_functional = torchvision.transforms.functional
    bilinear = torchvision.transforms.InterpolationMode.BILINEAR

    out_h, out_w = size[0], size[1]
    wanted_ratio = out_h / out_w
    current_ratio = img.shape[-2] / img.shape[-1]

    # Nothing to letterbox: stretch straight to the target size.
    if not preserve_aspect_ratio or (wanted_ratio == current_ratio):
        stretched = tv_functional.resize(img, size, interpolation=bilinear, antialias=True)
        return stretched, (0, 0, 0, 0)

    # Fit the longer side to the target and derive the other one from the ratio
    # (never letting it collapse below one pixel).
    if current_ratio > wanted_ratio:
        fitted_size = (out_h, max(int(out_h / current_ratio), 1))
    else:
        fitted_size = (max(int(out_w * current_ratio), 1), out_w)
    scaled = tv_functional.resize(img, fitted_size, interpolation=bilinear, antialias=True)

    # Missing pixels per axis; torch pads the last dimension (width) first.
    missing_w = out_w - scaled.shape[-1]
    missing_h = out_h - scaled.shape[-2]
    if symmetric_pad:
        left = math.ceil(missing_w / 2)
        top = math.ceil(missing_h / 2)
        _pad = (left, missing_w - left, top, missing_h - top)
    else:
        _pad = (0, missing_w, 0, missing_h)

    return torch.nn.functional.pad(scaled, _pad), _pad

def functional_normalize(input_tensor, mean,std):
    """Per-channel standardisation: ``(input_tensor - mean) / std``.

    ``mean`` and ``std`` are turned into tensors of shape (C, 1, 1) so they
    broadcast over the trailing (height, width) dimensions of a
    channel-first input. They are created on the default device, so the
    input is expected to live there as well.
    """
    shift, scale = (torch.tensor(stat).view(-1, 1, 1) for stat in (mean, std))
    centered = input_tensor - shift
    return centered / scale

def detector_preprocess(page, size=(1024,1024), mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287)):
    """Turn one page image into a normalised input batch for the detector.

    Args:
        page: NumPy image of shape (H, W, C); values are divided by 255, so
            a 0..255 range is assumed. The channel order must match the
            order of ``mean`` / ``std``.
        size: (height, width) the page is resized / padded to. Previously
            this argument was ignored and (1024, 1024) was always used.
        mean: per-channel mean subtracted after scaling to [0, 1].
        std: per-channel standard deviation divided by after centring.

    Returns:
        (batch, padding): ``batch`` is a float32 tensor of shape
        (1, C, size[0], size[1]); ``padding`` is the (left, right, top,
        bottom) padding reported by ``functional_resize``.
    """
    # from_numpy rejects negative strides (e.g. a channel-flipped view);
    # ascontiguousarray is a no-op for ordinary arrays.
    x = torch.from_numpy(np.ascontiguousarray(page)).permute(2, 0, 1)
    # Resize before the float conversion, on the original dtype.
    x, padding = functional_resize(x, size)
    x = x.to(dtype=torch.float32).div(255).clip(0, 1)
    x = functional_normalize(x, mean=mean, std=std)
    # Add the batch dimension.
    x = x.unsqueeze(0)
    return x, padding

# cv2.imread yields BGR, while docTR loads pages as RGB and the normalisation
# statistics in detector_preprocess follow that order, so convert first.
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
blob, padding = detector_preprocess(image_rgb)

print(image.shape, blob.shape, padding)

In [None]:
# detect (classify each pixel for being a piece of text, apply threshold and geometry)
inp = blob.to(device)
# Inference only: no_grad keeps torch from recording an autograd graph
# (and holding on to every intermediate activation) for the whole network.
with torch.no_grad():
    feats = detector.feat_extractor(inp)
    # The backbone output is indexed by the string keys "0", "1", ...;
    # turn it into a list ordered by level for the FPN.
    feats = [feats[str(idx)] for idx in range(len(feats))]
    feat_concat = detector.fpn(feats)
    logits = detector.prob_head(feat_concat)
    # Per-pixel text probability in [0, 1].
    prob_map = torch.sigmoid(logits)

In [None]:
# detector backbone
detector.feat_extractor

In [None]:
[feat.shape for feat in feats]

In [None]:
# detector Feature Pyramid Network (FPN)
detector.fpn

In [None]:
feat_concat.shape

In [None]:
detector.prob_head

In [None]:
logits.shape, logits.min().item(), logits.max().item()

In [None]:
prob_map.shape, prob_map.min().item(), prob_map.max().item()

In [None]:
cv2_imshow((prob_map[0].squeeze(0).detach().cpu().numpy()*255).astype(np.uint8))

In [None]:
# postprocess
def polygon_to_box(points, unclip_ratio = 1.44):
    """Expand a polygon outward and return the bounding rectangle of the result.

    Every vertex is moved by ``distance = area * unclip_ratio / perimeter``
    along the normalised sum of the outward normals of its two adjacent
    edges. This is an approximation of true polygon offsetting: at a
    right-angle corner each side moves by ``distance / sqrt(2)``.

    Args:
        points: polygon vertices, shape (N, 2) (or anything reshapeable to it),
            in either winding order.
        unclip_ratio: expansion factor.

    Returns:
        ``(x, y, w, h)`` as returned by ``cv2.boundingRect`` for the expanded
        polygon, or ``None`` when the polygon has zero perimeter.
    """
    # Work in float32, the dtype the cv2 geometry helpers are called with.
    points = np.asarray(points).astype(np.float32).reshape(-1, 2)

    area = cv2.contourArea(points)
    length = cv2.arcLength(points, closed=True)

    if length == 0:
        return None

    distance = area * unclip_ratio / length  # similar to original

    # Drop vertices that coincide with their successor. Their zero-length
    # edges have no normal; skipping only the edge (as before) left the
    # normal list shorter than the vertex list and misaligned the two.
    successors = np.roll(points, -1, axis=0)
    points = points[np.any(points != successors, axis=1)]
    successors = np.roll(points, -1, axis=0)

    # Unit normal of edge i (points[i] -> points[i + 1]).
    edges = successors - points
    edge_lengths = np.linalg.norm(edges, axis=1, keepdims=True)
    edge_normals = np.stack([-edges[:, 1], edges[:, 0]], axis=1) / edge_lengths

    # (-dy, dx) points outward only when the shoelace sum is negative;
    # flip it for the opposite winding so the polygon always grows.
    signed_area = 0.5 * np.sum(points[:, 0] * successors[:, 1] - successors[:, 0] * points[:, 1])
    if signed_area > 0:
        edge_normals = -edge_normals

    # Vertex i sits between edge i - 1 and edge i.
    vertex_normals = np.roll(edge_normals, 1, axis=0) + edge_normals
    norms = np.linalg.norm(vertex_normals, axis=1, keepdims=True)
    # Opposite normals cancel out (norm 0): leave those vertices where they are.
    np.divide(vertex_normals, norms, out=vertex_normals, where=norms != 0)

    # Offset each point by its vertex normal * distance
    offset_points = points + distance * vertex_normals

    # Return bounding rect of expanded polygon
    offset_points_int = np.round(offset_points).astype(np.int32)
    return cv2.boundingRect(offset_points_int)

def box_score(pred, points, assume_straight_pages = True):
    """Confidence of a polygon: mean of ``pred`` over its bounding box.

    Args:
        pred: 2-D (or higher) score map indexed as ``pred[y, x]``.
        points: array of shape (N, 2) holding (x, y) vertices.
        assume_straight_pages: accepted for signature compatibility; not
            used by this implementation.

    Returns:
        Mean of ``pred`` over the axis-aligned, inclusive bounding box of
        ``points`` after clamping it to the map.
    """
    rows, cols = pred.shape[:2]
    xs = points[:, 0]
    ys = points[:, 1]

    def _clamped(value, rounding, upper):
        # Round outward, then keep the index inside [0, upper].
        return np.clip(rounding(value).astype(np.int32), 0, upper)

    left = _clamped(xs.min(), np.floor, cols - 1)
    right = _clamped(xs.max(), np.ceil, cols - 1)
    top = _clamped(ys.min(), np.floor, rows - 1)
    bottom = _clamped(ys.max(), np.ceil, rows - 1)

    window = pred[top : bottom + 1, left : right + 1]
    return window.mean()

def bitmap_to_boxes(pred, bitmap, box_thresh = 0.1, unclip_ratio = 1.7):
    """Extract scored, expanded text boxes from a binarised probability map.

    Args:
        pred: per-pixel probability map, indexed as ``pred[y, x]``.
        bitmap: binary map of the same spatial size (cast to uint8 here).
        box_thresh: boxes whose mean probability is below this are dropped.
        unclip_ratio: expansion factor handed to ``polygon_to_box``.

    Returns:
        List of ``[xmin, ymin, xmax, ymax, score]`` in pixel coordinates of
        the map. Coordinates may fall outside the map after expansion; they
        are not clipped here.
    """
    # get contours from connected components on the bitmap
    contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for contour in contours:
        # Check whether smallest enclosing bounding box is not too small
        if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < 2):
            continue
        # Compute objectness on the straight bounding rectangle of the contour
        x, y, w, h = cv2.boundingRect(contour)
        points: np.ndarray = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
        score = box_score(pred, points, assume_straight_pages=True)
        if score < box_thresh:  # remove polygons with a weak objectness
            continue

        _box = polygon_to_box(points, unclip_ratio)
        # polygon_to_box returns None for a degenerate polygon: skip it
        # instead of failing on the unpacking below.
        if _box is None:
            continue

        # convert (x, y, w, h) to absolute corner coordinates
        x, y, w, h = _box
        boxes.append([x, y, x + w, y + h, score])

    return boxes

def detector_postprocess(out_map, padding, size, bin_thresh = 0.1, box_thresh = 0.1, unclip_ratio = 1.7):
    """Convert the detector probability map into boxes on the original image.

    Args:
        out_map: probability tensor that becomes 2-D (H, W) after
            ``squeeze(0)``.
        padding: (left, right, top, bottom) padding, in map pixels, that was
            added around the resized page.
        size: (height, width) of the original image.
        bin_thresh: probability threshold used to binarise the map.
        box_thresh: minimum mean probability for a box to be kept.
        unclip_ratio: expansion factor for the detected boxes.

    Returns:
        (boxes, bitmap): ``boxes`` is a list of
        ``[xmin, ymin, xmax, ymax, score]`` with plain-int coordinates
        clamped to the image; ``bitmap`` is the binarised map with the
        padding removed, resized to the image and scaled to 0 / 255.
    """
    height, width = size
    prob = out_map.squeeze(0).detach().cpu().numpy()
    bitmap = (prob >= bin_thresh).astype(np.uint8)
    # Morphological opening removes isolated specks before contour extraction.
    bitmap = cv2.morphologyEx(bitmap, cv2.MORPH_OPEN, np.ones((3, 3), dtype=np.uint8))
    H, W = bitmap.shape
    _boxes = bitmap_to_boxes(prob, bitmap, box_thresh=box_thresh, unclip_ratio=unclip_ratio)

    # Size of the un-padded page inside the map.
    l, r, t, b = padding
    h, w = H - t - b, W - l - r

    # Loop-invariant mapping from padded-map coordinates to image coordinates:
    # remove the padding offset, then rescale to the original resolution.
    offset = np.array([l, t, l, t])
    scale = np.array([width / w, height / h, width / w, height / h])

    boxes = []
    for _box in _boxes:
        objectness = _box[4]
        mapped = np.round((np.asarray(_box[:4]) - offset) * scale)
        # Plain Python ints, so every coordinate has the same type.
        xmin, ymin, xmax, ymax = (int(v) for v in mapped)
        # NOTE(review): xmin gets an extra -1 margin but ymin does not; kept
        # as in the original, confirm whether the asymmetry is intended.
        box = [ max(xmin-1,0), max(ymin,0), min(xmax+1,width-1), min(ymax+1,height-1), objectness ]
        boxes.append(box)

    # cv2.resize takes (width, height).
    bitmap = cv2.resize(bitmap[t:H-b,l:W-r], (width, height))
    return boxes, bitmap*255

# Turn the probability map into boxes expressed in pixels of the original
# image; `bitmap` is the thresholded map (0 / 255) resized to the image size.
boxes, bitmap = detector_postprocess(prob_map[0], padding, image.shape[:2])

In [None]:
cv2_imshow(bitmap)

In [None]:
# Draw every detected box in green on a colour copy of the binarised map.
disp = cv2.cvtColor(bitmap, cv2.COLOR_GRAY2BGR)
for box in boxes:
    top_left = (box[0], box[1])
    bottom_right = (box[2], box[3])
    cv2.rectangle(disp, top_left, bottom_right, (0, 255, 0), 2)
cv2_imshow(disp)

In [None]:
# recognizer (RNN)

In [None]:
# Pick one detection to walk through recognition: the first four entries of a
# box are xmin, ymin, xmax, ymax, the fifth is the detection score.
index = 0
box, confidence = np.int32(boxes[index][:4]), float(boxes[index][4])
print(box,confidence)

In [None]:
# Cut the box out of the image (rows = y, columns = x); the +1 makes the
# xmax / ymax edges inclusive.
crop = image[box[1]:box[3]+1,box[0]:box[2]+1]
print(crop.shape)

In [None]:
cv2_imshow(crop)

In [None]:
# The recognition predictor; its pre_processor, model and model.postprocessor
# are called individually in the cells below. Move it to the selected device.
rnn = model.reco_predictor
rnn.to(device)

In [None]:
# recognition preprocessing
# `crop` comes from a cv2.imread image (BGR) while docTR loads pages as RGB,
# so swap the channels before handing the crop to the recognizer.
crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
batch = rnn.pre_processor([crop_rgb])[0].to(device)
batch.shape, batch.min().item(), batch.max().item()

In [None]:
# recognition
# Inference only: no_grad avoids recording an autograd graph for the forward pass.
with torch.no_grad():
    features = rnn.model.feat_extractor(batch)
    b, c, h, w = features.shape
    # Fold channels and height into one feature axis -> (batch, c*h, w),
    # then make width the sequence axis -> (batch, w, c*h).
    features_seq = torch.reshape(features, shape=(-1, h * c, w))
    features_seq = torch.transpose(features_seq, 1, 2)
    hidden_states, _ = rnn.model.decoder(features_seq)
    # Per-step class scores.
    logits = rnn.model.linear(hidden_states)

In [None]:
rnn.model.feat_extractor

In [None]:
features.shape

In [None]:
features_seq.shape

In [None]:
hidden_states.shape

In [None]:
rnn.model.linear

In [None]:
rnn.model.linear.bias.abs().max().item()

In [None]:
logits.shape

In [None]:
logits

In [None]:
# recognition postprocessing

In [None]:
# Greedy decoding: the most likely class index at every step of the first
# sequence in the batch (softmax is monotonic, so it does not change the argmax).
candidates = logits[0].softmax(dim=-1).argmax(dim=-1)
candidates

In [None]:
len(candidates)

In [None]:
logits[0].softmax(dim=-1).max(dim=-1).values

In [None]:
# Word confidence = the smallest of the per-step maximum probabilities.
# Note: this overwrites the detection score stored in `confidence` earlier.
confidence = logits[0].softmax(dim=-1).max(dim=-1).values.min(dim=-1).values.item()
confidence

In [None]:
rnn.model.postprocessor

In [None]:
rnn.model.postprocessor.vocab

In [None]:
len(rnn.model.postprocessor.vocab)

In [None]:
# Keep only indices that address a vocab character; indices >= len(vocab) are
# dropped (presumably the CTC blank class — TODO confirm).
# Consecutive repeats are NOT collapsed in this naive version.
ids = candidates[candidates < len(rnn.model.postprocessor.vocab)].tolist()
ids

In [None]:
result = ''.join(rnn.model.postprocessor.vocab[i] for i in ids)
result

In [None]:
# more sophisticated recogniton postprocessing (CTC)
result = rnn.model.postprocessor(logits)

In [None]:
result