# Import libraries

In [1]:
import onnxruntime as ort
import numpy as np
from PIL import Image
import math
import yaml

# Define helper functions and classes

In [2]:
def resize(w, h, expected_height, image_min_width, image_max_width):
    """Compute the target size for an image rescaled to a fixed height.

    The width is scaled by ``expected_height / h``, truncated to an integer,
    rounded up to the next multiple of 10 and finally clamped to
    ``[image_min_width, image_max_width]``.

    Args:
        w: Source width.
        h: Source height.
        expected_height: Height of the rescaled image.
        image_min_width: Lower bound applied to the computed width.
        image_max_width: Upper bound applied to the computed width
            (applied last, so it wins if the bounds conflict).

    Returns:
        tuple: ``(new_width, expected_height)``.
    """
    step = 10
    scaled_width = int(expected_height * float(w) / float(h))
    # Round up so the width is always a whole number of `step`-pixel columns.
    stepped_width = math.ceil(scaled_width / step) * step
    clamped_width = min(max(stepped_width, image_min_width), image_max_width)
    return clamped_width, expected_height

def process_image(image, image_height, image_min_width, image_max_width):
    """Convert an image to RGB, rescale it and return it as a scaled array.

    Args:
        image: Image object providing ``convert``, ``size`` and ``resize``
            (a PIL image in this notebook).
        image_height: Target height passed on to ``resize``.
        image_min_width: Minimum target width passed on to ``resize``.
        image_max_width: Maximum target width passed on to ``resize``.

    Returns:
        numpy.ndarray: The resized pixels with the last axis moved to the
        front and every value divided by 255.
    """
    rgb = image.convert('RGB')
    src_w, src_h = rgb.size
    target_w, target_h = resize(src_w, src_h, image_height, image_min_width, image_max_width)
    resized = rgb.resize((target_w, target_h), Image.LANCZOS)
    # Move the channel axis (last) to the front, then scale the values by 1/255.
    channels_first = np.asarray(resized).transpose(2, 0, 1)
    return channels_first / 255

def process_input(image, image_height, image_min_width, image_max_width):
    """Preprocess an image and return it as a float32 batch of size one.

    Takes the same arguments as ``process_image``; the result of that call
    gets a new leading axis of length 1 and is cast to ``float32``.
    """
    single = process_image(image, image_height, image_min_width, image_max_width)
    batched = single[None, ...]  # prepend the batch axis
    return batched.astype(np.float32)

class Vocab():
    """Character vocabulary mapping between characters and integer token ids.

    Ids 0-3 are reserved for the special tokens (pad, start, end, mask);
    regular characters are numbered from 4 in the order they appear in
    ``chars``.
    """

    def __init__(self, chars):
        # Reserved special-token ids.
        self.pad, self.go, self.eos, self.mask_token = 0, 1, 2, 3

        self.chars = chars

        # Forward (char -> id) and reverse (id -> char) lookup tables.
        self.c2i = {}
        self.i2c = {}
        for idx, ch in enumerate(chars, start=4):
            self.c2i[ch] = idx
            self.i2c[idx] = ch

        # Printable stand-ins for the reserved ids.
        self.i2c.update({0: '<pad>', 1: '<sos>', 2: '<eos>', 3: '*'})

    def encode(self, chars):
        """Return the ids of ``chars`` wrapped in the start and end ids."""
        body = [self.c2i[ch] for ch in chars]
        return [self.go, *body, self.eos]

    def decode(self, ids):
        """Turn a list of ids into text.

        The first element is skipped when the start id occurs anywhere in
        ``ids``, and decoding stops before the first end id if there is one.
        """
        start = 1 if self.go in ids else 0
        stop = None
        if self.eos in ids:
            stop = ids.index(self.eos)
        return ''.join(self.i2c[token] for token in ids[start:stop])

    def __len__(self):
        """Number of known characters plus the four reserved ids."""
        return 4 + len(self.c2i)

    def batch_decode(self, arr):
        """Decode every id sequence in ``arr`` and return the list of texts."""
        return [self.decode(ids) for ids in arr]

    def __str__(self):
        return self.chars


def translate_text(model_decoder, hidden, encoder_outputs, max_seq_length=128, sos_token=1, eos_token=2):
    """Greedily decode token sequences with a step-wise decoder session.

    Args:
        model_decoder: Object exposing ``run(None, feed_dict)`` that returns
            ``(output, hidden)`` for the feeds ``tgt_inp``, ``hidden_input``
            and ``encoder_outputs`` (an ONNX Runtime session in this notebook).
        hidden: Initial decoder state; its first dimension is used as the
            batch size.
        encoder_outputs: Encoder features, passed unchanged to every step.
        max_seq_length: Decoding stops once more than this many steps have run.
        sos_token: Id that starts every sequence.
        eos_token: Id that marks a sequence as finished.

    Returns:
        tuple: ``(translated_sentence, char_probs)``.
        ``translated_sentence`` is an integer array of shape
        ``(batch, steps + 1)`` whose first column is ``sos_token``.
        ``char_probs`` holds, per sequence, the mean probability of the
        emitted tokens whose id is greater than 3 (``nan`` when a sequence
        contains no such token).
    """
    # Size the start row from the actual batch instead of assuming one image.
    batch_size = np.shape(hidden)[0]

    translated_sentence = [[sos_token] * batch_size]
    char_probs = [[1] * batch_size]

    # Tracks which sequences have produced eos so far, so the loop condition
    # does not have to rescan the whole sentence on every step.
    finished = np.full(batch_size, sos_token == eos_token)

    steps = 0
    while steps <= max_seq_length and not finished.all():
        tgt_inp = np.array(translated_sentence, dtype=np.int64).T

        inputs = {
            "tgt_inp": tgt_inp,
            "hidden_input": hidden,
            "encoder_outputs": encoder_outputs,
        }
        output, hidden = model_decoder.run(None, inputs)

        # Numerically stable softmax over the vocabulary of the last time step.
        last_step = output[:, -1, :]
        last_step = np.exp(last_step - np.max(last_step, axis=-1, keepdims=True))
        last_step /= np.sum(last_step, axis=-1, keepdims=True)

        # Greedy choice. Index and probability come from the same maximum;
        # np.partition leaves its top-k slice unordered, so its last element
        # is not guaranteed to be the largest probability.
        indices = np.argmax(last_step, axis=-1)
        values = np.max(last_step, axis=-1)

        char_probs.append(values.tolist())
        translated_sentence.append(indices.tolist())
        finished |= indices == eos_token
        steps += 1

    translated_sentence = np.asarray(translated_sentence).T
    char_probs = np.asarray(char_probs).T
    # Only tokens with id > 3 (i.e. not one of the four reserved ids) count
    # towards the confidence.
    char_probs = np.multiply(char_probs, translated_sentence > 3)
    char_probs = np.sum(char_probs, axis=-1) / (char_probs > 0).sum(-1)

    return translated_sentence, char_probs

In [5]:
# Read the recognizer configuration and build the character vocabulary
# from its "vocab" entry.
config_path = "../config/vietocr_seq2seq_config.yaml"
with open(config_path, encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

vocab = Vocab(chars=config["vocab"])

# Inference

## Load image

In [7]:
# Open the sample image and preprocess it with the size limits taken from
# the "dataset" section of the config.
dataset_cfg = config['dataset']
image = process_input(
    Image.open("../asset/test.png"),
    image_height=dataset_cfg['image_height'],
    image_min_width=dataset_cfg['image_min_width'],
    image_max_width=dataset_cfg['image_max_width'],
)
print("image: ", image.shape, image.dtype)

image:  (1, 3, 32, 270) float32


## Encoder

In [3]:
# Create the encoder session. CUDA is preferred; the CPU provider is listed
# explicitly so the fallback order is visible in the code rather than implicit.
vietocr_encoder = ort.InferenceSession(
    "../checkpoint/text_recognition_encoder.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
# Show which providers the session actually ended up with, so a failed CUDA
# load is not hidden behind a silent fallback.
print("Active providers: ", vietocr_encoder.get_providers())
print("All input name: ")
for ip in vietocr_encoder.get_inputs():
    print(ip.name, ip.shape, ip.type)
print("All output name: ")
for op in vietocr_encoder.get_outputs():
    print(op.name, op.shape, op.type)

All input name: 
input_image ['batch_size', 3, 'height', 'width'] tensor(float)
All output name: 
hidden ['batch_size', 256] tensor(float)
encoder_outputs ['batch_size', 'src_len', 512] tensor(float)


[1;31m2025-05-11 23:36:21.035857273 [E:onnxruntime:Default, provider_bridge_ort.cc:2195 TryGetProviderInfo_CUDA] /onnxruntime_src/onnxruntime/core/session/provider_bridge_ort.cc:1778 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcudnn.so.9: cannot open shared object file: No such file or directory
[m
[0;93m2025-05-11 23:36:21.035879831 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:1055 CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Require cuDNN 9.* and CUDA 12.*. Please install all dependencies as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported.[m


In [8]:
# Run the encoder once on the preprocessed image; it returns the initial
# decoder state and the encoder features, in that order.
inputs = {"input_image": image}
hidden, encoder_outputs = vietocr_encoder.run(None, inputs)

print("hidden: ", hidden.shape)
print("encoder_outputs: ", encoder_outputs.shape)

hidden:  (1, 256)
encoder_outputs:  (1, 134, 512)


## Decoder

In [9]:
# Create the decoder session. CUDA is preferred; the CPU provider is listed
# explicitly so the fallback order is visible in the code rather than implicit.
vietocr_decoder = ort.InferenceSession(
    "../checkpoint/text_recognition_decoder.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
# Show which providers the session actually ended up with, so a failed CUDA
# load is not hidden behind a silent fallback.
print("Active providers: ", vietocr_decoder.get_providers())
print("All input name: ")
for ip in vietocr_decoder.get_inputs():
    print(ip.name, ip.shape, ip.type)
print("All output name: ")
for op in vietocr_decoder.get_outputs():
    print(op.name, op.shape, op.type)

All input name: 
tgt_inp ['batch_size', 'time_step'] tensor(int64)
hidden_input ['batch_size', 256] tensor(float)
encoder_outputs ['batch_size', 'src_len', 512] tensor(float)
All output name: 
output ['batch_size', 1, 233] tensor(float)
hidden_output ['batch_size', 256] tensor(float)


[1;31m2025-05-11 23:38:52.384242940 [E:onnxruntime:Default, provider_bridge_ort.cc:2195 TryGetProviderInfo_CUDA] /onnxruntime_src/onnxruntime/core/session/provider_bridge_ort.cc:1778 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcudnn.so.9: cannot open shared object file: No such file or directory
[m
[0;93m2025-05-11 23:38:52.384280654 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:1055 CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Require cuDNN 9.* and CUDA 12.*. Please install all dependencies as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported.[m


In [10]:
# Decode greedily from the encoder results, then map the first sequence of
# ids back to text with the vocabulary.
s, prob = translate_text(
    model_decoder=vietocr_decoder,
    hidden=hidden,
    encoder_outputs=encoder_outputs,
)
print(s)

translated_sentence = s[0].tolist()
text = vocab.decode(translated_sentence)
print("Text: ", text)

[[  1  97  26 144 232  76   6  98  74 232  40  12  98 232 207  77 100  26
   42 232  98  74   6  98  76 232  98  74  76  62 232  92  78  98  76 232
   44 100   4  98  76 208   2]]
Text:  Mặt hàng bán (Hoặc ngành nghề kinh doanh)
