## 1. Libraries

In [1]:
import torch
import torchvision
import onnx
import onnxruntime as ort
from torchvision import transforms as T
from PIL import Image
import os
from tqdm import tqdm
import time
# Shared preprocessing pipeline applied to every PIL image before it is fed to
# PARSeq: bicubic resize to 32x128 (H x W), conversion to a tensor, then
# normalization with mean 0.5 and std 0.5.
preprocess_parseq = T.Compose(
    [
        T.Resize(size=(32, 128), interpolation=T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=0.5, std=0.5),
    ]
)

In [2]:
def main(batch_size, device_type="cpu", img_folder="/home/ubuntu/parseq/demo_images"):
    """Export the pretrained PARSeq model to TorchScript and sanity-check it on one image.

    The function loads PARSeq from torch.hub, decodes a single demo image with
    the eager model, scripts the model to ``parseq_<device_type>_torchscript.pt``,
    reloads that file and decodes the same image again so both labels can be
    compared in the printed output.

    Args:
        batch_size: Leading dimension of the random example input handed to
            ``to_torchscript``.
        device_type: ``"cpu"`` or ``"cuda"`` (mapped to ``cuda:0``).
        img_folder: Currently unused; kept so existing callers keep working.

    Raises:
        ValueError: If ``device_type`` is neither ``"cpu"`` nor ``"cuda"``.
    """
    if device_type == "cpu":
        device = torch.device("cpu")
    elif device_type == "cuda":
        device = torch.device("cuda:0")
    else:
        # Previously an unknown value fell through and failed later with an
        # UnboundLocalError on `parseq`.
        raise ValueError(
            "device_type must be 'cpu' or 'cuda', got {!r}".format(device_type)
        )
    parseq = torch.hub.load('baudm/parseq', 'parseq', pretrained=True).eval().to(device)

    test_img_path = "demo_images/art-01107.jpg"
    with Image.open(test_img_path) as img_file:
        # Preprocess. Model expects a batch of images with shape: (B, C, H, W)
        img = preprocess_parseq(img_file.convert('RGB')).unsqueeze(0).to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = parseq(img)  # torch.Size([1, 26, 95]), 94 characters + [EOS] symbol

    # Greedy decoding
    pred = logits.softmax(-1)
    label, confidence = parseq.tokenizer.decode(pred)
    print(confidence)
    print('Decoded label = {}'.format(label[0]))

    # Keep the example input on the same device as the model.
    # NOTE(review): with method="script" the example input is presumably not
    # consulted at all (it matters for method="trace") - confirm against the
    # installed Lightning version.
    dummy_input = torch.rand(batch_size, 3, 32, 128).to(device)

    # The template was previously left unformatted, producing a file literally
    # named "parseq_{}_torchscript.pt".
    output_path = "parseq_{}_torchscript.pt".format(device_type)

    # Export to TorchScript (scripting, not tracing)
    parseq.to_torchscript(file_path=output_path, method="script", example_inputs=dummy_input)

    # Reload the exported module and check it reproduces the eager prediction.
    ts_model = torch.jit.load(output_path)
    with torch.no_grad():
        logits = ts_model(img)  # torch.Size([1, 26, 95]), 94 characters + [EOS] symbol
    print(logits.shape)
    # Greedy decoding
    pred = logits.softmax(-1)
    label, confidence = parseq.tokenizer.decode(pred)
    print('Decoded label = {}'.format(label[0]))

In [2]:
def main_batch(batch_size=1, device_type="cuda", img_folder="/home/ubuntu/parseq/digits_demo", img_type="jpg"):
    """Export PARSeq to TorchScript and compare it with the eager model on a folder of images.

    Steps:
      1. Load the pretrained PARSeq model from torch.hub on the requested device.
      2. Decode one demo image with the eager model and print the result.
      3. Script the model to ``parseq_<device_type>_b<batch_size>_torchscript.pt``
         and reload it with ``torch.jit.load``.
      4. For every file in ``img_folder`` whose name ends with ``img_type``, run
         both models on the single image and compare the decoded labels.
      5. Print ``<number of matching labels> <number of images compared>``.

    Args:
        batch_size: Leading dimension of the random example input handed to
            ``to_torchscript``; also embedded in the output file name. Images in
            the comparison loop are still processed one at a time.
        device_type: ``"cpu"`` or ``"cuda"`` (mapped to ``cuda:0``).
        img_folder: Directory scanned for comparison images.
        img_type: File-name suffix used to select images.

    Raises:
        ValueError: If ``device_type`` is neither ``"cpu"`` nor ``"cuda"``.
    """
    if device_type == "cpu":
        device = torch.device("cpu")
    elif device_type == "cuda":
        device = torch.device("cuda:0")
    else:
        # Previously an unknown value fell through and failed later with an
        # UnboundLocalError on `parseq`.
        raise ValueError(
            "device_type must be 'cpu' or 'cuda', got {!r}".format(device_type)
        )
    parseq = torch.hub.load('baudm/parseq', 'parseq', pretrained=True).eval().to(device)

    test_img_path = "demo_images/art-01107.jpg"
    with Image.open(test_img_path) as img_file:
        # Preprocess. Model expects a batch of images with shape: (B, C, H, W)
        img = preprocess_parseq(img_file.convert('RGB')).unsqueeze(0).to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = parseq(img)  # torch.Size([1, 26, 95]), 94 characters + [EOS] symbol

    # Greedy decoding
    pred = logits.softmax(-1)
    label, confidence = parseq.tokenizer.decode(pred)
    print(confidence)
    print('Decoded label = {}'.format(label[0]))

    # Keep the example input on the same device as the model.
    # NOTE(review): with method="script" the example input is presumably not
    # consulted at all (it matters for method="trace") - confirm against the
    # installed Lightning version.
    dummy_input = torch.rand(batch_size, 3, 32, 128).to(device)

    output_path = "parseq_{}_b{}_torchscript.pt".format(device_type, batch_size)

    # Export to TorchScript (scripting, not tracing)
    parseq.to_torchscript(file_path=output_path, method="script", example_inputs=dummy_input)

    # Reload the exported module and check it on the demo image first.
    ts_model = torch.jit.load(output_path)
    with torch.no_grad():
        logits = ts_model(img)  # torch.Size([1, 26, 95]), 94 characters + [EOS] symbol
    print(logits.shape)
    # Greedy decoding
    pred = logits.softmax(-1)
    label, confidence = parseq.tokenizer.decode(pred)
    print('Decoded label = {}'.format(label[0]))

    matched = 0  # images where eager and TorchScript labels agree
    total = 0    # images compared
    if os.path.isdir(img_folder):
        # Sorted so the comparison order is reproducible across runs.
        for filename in tqdm(sorted(os.listdir(img_folder))):
            if not filename.endswith(img_type):
                continue
            total += 1
            img_path = os.path.join(img_folder, filename)

            with Image.open(img_path) as img_file:
                # Preprocess. Model expects a batch of images with shape: (B, C, H, W)
                img_input = preprocess_parseq(img_file.convert('RGB')).unsqueeze(0).to(device)

            with torch.no_grad():
                logits_ts = ts_model(img_input)
                logits = parseq(img_input)

            label_ts, _ = parseq.tokenizer.decode(logits_ts.softmax(-1))
            label, _ = parseq.tokenizer.decode(logits.softmax(-1))

            if label[0] == label_ts[0]:
                print("matched")
                matched += 1
            else:
                # Show both readings so the disagreement can be inspected.
                print(label[0])
                print(label_ts[0])
    else:
        # Make the otherwise silent "0 0" result explainable.
        print("Image folder not found: {}".format(img_folder))
    print(matched, total)
                
                # reading = "".join([label[0][i] for i in range(len(label[0])) if confidence[0][i] > eps])
                # print(reading)
                # if len(reading) > 0:
                #     outfilename = os.path.join(img_path.replace(img_type, "txt"))
                #     print(outfilename)
                #     outfile = open(outfilename, "w")
                #     outfile.write(reading)
                #     outfile.close()
                # return reading

In [3]:
# Export with the default device ("cuda") and compare eager vs. TorchScript
# predictions on the default image folder.
main_batch(batch_size=16)

Using cache found in /home/ubuntu/.cache/torch/hub/baudm_parseq_main


[tensor([0.9979, 0.9996, 0.9999, 0.9998, 0.9868, 0.9998, 0.9432, 0.9765, 0.9937,
        0.9981], device='cuda:0', grad_fn=<SliceBackward0>)]
Decoded label = CHEWBACCA


  " but it is a non-constant {}. Consider removing it.".format(name, hint))
  return forward_call(*input, **kwargs)


torch.Size([1, 26, 95])
Decoded label = CHEWBACCA


To debug try disable codegen fallback path via setting the env variable `export PYTORCH_NVFUSER_DISABLE=fallback`
 (Triggered internally at  ../torch/csrc/jit/codegen/cuda/manager.cpp:329.)
  return forward_call(*input, **kwargs)
  1%|▏         | 1/73 [00:03<04:09,  3.46s/it]

matched


  8%|▊         | 6/73 [00:08<01:10,  1.05s/it]

matched
matched
matched
matched
matched
matched
matched


 19%|█▉        | 14/73 [00:09<00:18,  3.23it/s]

matched
matched
matched
matched
matched
matched
matched


 30%|███       | 22/73 [00:09<00:07,  7.03it/s]

matched
matched
matched
matched
matched
matched
matched


 41%|████      | 30/73 [00:09<00:03, 12.55it/s]

matched
matched
matched
matched
matched
matched
matched
matched


 47%|████▋     | 34/73 [00:09<00:02, 15.69it/s]

matched
matched
matched
matched
matched
matched
matched


 58%|█████▊    | 42/73 [00:09<00:01, 21.60it/s]

matched
matched
matched
matched
matched
matched
matched


 68%|██████▊   | 50/73 [00:10<00:00, 26.02it/s]

matched
matched
matched
matched
matched
matched
matched


 79%|███████▉  | 58/73 [00:10<00:00, 29.26it/s]

matched
matched
matched
matched
matched
matched
matched


 85%|████████▍ | 62/73 [00:10<00:00, 30.13it/s]

matched
matched
matched
matched
matched
matched
matched


 96%|█████████▌| 70/73 [00:10<00:00, 31.91it/s]

matched
matched
matched
matched
matched
matched
matched


100%|██████████| 73/73 [00:10<00:00,  6.75it/s]

matched
73 73





In [23]:
# Scratch check: floor division of 66 by 8.
66//8

8

In [52]:
# Batched comparison of the eager PARSeq model and its TorchScript export on CPU.
batch_size = 16
show_timings = False  # set to True to print per-batch wall-clock timings

parseq = torch.hub.load('baudm/parseq', 'parseq', pretrained=True).eval()

# NOTE: this file must already exist on disk; it is the name main_batch writes
# when called with device_type="cpu" and the same batch_size.
output_path = "parseq_{}_b{}_torchscript.pt".format("cpu", batch_size)
ts_model = torch.jit.load(output_path)
img_folder = "/home/ubuntu/parseq/digits_demo"

img_paths = [os.path.join(img_folder, x) for x in os.listdir(img_folder) if x.endswith("jpg")]

N = len(img_paths)
# Ceiling division: the former `N//batch_size + 1` produced an extra empty
# batch (and a mis-sized holder tensor) whenever N was a multiple of
# batch_size, and ran the models on an empty tensor when N was 0.
n_batch = (N + batch_size - 1) // batch_size

for i in range(n_batch):
    # Slicing clamps at N, so the last batch is simply shorter.
    img_paths_tmp = img_paths[i * batch_size:(i + 1) * batch_size]
    # Model expects a batch of images with shape: (B, C, H, W)
    input_holder = torch.zeros(len(img_paths_tmp), 3, 32, 128)

    print(input_holder.size())
    for j, img_path in enumerate(img_paths_tmp):
        with Image.open(img_path) as img_file:
            img_input = preprocess_parseq(img_file.convert('RGB'))
        input_holder[j] = img_input

    with torch.no_grad():
        start = time.perf_counter()
        logits_ts = ts_model(input_holder)
        end_1 = time.perf_counter()
        logits = parseq(input_holder)
        end_2 = time.perf_counter()

    if show_timings:
        print("parseq")
        print(end_2 - end_1)
        print("torchscript")
        print(end_1 - start)

    # Greedy decoding of the eager model output.
    pred = logits.softmax(-1)
    label, confidence = parseq.tokenizer.decode(pred)

    # Greedy decoding of the TorchScript model output.
    pred = logits_ts.softmax(-1)
    label_ts, confidence = parseq.tokenizer.decode(pred)

    print(label_ts, label)



Using cache found in /home/ubuntu/.cache/torch/hub/baudm_parseq_main


torch.Size([16, 3, 32, 128])
parseq
0.5632619857788086
torchscript
1.0032198429107666
['448', '398', '464', '422', '464', '597', '3.34', '3.77', '597', "'13N", '498', '647', '314', '4#', '224', '348'] ['448', '398', '464', '422', '464', '597', '3.34', '3.77', '597', "'13N", '498', '647', '314', '4#', '224', '348']
torch.Size([16, 3, 32, 128])
parseq
0.5741300582885742
torchscript
18.885210752487183
['464', '427', '698', '464', '458', '558', '1087', '464', '297', '472', '298', '98', '674', '387', '422', '467'] ['464', '427', '698', '464', '458', '558', '1087', '464', '297', '472', '298', '98', '674', '387', '422', '467']
torch.Size([16, 3, 32, 128])
parseq
0.5603926181793213
torchscript
0.5546107292175293
['538', '538', '547', '397', '474', '497', '997', '497', '224', '417', '417', '538', '497', '494', '497', '$496'] ['538', '538', '547', '397', '474', '497', '997', '497', '224', '417', '417', '538', '497', '494', '497', '$496']
torch.Size([16, 3, 32, 128])
parseq
0.560748815536499
torc

In [46]:
# Number of image paths collected in `img_paths` (requires the cell that builds
# `img_paths` to have been run first).
len(img_paths)

73