In [1]:
"""
Application to provide benchmark timers for code. 
Usage: 
# from my_timer_class import MyTimer
from my_timer_func import my_timer
import time

@MyTimer3(name="decorator")
@my_timer
"""

import functools
import time

def my_timer(orig_func):
    import time
    @functools.wraps(orig_func)
    def wrapper_timer(*args, **kwargs):
        tic = time.perf_counter()
        value = orig_func(*args, **kwargs)
        toc = time.perf_counter()
        elapsed_time = toc - tic
        print(f"Elapsed time to run {orig_func.__name__}: {elapsed_time:0.4f} seconds")
        return value
    return wrapper_timer


class MyTimer():
    # usage:
    #
    # from MyTimer import MyTimer
    # with MyTimer():
    #    func(x,y)

    def __init__(self):
        self.start = time.time()
        self.start_p = time.perf_counter()

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        end = time.time()
        end_p = time.perf_counter()
        runtime = end - self.start
        runtime_p = end_p - self.start_p
        msg = 'The function took {time} seconds to complete'
        print(msg.format(time=runtime))
        msg_p = 'The function took {time} perf seconds to complete'
        print(msg_p.format(time=runtime_p))

In [4]:
r"""
conda env remove --name trOCR
conda env create --name trOCR --file environment.yml

cache folder
C:\Users\techexpert\.cache\huggingface\hub

nvidia-smi for GPU info

cd scripts
python trOCR.py
"""

from PIL import Image 
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel 
import os
import requests


@my_timer
# https://huggingface.co/microsoft/trocr-base-handwritten
def run_trOCR(model_name="microsoft/trocr-base-handwritten", images=""):
    """
    There are 3 main models to choose from, small, base and large. 
    Some other fine-tuned models: IAM Handwritten, SROIE Receipts
    """
    processor = TrOCRProcessor.from_pretrained(model_name, use_fast = True)
    model = VisionEncoderDecoderModel.from_pretrained(model_name)
    print(model)

    # Check for GPU availability
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"running on {device}")
    model.to(device)  # Move model to GPU
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)
    generated_ids = model.generate(pixel_values, max_new_tokens=1000)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(generated_text)


In [16]:
def get_n_parameters(model_id = "microsoft/trocr-base-handwritten"):
    processor  = TrOCRProcessor.from_pretrained(model_id, use_fast = True)
    model = VisionEncoderDecoderModel.from_pretrained(model_id)
    encoder_params = 0 
    decoder_params = 0 
    all_params = 0 
    for name, params in model.named_parameters():
        numParam = params.numel()
        all_params += numParam 
        if 'encoder' in name: 
            encoder_params += numParam
        elif 'decoder' in name:
            decoder_params += numParam
    print(f"Number of parameters: {all_params/1000000}M")
    print(f"Number of encoder's parameters: {encoder_params/1000000}M")
    print(f"Number of decoder's parameters: {decoder_params/1000000}M")

In [17]:
get_n_parameters("microsoft/trocr-large-handwritten")

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-large-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of parameters: 558.226432M
Number of encoder's parameters: 355.072M
Number of decoder's parameters: 203.154432M


In [19]:
model_id = "microsoft/trocr-base-handwritten" # indus tre, This is a sample of text

link_image = "datasets/text_recognition_mcocr_data/text_recognition_mcocr_data/mcocr_public_145014qrfai_0.jpg" # 
image = Image.open(link_image).convert("RGB")
run_trOCR(model_id, image)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (i

## Image to latex

In [21]:
!pip install optimum

Collecting optimum
  Downloading optimum-2.0.0-py3-none-any.whl.metadata (14 kB)
Downloading optimum-2.0.0-py3-none-any.whl (162 kB)
Installing collected packages: optimum
Successfully installed optimum-2.0.0
[0m

In [26]:
!pip install optimum[onnxruntime]

Collecting optimum-onnx[onnxruntime] (from optimum[onnxruntime])
  Downloading optimum_onnx-0.0.1-py3-none-any.whl.metadata (4.7 kB)
Collecting onnxruntime>=1.18.0 (from optimum-onnx[onnxruntime]; extra == "onnxruntime"->optimum[onnxruntime])
  Downloading onnxruntime-1.23.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting coloredlogs (from onnxruntime>=1.18.0->optimum-onnx[onnxruntime]; extra == "onnxruntime"->optimum[onnxruntime])
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting flatbuffers (from onnxruntime>=1.18.0->optimum-onnx[onnxruntime]; extra == "onnxruntime"->optimum[onnxruntime])
  Downloading flatbuffers-25.9.23-py2.py3-none-any.whl.metadata (875 bytes)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime>=1.18.0->optimum-onnx[onnxruntime]; extra == "onnxruntime"->optimum[onnxruntime])
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading optimum_onnx-0.0.1-py3-non

In [27]:
!pip show optimum
from optimum.onnxruntime import ORTModelForVision2Seq

Name: optimum
Version: 2.0.0
Summary: Optimum Library is an extension of the Hugging Face Transformers library, providing a framework to integrate third-party libraries from Hardware Partners and interface with their specific functionality.
Home-page: https://github.com/huggingface/optimum
Author: HuggingFace Inc. Special Ops Team
Author-email: hardware@huggingface.co
License: Apache
Location: /usr/local/lib/python3.12/dist-packages
Requires: huggingface_hub, numpy, packaging, torch, transformers
Required-by: optimum-onnx


Multiple distributions found for package optimum. Picked distribution: optimum-onnx


In [32]:
#! pip install transformers>=4.37.0 pillow optimum[onnxruntime]
import requests
from io import BytesIO

from PIL import Image
from transformers import TrOCRProcessor
from optimum.onnxruntime import ORTModelForVision2Seq

processor = TrOCRProcessor.from_pretrained('breezedeus/pix2text-mfr')
model = ORTModelForVision2Seq.from_pretrained('breezedeus/pix2text-mfr', use_cache=False)

def download_img(url):
    response = requests.get(url)
    image_file = BytesIO(response.content)
    return Image.open(image_file).convert('RGB')


image_fps = [
    'https://raw.githubusercontent.com/breezedeus/Pix2Text/main/docs/examples/formula.jpg',
    'https://raw.githubusercontent.com/breezedeus/Pix2Text/main/docs/examples/math-formula-42.png',
]
images = [download_img(fp) for fp in image_fps]
pixel_values = processor(images=images, return_tensors="pt").pixel_values
# print(f'pixel_values', pixel_values)
generated_ids = model.generate(pixel_values)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(f'generated_ids: {generated_ids}, \ngenerated text: {generated_text}')


Could not find any ONNX files with standard file name decoder_model_merged.onnx, files found: [PosixPath('decoder_model.onnx'), PosixPath('encoder_model.onnx')]. Please make sure to pass a `file_name` and/or `subfolder` argument to `from_pretrained` when loading an ONNX file with non-standard file names.


generated_ids: tensor([[  2,  95, 263, 353, 380, 261, 264, 262, 263, 346, 262, 313, 338, 313,
         323, 281, 296, 307, 261, 261, 270, 263, 357, 264, 262, 293, 270, 268,
         261, 265, 262, 359, 261, 263, 357, 264, 262, 372, 270, 268, 261, 265,
         262, 429, 261, 262, 263, 353, 386, 261, 264, 262, 372, 261, 265, 262,
         263, 346, 262, 267, 313, 338, 313, 323, 281, 296, 307, 266, 261, 261,
         263, 303,  12, 263, 303, 596, 263, 415, 262, 379, 261, 264, 262, 293,
         271, 372, 261, 272, 282, 264, 262, 293, 271, 372, 261, 263, 304, 596,
         265, 262, 269, 261, 263, 304,  13,   2],
        [  2,  64, 705, 264, 262, 282, 263, 512, 263, 277, 262, 268, 261, 262,
         333, 261, 261, 263, 277, 262, 268, 272, 333, 265, 262, 282, 272, 263,
         277, 262, 268, 261, 262, 333, 261, 261, 261, 262, 268, 272, 333, 282,
         261,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

Code latex:
$$
{ \cal L } _ { \mathrm { e y e l i d \, } } = \sum _ { t = 1 } ^ { T } \sum _ { v = 1 } ^ { V } { \cal M } _ { v } ^ { \mathrm { ( e y e l i d \, ) } } \left( \left\| \hat { h } _ { t, v } - x _ { t, v } \right\| ^ { 2 } \right)
$$

$$
\lim _ { x \rightarrow \frac { 1 } { 4 } } \frac { 1 - 4 ^ { x - \frac { 1 } { 4 } } } { 1 - 4 x }
$$

In [33]:
print(model.config)

VisionEncoderDecoderConfig {
  "architectures": [
    "VisionEncoderDecoderModel"
  ],
  "decoder": {
    "activation_dropout": 0.0,
    "activation_function": "relu",
    "add_cross_attention": true,
    "attention_dropout": 0.0,
    "classifier_dropout": 0.0,
    "cross_attention_hidden_size": 384,
    "d_model": 256,
    "decoder_attention_heads": 8,
    "decoder_ffn_dim": 1024,
    "decoder_layerdrop": 0.0,
    "decoder_layers": 6,
    "dropout": 0.1,
    "init_std": 0.02,
    "is_decoder": true,
    "layernorm_embedding": true,
    "max_position_embeddings": 512,
    "model_type": "trocr",
    "scale_embedding": true,
    "tie_word_embeddings": false,
    "use_cache": false,
    "use_learned_position_embeddings": true,
    "vocab_size": 1200
  },
  "decoder_start_token_id": 2,
  "encoder": {
    "attention_probs_dropout_prob": 0.0,
    "encoder_stride": 16,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.0,
    "hidden_size": 384,
    "image_size": 384,
    "initializer_ran

In [38]:
print(model.encoder.config)

VisionEncoderDecoderConfig {
  "architectures": [
    "VisionEncoderDecoderModel"
  ],
  "decoder": {
    "activation_dropout": 0.0,
    "activation_function": "relu",
    "add_cross_attention": true,
    "attention_dropout": 0.0,
    "classifier_dropout": 0.0,
    "cross_attention_hidden_size": 384,
    "d_model": 256,
    "decoder_attention_heads": 8,
    "decoder_ffn_dim": 1024,
    "decoder_layerdrop": 0.0,
    "decoder_layers": 6,
    "dropout": 0.1,
    "init_std": 0.02,
    "is_decoder": true,
    "layernorm_embedding": true,
    "max_position_embeddings": 512,
    "model_type": "trocr",
    "scale_embedding": true,
    "tie_word_embeddings": false,
    "use_cache": false,
    "use_learned_position_embeddings": true,
    "vocab_size": 1200
  },
  "decoder_start_token_id": 2,
  "encoder": {
    "attention_probs_dropout_prob": 0.0,
    "encoder_stride": 16,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.0,
    "hidden_size": 384,
    "image_size": 384,
    "initializer_ran

In [40]:
!pip install onnx2pytorch

Collecting onnx2pytorch
  Downloading onnx2pytorch-0.5.1-py3-none-any.whl.metadata (3.8 kB)
Downloading onnx2pytorch-0.5.1-py3-none-any.whl (46 kB)
Installing collected packages: onnx2pytorch
Successfully installed onnx2pytorch-0.5.1
[0m

In [41]:
import onnx
from onnx2pytorch import ConvertModel

pytorch_model = ConvertModel(model)

print(pytorch_model)


AttributeError: 'ORTModelForVision2Seq' object has no attribute 'graph'

In [44]:
print(model.model_save_dir)


/root/.cache/huggingface/hub/models--breezedeus--pix2text-mfr/snapshots/bea257edb2653f2ae413b084f2ac0e8299d08df0


In [50]:
!ls -alh /root/.cache/huggingface/hub/models--breezedeus--pix2text-mfr/snapshots/bea257edb2653f2ae413b084f2ac0e8299d08df0

total 16K
drwxr-xr-x 2 root root 4.0K Oct 15 06:51 .
drwxr-xr-x 3 root root 4.0K Oct 15 06:50 ..
lrwxrwxrwx 1 root root   52 Oct 15 06:50 config.json -> ../../blobs/c6f828ccd5f3e8781dc7c7a715bc4b8f80ff41bc
lrwxrwxrwx 1 root root   76 Oct 15 06:51 decoder_model.onnx -> ../../blobs/fd0f92d7a012f3dae41e1ac79421aea0ea888b5a66cb3f9a004e424f82f3daed
lrwxrwxrwx 1 root root   76 Oct 15 06:51 encoder_model.onnx -> ../../blobs/bd8d5c322792e9ec45793af5569e9748f82a3d728a9e00213dbfc56c1486f37d
lrwxrwxrwx 1 root root   52 Oct 15 06:51 generation_config.json -> ../../blobs/a3d09b3add4319b3c2d0ca15011f3618109df47b
lrwxrwxrwx 1 root root   52 Oct 15 06:50 preprocessor_config.json -> ../../blobs/c2bbec3a0dbefdd3ecce8a82458664790ce39b20
lrwxrwxrwx 1 root root   52 Oct 15 06:50 special_tokens_map.json -> ../../blobs/b1879d702821e753ffe4245048eee415d54a9385
lrwxrwxrwx 1 root root   52 Oct 15 06:50 tokenizer.json -> ../../blobs/c07aa39397f33b7822ef84e435e911a70a4ce303
lrwxrwxrwx 1 root root   52 Oct 15 06:5

In [51]:
import onnx
from onnx2pytorch import ConvertModel

onnx_path = "/root/.cache/huggingface/hub/models--breezedeus--pix2text-mfr/snapshots/bea257edb2653f2ae413b084f2ac0e8299d08df0/encoder_model.onnx"
onnx_model = onnx.load(onnx_path)  # Đây là đối tượng ModelProto
pytorch_model = ConvertModel(onnx_model)

print(pytorch_model)


  layer.weight.data = torch.from_numpy(numpy_helper.to_array(weight))


ConvertModel(
  (Conv_/embeddings/patch_embeddings/projection/Conv_output_0): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
  (Shape_/embeddings/patch_embeddings/Shape_output_0): Shape()
  (Constant_/embeddings/patch_embeddings/Constant_output_0): Constant(constant=tensor([0]))
  (Constant_/embeddings/patch_embeddings/Constant_1_output_0): Constant(constant=tensor([0]))
  (Constant_/embeddings/patch_embeddings/Constant_2_output_0): Constant(constant=tensor([2]))
  (Slice_/embeddings/patch_embeddings/Slice_output_0): Slice()
  (Constant_/embeddings/patch_embeddings/Constant_3_output_0): Constant(constant=tensor([-1]))
  (Reshape_/embeddings/patch_embeddings/Reshape_output_0): Reshape(shape=None)
  (Transpose_/embeddings/patch_embeddings/Transpose_output_0): Transpose()
  (Shape_/embeddings/Shape_output_0): Shape()
  (Constant_/embeddings/Constant_output_0): Constant(constant=0)
  (Gather_/embeddings/Gather_output_0): Gather()
  (Unsqueeze_/embeddings/Unsqueeze_output_0): Unsquee

In [57]:
i = 0
for name, param in pytorch_model.named_parameters():
    # if i > 1: 
    #     break
    print(name, param.shape)
    # print(param.data)
    # i += 1

Conv_/embeddings/patch_embeddings/projection/Conv_output_0.weight torch.Size([384, 3, 16, 16])
Conv_/embeddings/patch_embeddings/projection/Conv_output_0.bias torch.Size([384])
MatMul_/encoder/layer.0/attention/attention/query/Add_output_0.weight torch.Size([384, 384])
MatMul_/encoder/layer.0/attention/attention/query/Add_output_0.bias torch.Size([384])
MatMul_/encoder/layer.0/attention/attention/key/Add_output_0.weight torch.Size([384, 384])
MatMul_/encoder/layer.0/attention/attention/key/Add_output_0.bias torch.Size([384])
MatMul_/encoder/layer.0/attention/attention/value/Add_output_0.weight torch.Size([384, 384])
MatMul_/encoder/layer.0/attention/attention/value/Add_output_0.bias torch.Size([384])
MatMul_/encoder/layer.0/attention/output/dense/Add_output_0.weight torch.Size([384, 384])
MatMul_/encoder/layer.0/attention/output/dense/Add_output_0.bias torch.Size([384])
MatMul_/encoder/layer.0/intermediate/dense/Add_output_0.weight torch.Size([1536, 384])
MatMul_/encoder/layer.0/interm