In [1]:
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)

In [2]:
model = AutoModel.from_pretrained(
    "ucaslcl/GOT-OCR2_0",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map="cuda",
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
)

In [3]:
model = model.eval().cuda()

In [4]:
print(model)

GOTQwenForCausalLM(
  (model): GOTQwenModel(
    (embed_tokens): Embedding(151860, 1024)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1024,), eps=1e-06)
    (rotary_em

In [5]:
s = 0
for param in model.parameters():
    s += param.numel()

In [6]:
s / 1e6  # in millions of parameters

560.52864

In [7]:
s = 0
for param in model.model.vision_tower_high.parameters():
    s += param.numel()

In [8]:
s / 1e6  # in millions of parameters

95.569152

In [10]:
s = 0
for param in model.model.layers.parameters():
    s += param.numel()

In [11]:
s / 1e6  # in millions of parameters

308.404224

In [12]:
s = 0
for param in model.model.mm_projector_vary.parameters():
    s += param.numel()

In [13]:
s / 1e6  # in millions of parameters

1.0496

In [15]:
s = 0
for param in model.lm_head.parameters():
    s += param.numel()

In [16]:
s / 1e6  # in millions of parameters

155.50464

In [None]:
308 + 155

463

In [13]:
tokenizer

QWenTokenizer(name_or_path='ucaslcl/GOT-OCR2_0', vocab_size=151860, model_max_length=8000, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	
}
)

In [None]:
# input your test image
image_file = "xxx.jpg"

# plain texts OCR
res = model.chat(tokenizer, image_file, ocr_type="ocr")

# format texts OCR:
# res = model.chat(tokenizer, image_file, ocr_type='format')

# fine-grained OCR:
# res = model.chat(tokenizer, image_file, ocr_type='ocr', ocr_box='')
# res = model.chat(tokenizer, image_file, ocr_type='format', ocr_box='')
# res = model.chat(tokenizer, image_file, ocr_type='ocr', ocr_color='')
# res = model.chat(tokenizer, image_file, ocr_type='format', ocr_color='')

# multi-crop OCR:
# res = model.chat_crop(tokenizer, image_file, ocr_type='ocr')
# res = model.chat_crop(tokenizer, image_file, ocr_type='format')

# render the formatted OCR results:
# res = model.chat(tokenizer, image_file, ocr_type='format', render=True, save_render_file = './demo.html')

print(res)