<a href="https://colab.research.google.com/github/Yuno2204/OCR/blob/main/OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Model runtime dependencies. Only transformers is pinned (4.44.2); bitsandbytes is unpinned.
!pip install -q transformers==4.44.2 bitsandbytes
# Serving dependencies (Flask, CORS, ngrok client) plus flash_attn; all unpinned.
# NOTE(review): consider pinning these versions so a fresh runtime reproduces the same environment.
!pip install -q flask flask-cors pyngrok flash_attn

In [None]:

import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import requests


# Thư viện xử lý ảnh đầu vào

In [None]:
# Input-image preprocessing helpers (taken from the original HF source)
# Per-channel mean / std values handed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Build the per-tile preprocessing pipeline.

    The returned ``T.Compose`` converts the image to RGB when it is in any
    other mode, resizes it to ``input_size`` x ``input_size`` with bicubic
    interpolation, converts it to a tensor and normalizes it with
    ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    Args:
        input_size: Side length (in pixels) of the square output.

    Returns:
        The composed torchvision transform.
    """
    def _ensure_rgb(img):
        # Leave RGB images untouched; convert everything else.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the (cols, rows) grid whose aspect ratio is closest to ``aspect_ratio``.

    Candidates are scanned in the order given. A strictly closer candidate
    always wins. A candidate that ties the current best replaces it only when
    the source image area (``width * height``) exceeds half the area of that
    candidate's grid (``image_size**2 * cols * rows``).

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Iterable of ``(cols, rows)`` pairs.
        width: Source image width in pixels.
        height: Source image height in pixels.
        image_size: Side length of one square tile.

    Returns:
        The selected pair, or ``(1, 1)`` when ``target_ratios`` is empty.
    """
    chosen = (1, 1)
    smallest_gap = float('inf')
    pixel_count = width * height
    half_tile_area = 0.5 * image_size * image_size

    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
        elif gap == smallest_gap and pixel_count > half_tile_area * cols * rows:
            # Tie: prefer this grid only when the image is large enough for it.
            chosen = candidate

    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Split ``image`` into square tiles laid out on a grid matching its aspect ratio.

    Every ``(cols, rows)`` grid whose tile count lies in ``[min_num, max_num]``
    is a candidate; ``find_closest_aspect_ratio`` selects one. The image is
    resized to ``cols * image_size`` by ``rows * image_size`` and cropped into
    ``image_size`` x ``image_size`` tiles in row-major order.

    Args:
        image: Object exposing ``size``, ``resize`` and ``crop`` (a PIL image
            in this notebook).
        min_num: Minimum number of grid tiles.
        max_num: Maximum number of grid tiles.
        image_size: Side length of each square tile.
        use_thumbnail: When true and more than one tile was produced, append
            the whole image resized to ``image_size`` x ``image_size``.

    Returns:
        List of tiles, followed by the optional thumbnail.
    """
    src_w, src_h = image.size
    src_aspect = src_w / src_h

    # Enumerate candidate grids, then order them by total tile count.
    grids = set()
    for n in range(min_num, max_num + 1):
        for cols in range(1, n + 1):
            for rows in range(1, n + 1):
                if min_num <= cols * rows <= max_num:
                    grids.add((cols, rows))
    grids = sorted(grids, key=lambda g: g[0] * g[1])

    # Grid whose aspect ratio best matches the source image.
    grid = find_closest_aspect_ratio(src_aspect, grids, src_w, src_h, image_size)

    canvas_w = image_size * grid[0]
    canvas_h = image_size * grid[1]
    tile_count = grid[0] * grid[1]

    # Resize to an exact multiple of the tile size, then cut row by row.
    canvas = image.resize((canvas_w, canvas_h))
    tiles_per_row = canvas_w // image_size
    tiles = []
    for idx in range(tile_count):
        row, col = divmod(idx, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(canvas.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == tile_count

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles

def load_image(image_file, input_size=448, max_num=12, timeout=30):
    """Download an image over HTTP(S) and turn it into a stacked tensor of tiles.

    Args:
        image_file: URL of the image; fetched with ``requests.get``.
        input_size: Side length of each square tile, forwarded to
            ``build_transform`` and ``dynamic_preprocess``.
        max_num: Maximum number of grid tiles, forwarded to
            ``dynamic_preprocess`` (which is called with
            ``use_thumbnail=True``, so one extra thumbnail tile may follow).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        ``torch.stack`` of the transformed tiles (tiles along the first dim).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        OSError: When the downloaded bytes cannot be opened as an image.
    """
    from io import BytesIO  # function-scope import keeps this block self-contained

    # The context manager releases the connection; the timeout stops a stalled
    # server from blocking the caller forever.
    with requests.get(image_file, timeout=timeout) as response:
        # Fail loudly on 4xx/5xx instead of handing an error page to PIL.
        response.raise_for_status()
        # `.content` has Content-Encoding (gzip/deflate) decoded; `.raw` does not.
        payload = response.content

    image = Image.open(BytesIO(payload)).convert('RGB')
    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([transform(tile) for tile in tiles])



# Load model và test model trên Colab

In [None]:


# Hugging Face Hub repository id of the checkpoint to load.
model_name = "5CD-AI/Vintern-1B-v2"

# Load the weights in bfloat16. trust_remote_code=True runs the modeling code
# shipped inside the model repository.
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
# Switch to eval mode and move the model to the GPU.
model = model.eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=True)

# Generation settings passed to model.chat: beam search without sampling.
generation_config = {
    "max_new_tokens": 512,
    "do_sample": False,
    "num_beams": 3,
    "repetition_penalty": 3.5,
}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

configuration_internvl_chat.py: 0.00B [00:00, ?B/s]

configuration_intern_vit.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/5CD-AI/Vintern-1B-v2:
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/5CD-AI/Vintern-1B-v2:
- configuration_internvl_chat.py
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_internvl_chat.py: 0.00B [00:00, ?B/s]

conversation.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/5CD-AI/Vintern-1B-v2:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_intern_vit.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/5CD-AI/Vintern-1B-v2:
- modeling_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/5CD-AI/Vintern-1B-v2:
- modeling_internvl_chat.py
- conversation.py
- modeling_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/498 [00:00<?, ?B/s]



# Test model




In [None]:
# Sample image URL used to try the model out.
test_image = 'https://media-cdn-v2.laodong.vn/Storage/NewsPortal/2022/12/7/1124909/Karaoke-2.jpg'

# Download and tile the image (max_num=6), cast to bfloat16 and move it to the GPU.
pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()

# Vietnamese prompt: asks the model to read the invoice in the image and return only
# the list of line items as JSON (item name, quantity, unit price, line total).
prompt = '''<image>\nNhận diện hoá đơn trong ảnh. Chỉ trả về phần liệt kê các mặt hàng hàng dưới dạng JSON:
{
  "Tên món": "Tên món",
  "Số lượng": "Số lượng",
  "Đơn giá": "Đơn giá",
  "Thành tiền": "Thành tiền"
}
'''
response = model.chat(tokenizer, pixel_values, prompt, generation_config)

# Drop the reference to the input tensor, then show the response as the cell output.
del pixel_values
response

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


"{'Tên món': ['Giờ VIP222', 'Suối', 'Hoa quả thập cẩm', 'Hoa quả Bưởi', 'Hoa Quả Roi', 'Ken ngoại'], 'Số lượng': ['1', '3', '1', '2', '1', '14'], 'Đơn giá': ['500 000', '12 000', '140 000', '220 000', '100 000', '60 000'], 'Thành tiền': ['950 000', '36 000', '140 000', '440 000', '100 000', '840 000']}"

# Triển khai Flask và Expose ra API qua Ngrok

In [None]:
# Setup Ngrok Token
from google.colab import userdata
from flask import Flask, jsonify, request
from flask_cors import CORS
from pyngrok import ngrok

# Read the ngrok auth token from the Colab secret named "ngrok_token"
# (so it is not hardcoded in the notebook) and register it with pyngrok.
authtoken = userdata.get("ngrok_token")
ngrok.set_auth_token(authtoken)



In [None]:
# Flask code that exposes the model as an HTTP API

# Initialize Flask app
app = Flask(__name__)
# Enable CORS on the app with flask_cors defaults.
CORS(app)


# Prompt passed to ocr_by_llm by the /ocr route.
# NOTE(review): this rebinds the module-level name `prompt`, which the invoice test cell also assigns.
# Vietnamese text: "This is an image of a Job Description (JD). Extract the following
# key information: ViTriTuyenDung, MoTaCongViec, YeuCau (list), QuyenLoi. Return only
# valid JSON, with no explanation."
prompt = '''<image>
Đây là hình ảnh Job Description (JD).

Hãy trích xuất các thông tin quan trọng sau:
- ViTriTuyenDung
- MoTaCongViec
- YeuCau (danh sách)
- QuyenLoi
Chỉ trả về kết quả dưới dạng JSON hợp lệ, không giải thích.
'''

@app.route('/ocr', methods=['POST'])
def index():
    """Handle ``POST /ocr``: run the module-level ``prompt`` on the image at ``image_url``.

    Expects a JSON object body of the form ``{"image_url": "<http(s) URL>"}``.

    Returns:
        * 200 ``{"response_message": ...}`` on success (same shape as before).
        * 400 ``{"error": ...}`` when the body is not a JSON object or
          ``image_url`` is missing / not an http(s) URL.
        * 502 ``{"error": ...}`` when the image download fails.
        * 422 ``{"error": ...}`` when the downloaded data cannot be read as an image.
    """
    from urllib.parse import urlparse  # function-scope import keeps this block self-contained

    # silent=True yields None for a missing/invalid JSON body instead of raising,
    # so a malformed request gets an explicit 400 rather than an AttributeError/500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    image_url = data.get('image_url')
    try:
        is_http_url = (
            isinstance(image_url, str)
            and urlparse(image_url).scheme in ('http', 'https')
        )
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
        is_http_url = False
    if not is_http_url:
        return jsonify({"error": "'image_url' must be an http(s) URL."}), 400

    # NOTE(security): the server fetches a client-supplied URL (SSRF surface).
    # Only the scheme is checked here; restrict allowed hosts before exposing
    # this endpoint beyond a demo.
    try:
        response_message = ocr_by_llm(image_url, prompt)
    except requests.RequestException as exc:
        # Must come before OSError: requests' exceptions derive from it.
        app.logger.warning("Image download failed for %r: %s", image_url, exc)
        return jsonify({"error": "Could not download the image."}), 502
    except OSError as exc:
        # PIL raises an OSError subclass when the bytes are not a readable image.
        app.logger.warning("Image decode failed for %r: %s", image_url, exc)
        return jsonify({"error": "The URL did not return a readable image."}), 422

    return jsonify({
        "response_message": response_message
    })


def ocr_by_llm(image_url, prompt):
    """Run the vision-language model on the image at ``image_url`` with ``prompt``.

    The image is loaded through ``load_image`` (``max_num=6``), cast to
    bfloat16, moved to the GPU and passed to ``model.chat`` together with the
    module-level ``tokenizer`` and ``generation_config``.

    Args:
        image_url: URL of the image to process.
        prompt: Prompt text handed to ``model.chat``.

    Returns:
        The value returned by ``model.chat``; it is also printed.
    """
    tiles = load_image(image_url, max_num=6)
    pixel_values = tiles.to(torch.bfloat16).cuda()

    response_message = model.chat(tokenizer, pixel_values, prompt, generation_config)

    # Drop the local reference to the GPU tensor before returning.
    del pixel_values

    print(response_message)
    return response_message


if __name__ == '__main__':
    # Open a public ngrok tunnel to the local port Flask serves on below
    # (the two port numbers must match) and print it so the URL is visible.
    ngrok_url = ngrok.connect(5555)
    print(ngrok_url)

    try:
        # Blocks until the server stops (e.g. the cell is interrupted).
        app.run(port=5555)
    finally:
        # Close the tunnel so re-running this cell does not accumulate open tunnels.
        ngrok.disconnect(ngrok_url.public_url)

NgrokTunnel: "https://nonconvertibly-ungeographic-ranee.ngrok-free.dev" -> "http://localhost:5555"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5555
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [16/Jan/2026 12:56:51] "[33mGET / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [16/Jan/2026 12:56:51] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [16/Jan/2026 12:57:59] "[33mPOST / HTTP/1.1[0m" 404 -
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
INFO:werkzeug:127.0.0.1 - - [16/Jan/2026 13:00:54] "POST /ocr HTTP/1.1" 200 -


{
  "Vị trí cần tuyển": "Nhân viên kinh doanh",
  "Số lượng": "02 người",
  "Hồ sơ gồm": "CV ứng viên, Bằng cấp CC liên quan, Hộ khẩu, CMT, SYLL",
  "Mô tả công việc": "Thực hiện các Kinh doanh Thiết bị mạng, camera (sản phẩm CNTT). Xây dựng kế hoạch kinh doanh đảm bảo thực hiện hoàn thành theo chỉ tiêu kinh doanh theo cam kết. Chăm sóc khách hàng, đối tác và tham gia các sự kiện nếu có liên quan sản phẩm. Thực hiện các công việc liên quan nếu có trong quá trình triển khai nhiệm vụ theo phân công sản phẩm. Tuân thủ các Quy định hiện hành của Chính phủ và của Đơn vị.",
  "Yêu cầu": "- Nam/nữ Đại học trở lên chuyên ngành Kinh tế, quản trị, CNTT - Tốt nghiệp Đại học trở lên chuyên ngành Kinh tế, quản trị, CNTT - Có kinh nghiệm 02 năm trở lên trong các vị trí kinh doanh lĩnh vực CNTT - Giao tiếp tốt, nhanh nhẹn, cẩn trọng, trung thực.",
  "Quyền lợi": "- Mức Lương: Thỏa thuận theo kinh nhiệm - Hướng các chính sách theo quy định Luật lao động hiện hành, BHXH/ - Hướng thưởng Kinh doanh, Lươn

