1. Read the vLLM documentation
2. Learn how to use it efficiently


In [None]:
import importlib.util
import logging
import os
from typing import Dict

# os.environ['HF_TOKEN'] = ''
# huggingface_hub evaluates HF_HUB_ENABLE_HF_TRANSFER once, when it is first
# imported, so the flag must be exported BEFORE huggingface_hub (or any library
# that imports it) is loaded; setting it afterwards has no effect in this process.
# The flag is only switched on when the optional `hf_transfer` package is
# installed, because huggingface_hub raises at download time if the flag is set
# and the package is missing.
if importlib.util.find_spec("hf_transfer") is not None:
    os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

from huggingface_hub import snapshot_download
from pdf2image import convert_from_path
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Setup logging
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

INFO 03-19 15:58:55 [__init__.py:256] Automatically detected platform cuda.


In [2]:
# Fetch the Qari OCR repository from the Hugging Face Hub and keep the value
# returned by snapshot_download; it is later handed to vLLM as the LoRA path.
qari_lora_path = snapshot_download(
    repo_id="NAMAA-Space/Qari-OCR-0.2.2-Arabic-2B-Instruct",
)

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

In [3]:
from tqdm import tqdm

def convert_pdf_with_qwen_vllm(pdf_path: str, start_page: int = 1, end_page: int = None) -> Dict[int, str]:
    """OCR a page range of a PDF with Qwen2-VL (via vLLM) and the Qari LoRA adapter.

    The requested pages are rasterised with pdf2image, then each page image is
    sent to the model on its own so that a failure on one page does not discard
    the text already extracted from the others.

    Note: a new vLLM engine is created on every call, and the adapter location
    is read from the module-level ``qari_lora_path``.

    Args:
        pdf_path: Path of the PDF file to read.
        start_page: 1-based number of the first page to process. Values below 1
            are treated as 1.
        end_page: 1-based number of the last page to process (inclusive), or
            ``None`` to continue to the end of the document.

    Returns:
        A dict mapping 1-based page numbers to the extracted, stripped text.
        A page whose generation failed maps to ``"Error: <message>"``. An empty
        dict is returned when the PDF cannot be converted, yields no images, or
        the model/adapter cannot be set up.
    """
    # Clamp so the page numbers used as result keys always match the pages that
    # were actually rendered (a start below 1 can only mean "from page 1").
    first_page = max(start_page, 1)

    _log.info(
        "Converting PDF %s to images for pages %s to %s...",
        pdf_path, first_page, end_page or 'end',
    )
    try:
        images = convert_from_path(pdf_path, first_page=first_page, last_page=end_page)
    except Exception as e:
        _log.error(f"Failed to convert PDF to images: {e}")
        return {}

    if not images:
        _log.warning("No images extracted from PDF.")
        return {}

    model_name = "Qwen/Qwen2-VL-2B-Instruct"
    _log.info(f"Loading model {model_name} with vLLM...")
    try:
        # The adapter request is identical for every page: build it once, and
        # before the (slow) engine start-up so a bad adapter path fails fast.
        lora_request = LoRARequest("qari_adapter", 1, qari_lora_path)
        llm = LLM(
            model=model_name,
            enable_lora=True,
            dtype="float16",
            max_model_len=4096,
            # max_num_seqs=1,
            disable_mm_preprocessor_cache=False,
        )
    except Exception as e:
        _log.error(f"Failed to set up {model_name} with vLLM: {e}")
        return {}

    question = "Extract all text from this image"
    # Qwen2-VL chat layout written out by hand; <|image_pad|> marks where the
    # page image is inserted.
    prompt_template = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    sampling_params = SamplingParams(
        temperature=0.0,  # Greedy decoding for consistency
        max_tokens=4096,   # Adjust based on expected text length
        stop_token_ids=None,
    )

    extracted_texts = {}
    pages = tqdm(images, desc="Processing PDF pages", unit="page")
    for page_num, image in enumerate(pages, start=first_page):
        _log.info(f"Processing page {page_num}...")

        inputs = {
            "prompt": prompt_template,
            "multi_modal_data": {
                "image": image  # Pass PIL image directly
            },
        }

        try:
            outputs = llm.generate(
                [inputs],
                sampling_params=sampling_params,
                lora_request=lora_request,
            )
            extracted_texts[page_num] = outputs[0].outputs[0].text.strip()
        except Exception as e:
            # Record the failure under the page's key and carry on with the rest.
            _log.error(f"Error processing page {page_num}: {e}")
            extracted_texts[page_num] = f"Error: {e}"

    return extracted_texts

In [4]:
# Input document and the 1-based, inclusive page range to run OCR on.
pdf_file = "../cold_war_data/cold_war.pdf"  # Replace with your PDF path
start, end = 9, 11

result = convert_pdf_with_qwen_vllm(pdf_file, start_page=start, end_page=end)

# Print results: a header, the page text, then a 50-dash separator per page.
for page_num, text in result.items():
    print(f"Page {page_num} Text:\n{text}\n{'-' * 50}")

INFO:__main__:Converting PDF ../cold_war_data/cold_war.pdf to images for pages 9 to 11...
INFO:__main__:Loading model Qwen/Qwen2-VL-2B-Instruct with vLLM...


INFO 03-19 15:59:05 [config.py:583] This model supports multiple tasks: {'generate', 'classify', 'reward', 'embed', 'score'}. Defaulting to 'generate'.
INFO 03-19 15:59:05 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.0) with config: model='Qwen/Qwen2-VL-2B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2-VL-2B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=Non

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 03-19 15:59:09 [loader.py:429] Loading weights took 1.12 seconds
INFO 03-19 15:59:09 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-19 15:59:10 [model_runner.py:1146] Model loading took 4.1859 GB and 2.230534 seconds


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.




It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


INFO 03-19 15:59:18 [worker.py:267] Memory profiling takes 8.31 seconds
INFO 03-19 15:59:18 [worker.py:267] the current vLLM instance can use total_gpu_memory (23.64GiB) x gpu_memory_utilization (0.90) = 21.27GiB
INFO 03-19 15:59:18 [worker.py:267] model weights take 4.19GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 0.33GiB; the rest of the memory reserved for KV Cache is 16.70GiB.
INFO 03-19 15:59:19 [executor_base.py:111] # cuda blocks: 39096, # CPU blocks: 9362
INFO 03-19 15:59:19 [executor_base.py:116] Maximum concurrency for 4096 tokens per request: 152.72x
INFO 03-19 15:59:21 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:24<00:00,  1.45it/s]

INFO 03-19 15:59:45 [model_runner.py:1570] Graph capturing finished in 24 secs, took 0.41 GiB
INFO 03-19 15:59:45 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 35.72 seconds



Processing PDF pages:   0%|          | 0/3 [00:00<?, ?page/s]INFO:__main__:Processing page 9...
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it, est. speed input: 1655.63 toks/s, output: 11.54 toks/s]
Processing PDF pages:  33%|███▎      | 1/3 [00:04<00:09,  4.57s/page]INFO:__main__:Processing page 10...
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it, est. speed input: 2122.26 toks/s, output: 5.45 toks/s]
Processing PDF pages:  67%|██████▋   | 2/3 [00:06<00:02,  2.76s/page]INFO:__main__:Processing page 11...
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.51s/it, est. speed input: 1806.82 toks/s, output: 19.21 toks/s]
Processing PDF pages: 100%|██████████| 3/3 [00:07<00:00,  2.59s/page]

Page 9 Text:
مقدمة الأعمال الكاملة للكاتب والمترجم طلعت الشايب
--------------------------------------------------
Page 10 Text:
أدب الحرب الباردة
--------------------------------------------------
Page 11 Text:
مقدّمة المحرّر بين الخطاب والرد: (أفكار تمهيدية عن كتابة الحرب الباردة)
--------------------------------------------------



