In [1]:
!pip install  -U git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git datasets bitsandbytes peft qwen-vl-utils wandb accelerate
# Tested with transformers==4.47.0.dev0, trl==0.12.0.dev0, datasets==3.0.2, bitsandbytes==0.44.1, peft==0.13.2, qwen-vl-utils==0.0.8, wandb==0.18.5, accelerate==1.0.1

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /scratch.shared/1/afremund/job_10665424.pbs-m1/pip-req-build-_qalfyub
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /scratch.shared/1/afremund/job_10665424.pbs-m1/pip-req-build-_qalfyub
  Resolved https://github.com/huggingface/transformers.git to commit a847d4aa6bd2279f5be235dc0fd862f58f7403d1
  Installing build dependencies ... done
[?25h  Getting requirements to build wheel ... [?25done
[?25h  Preparing metadata (pyproject.toml) ... [?2done
[?25hCollecting git+https://github.com/huggingface/trl.git
  Cloning https://github.com/huggingface/trl.git to /scratch.shared/1/afremund/job_10665424.pbs-m1/pip-req-build-hddfk48e
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl.git /scratch.shar

In [4]:
import json

# Read the UTF-8 encoded annotation export into `data`.
annotations_path = 'NER_annotations_with_texts_2_TestTrainVal.json'
with open(annotations_path, mode='r', encoding='utf-8') as annotations_file:
    data = json.load(annotations_file)

In [5]:
# Gather every NER entity label from annotations that carry a "TestTrainVal"
# attribute (annotations without it are ignored), then show the distinct labels.
unique_ner = [
    ent["entity"]
    for record in data['annotations']
    if "TestTrainVal" in record.get("attributes", {})
    for ent in record.get("attributes", {}).get("ner_entities", [])
]
print(set(unique_ner))

{'ORG', 'LOCATION', 'PERSON', 'DATE'}


In [6]:
import cv2
import numpy as np
from PIL import Image
from io import BytesIO
def get_image(image_path, bbox):
    """Crop a bounding box out of an image file and return it as a PNG-backed PIL image.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.
        bbox: Iterable of four numbers ``(x, y, width, height)``. Each value is
            truncated with ``int()`` before use.

    Returns:
        A PIL image re-opened from an in-memory PNG encoding of the RGB crop.

    Raises:
        FileNotFoundError: If OpenCV could not read ``image_path``
            (``cv2.imread`` returned ``None``).
        ValueError: If the box, once clamped to the image, contains no pixels.
    """
    # cv2.imread signals failure by returning None instead of raising; fail here
    # with the offending path rather than with an opaque cvtColor error below.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path!r}")

    # Convert BGR (OpenCV default) to RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Convert bounding box coordinates to integers
    x, y, width, height = map(int, bbox)

    # Clamp the box to the image. A negative origin would otherwise be treated
    # by numpy as an index counted from the far edge and yield a wrong/empty crop.
    img_height, img_width = image.shape[:2]
    x0, y0 = max(x, 0), max(y, 0)
    x1, y1 = min(x + width, img_width), min(y + height, img_height)
    if x1 <= x0 or y1 <= y0:
        raise ValueError(
            f"Empty crop for bbox {list(bbox)!r} in image {image_path!r} "
            f"of size {img_width}x{img_height}"
        )

    # Crop the image
    cropped_image = image[y0:y1, x0:x1]

    # Convert cropped image to PIL Image
    pil_image = Image.fromarray(cropped_image)

    # Save to a BytesIO buffer as PNG and reload as PngImageFile
    buffer = BytesIO()
    pil_image.save(buffer, format="PNG")
    buffer.seek(0)
    png_image = Image.open(buffer)

    return png_image

In [None]:
import os

from tqdm import tqdm

# Directory that holds the source images; file names from the annotation file
# are resolved relative to it.
pathIMG = "path_to_images"

# Column-oriented accumulator: one entry per kept annotation in each list.
dataset = {"image": [], "label": [], "split": []}
for annotation in tqdm(data["annotations"], desc='Preparing Dataset'):
    attributes = annotation.get("attributes", {})

    # Only annotations assigned to a train/test/val partition are used.
    if "TestTrainVal" not in attributes:
        continue

    # NOTE(review): assumes image ids are 1-based and data["images"] is ordered
    # by id, so that (image_id - 1) is the list position — confirm.
    idx_actual_image = annotation["image_id"] - 1
    file_name = data["images"][idx_actual_image]["file_name"]
    # Join with a real path separator (plain string concatenation silently
    # produced "path_to_images<file_name>" when pathIMG had no trailing slash).
    # Leading separators are stripped so os.path.join cannot discard pathIMG.
    image_path = os.path.join(pathIMG, file_name.lstrip("/\\"))
    bbox = annotation["bbox"]
    cropped_image = get_image(image_path, bbox)

    # Target for the model: the transcription stored under the "Text" attribute.
    labels = {
                'qwen_ocr_text': attributes.get("Text", '')
             }

    dataset_key = attributes["TestTrainVal"]
    dataset["image"].append(cropped_image)
    dataset["label"].append(labels)
    dataset["split"].append(dataset_key)  # Stores "train", "test", or "val"

Preparing Dataset: 100%|██████████| 89788/89788 [22:56<00:00, 65.24it/s]  


In [8]:
from datasets import Dataset
# Build a Hugging Face Dataset from the column-oriented `dataset` dict
# (columns: "image", "label", "split").
hf_dataset = Dataset.from_dict(dataset)

In [9]:
# Carve the three partitions out of hf_dataset using the "split" column, and
# drop that column immediately since it is constant within each partition.
train_dataset = hf_dataset.filter(lambda row: row["split"] == "train").remove_columns("split")
test_dataset = hf_dataset.filter(lambda row: row["split"] == "test").remove_columns("split")
val_dataset = hf_dataset.filter(lambda row: row["split"] == "val").remove_columns("split")

Filter:   0%|          | 0/11030 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11030 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11030 [00:00<?, ? examples/s]

In [10]:
# Collect the (width, height) of every image in hf_dataset.
# range(len(hf_dataset)) includes the final record; the previous upper bound of
# len(hf_dataset) - 1 left the last image out of the size statistics.
sizes = [hf_dataset[i]['image'].size for i in range(len(hf_dataset))]

In [11]:
import numpy as np
from scipy.stats import zscore

# Extract widths and heights
widths = np.array([size[0] for size in sizes])
heights = np.array([size[1] for size in sizes])

### 1. DETECT OUTLIERS

# PERCENTILE METHOD
# An image is an outlier when its width OR its height exceeds the 99.5th
# percentile of that dimension, i.e. roughly the largest 0.5% per dimension.
percentile_threshold = 99.5
w_percentile = np.percentile(widths, percentile_threshold)
h_percentile = np.percentile(heights, percentile_threshold)
percentile_outliers = {(w, h) for w, h in zip(widths, heights) if w > w_percentile or h > h_percentile}

# Combine all outliers
all_outliers = percentile_outliers

### 2. REMOVE OUTLIERS FROM hf_dataset
# Index each record exactly once: the original comprehension indexed
# hf_dataset[i] twice per record (once for the test, once for the value).
filtered_dataset = []
for i in range(len(hf_dataset)):
    record = hf_dataset[i]
    if record['image'].size not in all_outliers:
        filtered_dataset.append(record)

### 3. PRINT RESULTS
print(f"Original dataset size: {len(hf_dataset)}")
print(f"Filtered dataset size: {len(filtered_dataset)}")
print(f"Removed {len(hf_dataset) - len(filtered_dataset)} outliers.")

Original dataset size: 11030
Filtered dataset size: 10921
Removed 109 outliers.


In [12]:
# Pivot the list of per-record dicts back into one list per column, which is
# the layout Dataset.from_dict expects.
tmp_dataset = {
    column: [record[column] for record in filtered_dataset]
    for column in ('image', 'label', 'split')
}

In [13]:
from datasets import Dataset
# Rebuild a Hugging Face Dataset from the outlier-free columns in `tmp_dataset`.
# Note: this rebinds `filtered_dataset`, which until now was a plain list of records.
filtered_dataset = Dataset.from_dict(tmp_dataset)

In [14]:
# Re-create the three partitions, this time from the outlier-free dataset, and
# drop the "split" column from each since it is constant within a partition.
train_dataset = filtered_dataset.filter(lambda row: row["split"] == "train").remove_columns("split")
test_dataset = filtered_dataset.filter(lambda row: row["split"] == "test").remove_columns("split")
val_dataset = filtered_dataset.filter(lambda row: row["split"] == "val").remove_columns("split")

Filter:   0%|          | 0/10921 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10921 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10921 [00:00<?, ? examples/s]

In [15]:
# System prompt placed in the "system" turn of every conversation built by format_data.
system_message = """You are a Vision-Language Model specialized in OCR for multilingual images, primarily containing typewritten text in Ukrainian, Russian, and other languages.  

Your task is to:  
1. **Extract all text** from the provided image with high accuracy.  

Respond concisely in a structured format, providing:  
- The full extracted text  

Focus on precision, ensuring the output reflects the text in the image. Avoid unnecessary explanations or interpretations beyond the detected content."""

# User instruction sent alongside the image; it asks for JSON with the single
# key "qwen_ocr_text", which is the key used for the training labels.
user_prompt = """Perform OCR on the provided image containing typewritten multilingual text (primarily Ukrainian, Russian, and other languages).  
Extract and transcribe all text from the entire scanned document accurately.  

Return the output in a structured JSON format with the following key:  
- `"qwen_ocr_text"`: The complete transcribed text from the document  

Ensure high accuracy in text extraction. Do not include unnecessary explanations in the response.
"""  

def format_data(sample):
    """Turn one dataset record into a three-turn chat conversation.

    Args:
        sample: Mapping with an "image" entry (placed in the user turn) and a
            "label" entry (serialised to JSON for the assistant turn).

    Returns:
        A list of three message dicts, in order: the system turn carrying
        ``system_message``, the user turn carrying the image followed by
        ``user_prompt``, and the assistant turn carrying the label as
        pretty-printed JSON (non-ASCII characters kept as-is).
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }

    image_part = {"type": "image", "image": sample["image"]}
    prompt_part = {"type": "text", "text": user_prompt}
    user_turn = {"role": "user", "content": [image_part, prompt_part]}

    # ensure_ascii=False keeps Cyrillic text readable instead of \u-escaped.
    answer_text = json.dumps(sample['label'], indent=2, ensure_ascii=False)
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": answer_text}],
    }

    return [system_turn, user_turn, assistant_turn]

In [16]:
# Convert every record of each partition into its chat-conversation form.
# After this cell the three names refer to plain Python lists of conversations.
train_dataset = list(map(format_data, train_dataset))
val_dataset = list(map(format_data, val_dataset))
test_dataset = list(map(format_data, test_dataset))

In [17]:
import gc
import time
import torch


def clear_memory(settle_seconds=2):
    """Drop the notebook's large global objects and release cached GPU memory.

    Removes the module-level names ``inputs``, ``model``, ``processor``,
    ``trainer``, ``peft_model`` and ``bnb_config`` if they exist, runs the
    garbage collector several times and, when CUDA is available, empties the
    CUDA caching allocator and reports allocated/reserved memory.

    Args:
        settle_seconds: Pause, in seconds, inserted between the clean-up steps.
            Defaults to 2, matching the previous hard-coded value.
    """
    # Delete variables if they exist in the current global scope
    for name in ("inputs", "model", "processor", "trainer", "peft_model", "bnb_config"):
        globals().pop(name, None)
    time.sleep(settle_seconds)

    # CUDA calls are skipped on machines without a usable GPU, where
    # torch.cuda.synchronize() would raise instead of being a no-op.
    cuda_available = torch.cuda.is_available()

    def _release_cuda_cache():
        # Return cached blocks to the driver and wait for pending GPU work.
        if cuda_available:
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(settle_seconds)
    _release_cuda_cache()
    time.sleep(settle_seconds)
    gc.collect()
    time.sleep(settle_seconds)
    _release_cuda_cache()
    gc.collect()
    time.sleep(settle_seconds)

    if cuda_available:
        print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
    else:
        print("CUDA is not available; no GPU memory to report.")


# Start from a clean slate before loading the model for training.
clear_memory()

GPU allocated memory: 0.00 GB
GPU reserved memory: 0.00 GB


In [18]:
from transformers import BitsAndBytesConfig
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

# 4-bit NF4 quantisation with double quantisation; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantised model, then the processor that belongs to the same checkpoint.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)


2025-04-29 11:37:44.828399: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-29 11:37:45.449059: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-29 11:37:45.652243: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8473] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-29 11:37:45.705907: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1471] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-29 11:37:46.138084: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [19]:
from peft import LoraConfig, get_peft_model

# LoRA settings: rank-8 adapters on the q_proj and v_proj modules only,
# scaling alpha 16, 5% adapter dropout, no bias terms trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the loaded model with the adapters and report how many parameters train.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 2,523,136 || all params: 8,294,689,792 || trainable%: 0.0304


In [20]:
import os
# Turn off Weights & Biases logging for this run.
# NOTE(review): the trainer output in this notebook reports WANDB_DISABLED as
# deprecated in favour of `report_to`; the training config already sets
# report_to="none", so this line may be redundant — confirm before removing.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from trl import SFTConfig

# Configure training arguments
# With a per-device batch of 4 and 8 accumulation steps, each optimizer step
# sees 4 * 8 = 32 samples per device.
training_args = SFTConfig(
    output_dir="output_dir",  # Directory to save the model
    num_train_epochs=8,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,  # Batch size for evaluation
    gradient_accumulation_steps=8,  # Steps to accumulate gradients
    gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency
    # Optimizer and scheduler settings
    optim="adamw_torch_fused",  # Optimizer type
    learning_rate=2e-4,  # Learning rate for training
    # NOTE(review): a plain "constant" schedule presumably ignores warmup_ratio
    # set further down; "constant_with_warmup" may have been intended — confirm.
    lr_scheduler_type="constant",  # Type of learning rate scheduler
    # Logging and evaluation
    logging_steps=10,  # Steps interval for logging
    eval_steps=100,  # Steps interval for evaluation
    eval_strategy="steps",  # Strategy for evaluation
    save_strategy="steps",  # Strategy for saving the model
    save_steps=100,  # Steps interval for saving (same cadence as eval_steps)
    metric_for_best_model="eval_loss",  # Metric to evaluate the best model
    greater_is_better=False,  # Lower eval_loss is better
    load_best_model_at_end=True,  # Load the best model after training
    # Mixed precision and gradient settings
    bf16=True,  # Use bfloat16 precision
    tf32=True,  # Use TensorFloat-32 precision
    max_grad_norm=0.3,  # Maximum norm for gradient clipping
    warmup_ratio=0.03,  # Ratio of total steps for warmup
    # Hub and reporting
    # NOTE(review): push_to_hub=True presumably needs Hub credentials to be
    # configured in this environment — confirm.
    push_to_hub=True,  # Whether to push model to Hugging Face Hub
    report_to="none",  # Reporting tool for tracking metrics
    # Gradient checkpointing settings
    gradient_checkpointing_kwargs={"use_reentrant": False},  # Options for gradient checkpointing
    # Dataset configuration
    # NOTE(review): the empty text field and skip_prepare_dataset presumably
    # defer all preprocessing to the custom data collator — confirm.
    dataset_text_field="",  # Text field in dataset
    dataset_kwargs={"skip_prepare_dataset": True},  # Additional dataset options
    # max_seq_length=1024  # Maximum sequence length for input
)

training_args.remove_unused_columns = False  # Keep unused columns in dataset

In [22]:
from qwen_vl_utils import process_vision_info
from transformers import Qwen2_5_VLProcessor
# Data collator: turns a list of chat conversations into one padded model batch.
def collate_fn(examples):
    """Encode a list of conversations into a batch with loss labels.

    Args:
        examples: List of conversations (each a list of chat messages).

    Returns:
        The processor's batch with an extra "labels" entry: a copy of
        "input_ids" in which padding tokens and image-related tokens are
        replaced by -100 so they are ignored by the loss.
    """
    prompts = []
    images = []
    for conversation in examples:
        # Render the chat template as text; tokenisation happens in one go below.
        prompts.append(processor.apply_chat_template(conversation, tokenize=False))
        # Keep only the first element returned by process_vision_info.
        images.append(process_vision_info(conversation)[0])

    # Tokenise the texts and preprocess the images into padded tensors.
    batch = processor(
        text=prompts, images=images, return_tensors="pt", padding=True
    )

    # Labels start as a copy of the inputs; padding positions are masked out.
    targets = batch["input_ids"].clone()
    targets[targets == processor.tokenizer.pad_token_id] = -100

    # Image-related token ids that must not contribute to the loss (model specific).
    if isinstance(processor, Qwen2_5_VLProcessor):
        # Hard-coded ids for the Qwen2.5-VL processor.
        # NOTE(review): presumably the vision start/end/pad tokens — confirm.
        ignored_ids = [151652, 151653, 151655]
    else:
        ignored_ids = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]

    for token_id in ignored_ids:
        targets[targets == token_id] = -100

    batch["labels"] = targets
    return batch

In [23]:
from trl import SFTTrainer

# Assemble the trainer: training and validation conversations, the custom
# collator that builds batches and labels, and the LoRA configuration.
# NOTE(review): `model` was already passed to get_peft_model in an earlier cell
# and peft_config is supplied again here — confirm the adapter is meant to be
# set up through both paths.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    peft_config=peft_config,
    
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [24]:
# Run fine-tuning with the configuration in training_args.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
100,0.0524,0.057661
200,0.0508,0.053958
300,0.0482,0.051726
400,0.049,0.050835
500,0.0433,0.049292
600,0.0544,0.04899
700,0.0485,0.048457
800,0.049,0.047956
900,0.0377,0.047802
1000,0.036,0.048214


TrainOutput(global_step=2176, training_loss=0.08643319615272477, metrics={'train_runtime': 18515.2975, 'train_samples_per_second': 3.76, 'train_steps_per_second': 0.118, 'total_flos': 2.5206661176820224e+18, 'train_loss': 0.08643319615272477})

In [25]:
# Save the trained model to the output directory configured in training_args.
trainer.save_model(training_args.output_dir)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
