In [1]:
!pip install  -U -q git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git datasets bitsandbytes peft qwen-vl-utils wandb accelerate
# Tested with transformers==4.47.0.dev0, trl==0.12.0.dev0, datasets==3.0.2, bitsandbytes==0.44.1, peft==0.13.2, qwen-vl-utils==0.0.8, wandb==0.18.5, accelerate==1.0.1

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pylibcudf 24.10.0 requires pyarrow<18.0.0a0,>=14.0.0, but you have pyarrow 21.0.0 which is incompatible.[0m[31m
[0m

In [1]:
import json

# Read the annotation export (opened as UTF-8 text and parsed as JSON) into `data`.
annotations_path = "NER_annotations_with_texts_2_TestTrainVal_Qwen7BftOCR.json"
with open(annotations_path, mode="r", encoding="utf-8") as annotations_file:
    data = json.load(annotations_file)

In [2]:
# Gather the entity label of every NER span that belongs to an annotation
# assigned to a split (i.e. one whose attributes carry a "TestTrainVal" key),
# then show the distinct labels.
split_attributes = (
    record.get("attributes", {}) for record in data["annotations"]
)
unique_ner = [
    entity["entity"]
    for attrs in split_attributes
    if "TestTrainVal" in attrs
    for entity in attrs.get("ner_entities", [])
]
print(set(unique_ner))

{'ORG', 'PERSON', 'DATE', 'LOCATION'}


In [None]:
from tqdm import tqdm

# Column-oriented dataset: the three lists stay index-aligned, one entry per
# annotation that has been assigned to a split.
dataset = {"input_text": [], "label": [], "split": []}
for annotation in tqdm(data["annotations"], desc="Preparing Dataset"):
    attributes = annotation.get("attributes", {})
    # Annotations without a split assignment are not part of the experiment.
    if "TestTrainVal" not in attributes:
        continue

    # Resolve every value before touching the columns so that a missing key
    # cannot leave the three lists with different lengths.
    split_name = attributes["TestTrainVal"]  # "train", "test", or "val"
    input_text = attributes["Text"]
    labels = [
        {"entity": ent["entity"], "text": ent["text"]}
        for ent in attributes.get("ner_entities", [])
    ]

    dataset["label"].append(labels)
    dataset["input_text"].append(input_text)
    dataset["split"].append(split_name)

Preparing Dataset: 100%|██████████| 89788/89788 [00:00<00:00, 1997794.07it/s]


In [9]:
from datasets import Dataset

# Convert the column-oriented dictionary (keys "input_text", "label", "split")
# into a Hugging Face `Dataset` so it can be filtered per split.
hf_dataset = Dataset.from_dict(dataset)

In [10]:
# Carve the train / test / val partitions out of the combined dataset. Once a
# partition has been selected its "split" marker column is redundant and is dropped.
def _take_split(split_name):
    """Return the rows of `hf_dataset` tagged `split_name`, without the "split" column."""
    subset = hf_dataset.filter(lambda row: row["split"] == split_name)
    return subset.remove_columns("split")


train_dataset = _take_split("train")
test_dataset = _take_split("test")
val_dataset = _take_split("val")

Filter:   0%|          | 0/11030 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11030 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11030 [00:00<?, ? examples/s]

In [11]:
# Number of examples in the combined dataset (all splits together).
len(hf_dataset)

11030

In [12]:
# System turn: fixes the model's role (named-entity detection in Ukrainian text)
# and asks for concise, structured answers.
system_message = """You are a Language Model specialized in detecting named entities in Ukrainian texts.
Your task is to analyze the provided text and identify named entities such as names, locations, organizations, and other key terms.
Respond concisely, typically providing the detected entities as a structured list or short phrases.
Focus on accuracy and ensure the extracted entities reflect the text. Avoid additional explanations unless absolutely necessary."""

# User instruction: lists the four entity categories to extract
# (PERSON, LOCATION, DATE, ORG).
user_prompt = """Analyze the provided Ukrainian text. 
Detect and extract named entities belonging to the following categories:
- PERSON (names of individuals)
- LOCATION (geographical places, cities, countries)
- DATE (specific dates, years, or time-related expressions)
- ORG (organizations, institutions, or companies)

Respond with a structured list of detected entities along with their corresponding entity types. 
Ensure accuracy and avoid adding unnecessary explanations."""

# Prefix placed directly in front of the sample text in the user turn.
text_prompt = """Text:\n"""


def format_data(sample):
    """Turn one dataset row into a three-turn chat conversation.

    Args:
        sample: Mapping with an ``"input_text"`` string and a ``"label"`` value
            that is JSON-serializable (the list of entity dicts for the text).

    Returns:
        A list ``[system, user, assistant]`` of message dicts. Each message has
        a ``"role"`` and a ``"content"`` list of ``{"type": "text", "text": ...}``
        parts. The user turn carries the instruction followed by the prefixed
        sample text; the assistant turn carries the labels serialized as
        indented JSON with non-ASCII characters kept as-is.
    """

    def _text_part(value):
        return {"type": "text", "text": value}

    system_turn = {"role": "system", "content": [_text_part(system_message)]}

    user_turn = {
        "role": "user",
        "content": [
            _text_part(user_prompt),
            _text_part(text_prompt + sample["input_text"]),
        ],
    }

    # ensure_ascii=False keeps the Cyrillic entity text readable in the target.
    serialized_labels = json.dumps(sample["label"], indent=2, ensure_ascii=False)
    assistant_turn = {"role": "assistant", "content": [_text_part(serialized_labels)]}

    return [system_turn, user_turn, assistant_turn]

In [13]:
# Replace each split with a plain list of chat conversations (one per row).
train_dataset = list(map(format_data, train_dataset))
val_dataset = list(map(format_data, val_dataset))
test_dataset = list(map(format_data, test_dataset))

In [14]:
import gc
import time

import torch


def clear_memory(pause_seconds=2):
    """Drop the notebook's heavyweight globals and release cached GPU memory.

    Removes ``inputs``, ``model``, ``processor``, ``trainer``, ``peft_model``
    and ``bnb_config`` from the module namespace when they exist, runs the
    garbage collector, empties the CUDA cache and finally prints the allocated
    and reserved GPU memory in GB.

    CUDA calls are only issued when CUDA is available, so the function can
    also be run on a CPU-only machine (it then reports 0.00 GB).

    Args:
        pause_seconds: Seconds to sleep between clean-up stages. Defaults to
            2, the delay this notebook has always used; pass 0 to skip waiting.

    Returns:
        None.
    """
    # Remove the references held by the notebook namespace so the objects
    # become collectable. Missing names are simply skipped.
    namespace = globals()
    for name in ("inputs", "model", "processor", "trainer", "peft_model", "bnb_config"):
        namespace.pop(name, None)
    time.sleep(pause_seconds)

    cuda_available = torch.cuda.is_available()

    def _release_cuda_cache():
        # synchronize() raises without a usable CUDA device, hence the guard.
        if cuda_available:
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    # Alternate garbage collection and cache release, pausing in between.
    gc.collect()
    time.sleep(pause_seconds)
    _release_cuda_cache()
    time.sleep(pause_seconds)
    gc.collect()
    time.sleep(pause_seconds)
    _release_cuda_cache()
    gc.collect()
    time.sleep(pause_seconds)

    allocated_bytes = torch.cuda.memory_allocated() if cuda_available else 0
    reserved_bytes = torch.cuda.memory_reserved() if cuda_available else 0
    print(f"GPU allocated memory: {allocated_bytes / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {reserved_bytes / 1024**3:.2f} GB")


# Start from a clean slate: drop any model/trainer objects left in the kernel
# and report the current GPU memory usage.
clear_memory()

GPU allocated memory: 0.00 GB
GPU reserved memory: 0.00 GB


In [15]:
import torch
from transformers import AutoProcessor, BitsAndBytesConfig, AutoModelForImageTextToText

# Checkpoint used for both the model weights and the processor.
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# Load the quantized model (device placement chosen automatically) and its processor.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

2025-10-06 14:06:16.373147: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-06 14:06:16.776928: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-06 14:06:16.919481: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8473] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-06 14:06:16.956589: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1471] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-06 14:06:17.256803: I tensorflow/core/platform/cpu_feature_guar

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not in

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

In [16]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank-8 adapters on the "q_proj" and "v_proj" modules.
lora_target_modules = ["q_proj", "v_proj"]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Attach the adapters to the loaded model and report how many parameters are trainable.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 2,523,136 || all params: 8,294,689,792 || trainable%: 0.0304


In [17]:
import os

# Turn off Weights & Biases logging for this run.
# NOTE(review): the trainer output further down reports this environment variable
# as deprecated; the SFTConfig already passes report_to="none".
os.environ["WANDB_DISABLED"] = "true"

In [18]:
from trl import SFTConfig

# Configure training arguments for supervised fine-tuning.
# Evaluation and checkpointing both run every 100 steps, and the checkpoint
# with the lowest eval_loss is reloaded when training finishes.
training_args = SFTConfig(
    output_dir="qwen2.5-7b-instruct-trl-sft-NAKI-NER-from-GT-text",  # Directory to save the model
    num_train_epochs=8,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,  # Batch size for evaluation
    gradient_accumulation_steps=8,  # 4 x 8 = 32 samples per optimizer step on each device
    gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency
    # Optimizer and scheduler settings
    optim="adamw_torch_fused",  # Optimizer type
    learning_rate=2e-4,  # Learning rate for training
    lr_scheduler_type="linear",  # Type of learning rate scheduler
    # Logging and evaluation
    logging_steps=10,  # Steps interval for logging
    eval_steps=100,  # Steps interval for evaluation
    eval_strategy="steps",  # Strategy for evaluation
    save_strategy="steps",  # Strategy for saving the model
    save_steps=100,  # Steps interval for saving
    metric_for_best_model="eval_loss",  # Metric to evaluate the best model
    greater_is_better=False,  # Lower eval_loss is better
    load_best_model_at_end=True,  # Load the best model after training
    # Mixed precision and gradient settings
    bf16=True,  # Use bfloat16 precision
    tf32=True,  # Use TensorFloat-32 precision
    max_grad_norm=0.3,  # Maximum norm for gradient clipping
    warmup_ratio=0.03,  # Ratio of total steps for warmup
    # Hub and reporting
    push_to_hub=False,  # Whether to push model to Hugging Face Hub
    report_to="none",  # No experiment-tracking integration
    # Gradient checkpointing settings
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },  # Options for gradient checkpointing
    # Dataset configuration
    dataset_text_field="",  # Left empty; presumably unused because the custom collator builds the batches (confirm)
    dataset_kwargs={"skip_prepare_dataset": True},  # Skip the trainer's own dataset preparation
    # max_seq_length=1024  # Maximum sequence length for input
)

# Set after construction: keep every column so the collator receives the samples unchanged.
training_args.remove_unused_columns = False  # Keep unused columns in dataset

In [19]:
from qwen_vl_utils import process_vision_info
from transformers import Qwen2_5_VLProcessor


# Data collator: renders chat conversations to text and tokenizes them into a padded batch.
def collate_fn(examples):
    """Build a padded training batch from a list of chat conversations.

    Each conversation is rendered to a string with the processor's chat
    template, the strings are tokenized together with padding, and a
    ``"labels"`` entry is added to the batch: a copy of ``input_ids`` in which
    padding tokens and vision/image special tokens are replaced by -100 so
    that they are ignored by the loss.

    Args:
        examples: List of conversations in the format accepted by
            ``processor.apply_chat_template``.

    Returns:
        The processor output for the batch, extended with ``"labels"``.
    """
    ignore_index = -100

    # Render every conversation to its templated string form (no tokenization yet).
    rendered = list(
        map(lambda conv: processor.apply_chat_template(conv, tokenize=False), examples)
    )

    # Tokenize the whole batch at once, padding to the longest sequence.
    batch = processor(text=rendered, return_tensors="pt", padding=True)

    # Special-token ids that must never contribute to the loss (model specific).
    if isinstance(processor, Qwen2_5_VLProcessor):
        # Hard-coded ids for the Qwen2.5-VL processor; presumably the vision
        # start/end and image placeholder tokens -- verify against the tokenizer.
        vision_token_ids = [151652, 151653, 151655]
    else:
        vision_token_ids = [
            processor.tokenizer.convert_tokens_to_ids(processor.image_token)
        ]

    # Labels start as a copy of the inputs; padding first, then vision tokens, are masked out.
    targets = batch["input_ids"].clone()
    for masked_id in [processor.tokenizer.pad_token_id, *vision_token_ids]:
        targets[targets == masked_id] = ignore_index

    batch["labels"] = targets
    return batch

In [20]:
from trl import SFTTrainer

# Build the trainer: training runs on the formatted train conversations and is
# evaluated on the validation conversations; batches are produced by `collate_fn`.
# NOTE(review): `get_peft_model(model, peft_config)` was already called in an
# earlier cell and `peft_config` is passed again here -- confirm the adapter is
# meant to be configured in both places.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    peft_config=peft_config,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run fine-tuning; evaluation and checkpointing happen every 100 steps as configured in `training_args`.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
100,0.4445,0.400528
200,0.4181,0.364808
300,0.3408,0.348265
400,0.3609,0.337174
500,0.3008,0.328055
600,0.3447,0.320678
700,0.3239,0.314431
800,0.3549,0.308401
900,0.3129,0.304775
1000,0.3036,0.301158


In [None]:
# Save the trained model into the run's output directory.
trainer.save_model(training_args.output_dir)