### IMPORTS

In [1]:
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from qwen_vl_utils import process_vision_info
import json
import pandas as pd
from tqdm import tqdm

2025-10-06 20:18:10.975838: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-06 20:18:10.987870: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-06 20:18:11.000966: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8473] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-06 20:18:11.004939: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1471] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-06 20:18:11.015761: I tensorflow/core/platform/cpu_feature_guar

### Loading model and processor

In [2]:
# Load the image-text-to-text model and its matching processor from the same checkpoint.
# The weights are loaded in bfloat16 and device placement is delegated to device_map="auto".
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
model = AutoModelForImageTextToText.from_pretrained(
    model_id, 
    device_map="auto", 
    torch_dtype=torch.bfloat16
)
processor = AutoProcessor.from_pretrained(model_id)


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# Attach adapter weights to the base model loaded above
# (presumably a PEFT/LoRA fine-tune — confirm against the training run).
# NOTE(review): "adapter_path" is a placeholder string; replace it with the real
# adapter location before running this cell.
adapter_path = "adapter_path"
model.load_adapter(adapter_path)

In [4]:
import json

# Annotation export that holds the texts, NER entities and split assignments.
annotations_path = "NER_annotations_with_texts_2_TestTrainVal_Qwen7BftOCR.json"

# Parse the whole file into a plain Python structure.
with open(annotations_path, mode="r", encoding="utf-8") as annotations_file:
    data = json.load(annotations_file)

In [5]:
# Gather the entity label of every annotated entity that belongs to an
# annotation with a split assignment, then show the distinct labels.
unique_ner = []
for annotation in data["annotations"]:
    attributes = annotation.get("attributes", {})
    if "TestTrainVal" in attributes:
        labels = [entity["entity"] for entity in attributes.get("ner_entities", [])]
        unique_ner += labels
print(set(unique_ner))

{'ORG', 'DATE', 'PERSON', 'LOCATION'}


In [None]:
# Column-oriented container: one entry per annotation that has a split assigned.
# (tqdm is already imported in the notebook's import cell.)
dataset = {"input_text": [], "label": [], "split": []}
for annotation in tqdm(data["annotations"], desc="Preparing Dataset"):
    attributes = annotation.get("attributes", {})
    # Annotations without a split assignment are left out of every subset.
    if "TestTrainVal" not in attributes:
        continue

    # Keep only the entity type and the surface text of each annotated entity.
    labels = [
        {"entity": ent["entity"], "text": ent["text"]}
        for ent in attributes.get("ner_entities", [])
    ]

    dataset_key = attributes["TestTrainVal"]  # "train", "test", or "val"
    dataset["label"].append(labels)
    # This is the ground-truth text; it can be swapped for the OCR text.
    dataset["input_text"].append(attributes["Text"])
    dataset["split"].append(dataset_key)

Preparing Dataset: 100%|██████████| 89788/89788 [00:00<00:00, 111709.43it/s]


In [7]:
from datasets import Dataset

# Wrap the column dict (keys "input_text", "label", "split") in a Hugging Face Dataset
# so it can be filtered by split in the next step.
hf_dataset = Dataset.from_dict(dataset)

In [8]:
def _take_split(split_name):
    """Return the rows of ``hf_dataset`` tagged ``split_name``, minus the "split" column."""
    subset = hf_dataset.filter(lambda row: row["split"] == split_name)
    # The column has served its purpose once the rows are selected.
    return subset.remove_columns("split")


# One subset per value stored in the "split" column.
train_dataset = _take_split("train")
test_dataset = _take_split("test")
val_dataset = _take_split("val")

Filter:   0%|          | 0/11030 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11030 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11030 [00:00<?, ? examples/s]

In [9]:
# System turn: frames the model as a named-entity detector for Ukrainian texts.
system_message = """You are a Language Model specialized in detecting named entities in Ukrainian texts.
Your task is to analyze the provided text and identify named entities such as names, locations, organizations, and other key terms.
Respond concisely, typically providing the detected entities as a structured list or short phrases.
Focus on accuracy and ensure the extracted entities reflect the text. Avoid additional explanations unless absolutely necessary."""

# User instruction: lists the four entity categories (PERSON, LOCATION, DATE, ORG)
# and asks for a structured list of entities with their types.
user_prompt = """Analyze the provided Ukrainian text. 
Detect and extract named entities belonging to the following categories:
- PERSON (names of individuals)
- LOCATION (geographical places, cities, countries)
- DATE (specific dates, years, or time-related expressions)
- ORG (organizations, institutions, or companies)

Respond with a structured list of detected entities along with their corresponding entity types. 
Ensure accuracy and avoid adding unnecessary explanations."""

# Prefix that is concatenated in front of the text to analyze.
text_prompt = """Text:\n"""


def format_data(sample):
    """Convert one dataset row into a system / user / assistant conversation.

    Args:
        sample: Mapping with an "input_text" string and a JSON-serializable
            "label" value.

    Returns:
        A list of three message dicts (system, user, assistant). Each message
        has a "role" and a "content" list of ``{"type": "text", "text": ...}``
        parts; the assistant part holds the label serialized as indented JSON
        with non-ASCII characters kept as-is.
    """

    def as_text(value):
        # Every content part in this conversation is a plain-text part.
        return {"type": "text", "text": value}

    system_turn = {"role": "system", "content": [as_text(system_message)]}

    # The instruction and the text to analyze are sent as two separate parts.
    user_parts = [as_text(user_prompt), as_text(text_prompt + sample["input_text"])]
    user_turn = {"role": "user", "content": user_parts}

    expected_answer = json.dumps(sample["label"], indent=2, ensure_ascii=False)
    assistant_turn = {"role": "assistant", "content": [as_text(expected_answer)]}

    return [system_turn, user_turn, assistant_turn]

In [10]:
# Turn every row of each split into a chat conversation (a list of message dicts).
# The names are rebound from dataset objects to plain lists, so this cell is meant
# to run once, right after the split cell.
train_dataset = list(map(format_data, train_dataset))
val_dataset = list(map(format_data, val_dataset))
test_dataset = list(map(format_data, test_dataset))

In [11]:
# Preview the system and user turns of the first training conversation
# (the assistant turn holding the expected answer is sliced off).
# NOTE(review): generate_text_from_sample sends only sample[1:2], i.e. the user turn
# without the system message, so the prompt used at inference is a subset of this preview.
train_dataset[0][:2]

[{'role': 'system',
  'content': [{'type': 'text',
    'text': 'You are a Language Model specialized in detecting named entities in Ukrainian texts.\nYour task is to analyze the provided text and identify named entities such as names, locations, organizations, and other key terms.\nRespond concisely, typically providing the detected entities as a structured list or short phrases.\nFocus on accuracy and ensure the extracted entities reflect the text. Avoid additional explanations unless absolutely necessary.'}]},
 {'role': 'user',
  'content': [{'type': 'text',
    'text': 'Analyze the provided Ukrainian text. \nDetect and extract named entities belonging to the following categories:\n- PERSON (names of individuals)\n- LOCATION (geographical places, cities, countries)\n- DATE (specific dates, years, or time-related expressions)\n- ORG (organizations, institutions, or companies)\n\nRespond with a structured list of detected entities along with their corresponding entity types. \nEnsure a

In [12]:
# generate function - prepares all necessary data for the inference
def generate_text_from_sample(model, processor, sample, max_new_tokens=1024, device="cuda"):
    """Generate the model's reply for the user turn of one conversation.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=..., temperature=...)``.
        processor: Object exposing ``apply_chat_template``, a tokenizing
            ``__call__`` and ``batch_decode``.
        sample: Conversation as a list of messages; only ``sample[1:2]`` (the
            user turn) is sent, so the system and assistant turns are ignored.
        max_new_tokens: Upper bound on the number of generated tokens.
        device: Device the tokenized inputs are moved to before generation.

    Returns:
        The decoded text of the first (and only) generated sequence, with the
        prompt tokens stripped off.
    """
    # Render just the user turn and append the generation prompt.
    user_only = sample[1:2]
    prompt = processor.apply_chat_template(user_only, tokenize=False, add_generation_prompt=True)

    # Tokenize the rendered prompt and move the tensors to the target device.
    model_inputs = processor(text=[prompt], return_tensors="pt").to(device)

    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens, temperature=0.1)

    # Each generated sequence starts with its prompt tokens; keep only what follows.
    completions = []
    for prompt_ids, full_ids in zip(model_inputs.input_ids, generated_ids):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]

In [14]:
def _parse_entities(raw_text):
    """Parse a JSON entity list; return the raw text unchanged if it is not valid JSON.

    json.loads replaces the earlier eval() call: eval would execute whatever the
    model generated and fails on JSON literals such as null / true / false.
    """
    try:
        return json.loads(raw_text)
    except json.JSONDecodeError:
        return raw_text


# Number of test conversations to preview (the earlier loop stopped after index 51).
NUM_PREVIEW_SAMPLES = 52

# Slicing also covers the last sample when the test set is shorter than the preview size.
for sample in test_dataset[:NUM_PREVIEW_SAMPLES]:
    output = generate_text_from_sample(model, processor, sample)
    print('Input text:', sample[1]['content'][1]['text'])
    print('Model Prediction: ', _parse_entities(output))
    print('Truth Data: ', _parse_entities(sample[2]['content'][0]['text']))
    print('-'*100)
        

Input text: Text:
5. Мылом по
Model Prediction:  []
Truth Data:  []
----------------------------------------------------------------------------------------------------
Input text: Text:
4. Сахаром по
Model Prediction:  []
Truth Data:  []
----------------------------------------------------------------------------------------------------
Input text: Text:
3. Выдано талонов на питание
Model Prediction:  []
Truth Data:  []
----------------------------------------------------------------------------------------------------
Input text: Text:
2. Сухим пайком на путь следования на
Model Prediction:  []
Truth Data:  []
----------------------------------------------------------------------------------------------------
Input text: Text:
1. Продовольствием в натуре по
Model Prediction:  []
Truth Data:  []
----------------------------------------------------------------------------------------------------
Input text: Text:
Номер и дата документа
Model Prediction:  []
Truth Data:  []
------------