In [None]:
%cd /home/is/dwipraseetyo-a/NAS_HAI/Project/Qwen2.5-Omni
import pickle, os, re, random, torch
from swift.llm import get_model_tokenizer, load_dataset, get_template, EncodePreprocessor
from swift.utils import get_logger, find_all_linears, get_model_parameter_info, plot_images, seed_everything
from swift.tuners import Swift, LoraConfig
from swift.trainers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset
from functools import partial
import commons
import const_variable
import logging, warnings
class SuppressMultipleWarnings(logging.Filter):
    """Logging filter that drops records whose message begins with a known noisy prefix."""

    def filter(self, record):
        """Return False (drop) when the rendered message starts with a muted prefix, else True."""
        muted_prefixes = (
            "Trainer.tokenizer is now deprecated",
            "System prompt modified, audio output may not work as expected",
        )
        # str.startswith accepts a tuple and succeeds if any prefix matches.
        return not record.getMessage().startswith(muted_prefixes)
# Register the noise filter on the root logger.
# NOTE(review): a filter attached to a logger is, per the `logging` docs, only consulted for
# records created on that logger, not for records propagated from child loggers — confirm the
# targeted messages are actually suppressed (attaching to handlers may be required).
logging.getLogger().addFilter(SuppressMultipleWarnings())
# Allocator setting for PyTorch's CUDA caching allocator.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

logger = get_logger()
# Fix the global seed (the log output reports "Global seed set to 42").
seed_everything(42)

# Environment-level settings for training.
# NOTE(review): NPROC_PER_NODE is 4 while CUDA_VISIBLE_DEVICES lists only two devices — confirm
# the intended process/GPU count. These variables are also set after `torch` and `swift` were
# imported; verify they are still picked up.
os.environ.update({
    "MAX_PIXELS": "1003520",
    "NPROC_PER_NODE": "4",
    "ENABLE_AUDIO_OUTPUT": "0",
    "CUDA_VISIBLE_DEVICES": "0,1"
})

# Base checkpoint and where training artifacts are written.
model_id_or_path = 'Qwen/Qwen2.5-Omni-3B'
output_dir = 'outputs/qwen25omni3b-think-balance-grpo'

# Data handling settings.
data_seed = 42
max_length = 2048  # passed to get_template as max_length
split_dataset_ratio = 0.01  # Split validation set
num_proc = 4  # The number of processes for data loading.

# LoRA adapter hyperparameters
lora_rank = 8
lora_alpha = 32

# Trainer configuration: checkpoints and evaluation both run every 50 steps, at most two
# checkpoints are kept, and gradients are accumulated over 16 steps with per-device batch size 1.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_checkpointing=True,
    weight_decay=0.1,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    report_to=['tensorboard'],
    logging_first_step=True,
    save_strategy='steps',
    save_steps=50,
    eval_strategy='steps',
    eval_steps=50,
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    metric_for_best_model='loss',
    save_total_limit=2,
    logging_steps=5,
    dataloader_num_workers=1,
    data_seed=data_seed,
)

# Resolve output_dir to an absolute path for logging. Note that training_args above was
# already constructed with the relative path.
output_dir = os.path.abspath(os.path.expanduser(output_dir))
logger.info(f'output_dir: {output_dir}')

# Obtain the model and template, and add a trainable LoRA layer on the model.
model, tokenizer = get_model_tokenizer(model_id_or_path)
# The template carries the project's system prompt and the max_length set above.
template = get_template(model.model_meta.template, tokenizer, default_system=const_variable.system_prompt, max_length=max_length)
template.set_mode('train')
# Some templates need a handle on the model itself; attach it when the template asks for it.
if template.use_model:
    template.model = model

# Target every linear layer reported by find_all_linears with the LoRA adapter.
target_modules = find_all_linears(model)
lora_config = LoraConfig(task_type='CAUSAL_LM', r=lora_rank, lora_alpha=lora_alpha,
                         target_modules=target_modules)
model = Swift.prepare_model(model, lora_config)
logger.info(f'lora_config: {lora_config}')

# Log the parameter summary of the adapted model.
model_parameter_info = get_model_parameter_info(model)
logger.info(f'model_parameter_info: {model_parameter_info}')

[INFO:swift] Global seed set to 42
[INFO:swift] output_dir: /home/ldap-users-2/dwipraseetyo-a/Project/Qwen2.5-Omni/outputs/qwen25omni3b-think-balance-grpo
[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-Omni-3B


/home/ldap-users-2/dwipraseetyo-a/Project/Qwen2.5-Omni
Downloading Model from https://www.modelscope.cn to directory: /home/is/dwipraseetyo-a/.cache/modelscope/hub/models/Qwen/Qwen2.5-Omni-3B


[INFO:modelscope] Target directory already exists, skipping creation.
[INFO:swift] Loading the model using model_dir: /home/is/dwipraseetyo-a/.cache/modelscope/hub/models/Qwen/Qwen2___5-Omni-3B
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
[INFO:swift] Setting torch_dtype: torch.bfloat16
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
[INFO:swift] model_kwargs: {'device_map': 'auto'}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO:swift] default_system: 'A conversation between User and Advanced medical assistant specialized in analyzing and diagnosing clinical conditions. and the Assistant determines whether the case is Positive or Negative. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>'
[INFO:swift] max_length: 2048
[INFO:swift] response_prefix: ''
[INFO:swift] agent_template: hermes
[INFO:swift] norm_bbox: none
[INFO:swift] lora_config: LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/home/is/dwipraseetyo-a/.cache/modelscope/hub/models/Qwen/Qwen2___5-Omni-3B', revision=None, inference_mode=False, r=8, target_modules={'q_proj', 'up_proj', 'k', 'fc2', 'k_proj', 'attn.q', 'down_proj', 'audio_to

In [50]:
# Load the dataset spec and split it into train/validation parts using the configured ratio,
# worker count and seed.
train_dataset, val_dataset = load_dataset(
    ['AI-ModelScope/LaTeX_OCR#25'],
    split_dataset_ratio=split_dataset_ratio,
    num_proc=num_proc,
    seed=data_seed,
)

[INFO:swift] Downloading the dataset from ModelScope, dataset_id: AI-ModelScope/LaTeX_OCR


In [92]:
from PIL import Image
from tqdm import tqdm
import commons
import const_variable
import copy
from qwen_omni_utils import process_mm_info
import io
import numpy as np
import soundfile as sf

def numpy_to_wav_bytes(audio_array: np.ndarray, sr: int) -> bytes:
    """Encode an audio array as WAV entirely in memory.

    Args:
        audio_array: Audio samples handed to ``sf.write``.
        sr: Sample rate written alongside the samples.

    Returns:
        The bytes that ``sf.write`` produced for the WAV format.
    """
    with io.BytesIO() as wav_buffer:
        sf.write(wav_buffer, audio_array, sr, format='WAV')
        # Snapshot the contents before the buffer is closed on leaving the block.
        wav_bytes = wav_buffer.getvalue()
    return wav_bytes

def grpo_build_datasets(instruct, processor):
    """Convert multimodal chat samples into flat records with text, images and audio.

    For every sample the user turn (``messages[1]``) is scanned: each audio
    element that carries a path gets a randomly chosen 3-second window written
    into ``audio_start``/``audio_end``, and the presence of an image element is
    recorded. Samples that lack either an audio or an image element are
    dropped. The input samples are never modified; work happens on a deep copy.

    Args:
        instruct: Iterable of samples, each a mapping with a ``"messages"`` list
            whose entries have ``"role"`` and ``"content"``.
        processor: Unused; kept so existing callers keep working.

    Returns:
        A list of dicts with ``"messages"`` (role plus the first text element
        of each turn), ``"images"`` when ``process_mm_info`` returned any, and
        ``"audios"`` holding the first audio clip encoded as WAV bytes.
    """

    def _first_text(content):
        # A turn's content may already be a plain string rather than a list of parts.
        if isinstance(content, str):
            return content
        return next(
            (ele["text"] for ele in content if ele["type"] not in ("audio", "image")),
            "",
        )

    datasets = []
    for sample in tqdm(instruct):
        conversation = copy.deepcopy(sample["messages"])
        image_found = False
        audio_found = False
        for ele in conversation[1]['content']:
            if ele["type"] == "audio":
                if "audio" in ele or "audio_url" in ele:
                    path = ele.get("audio", ele.get("audio_url"))
                    start_sec, end_sec = commons.random_3sec_segment(path, segment_duration=3.0)
                    ele["audio_start"] = float(start_sec)
                    ele["audio_end"] = float(end_sec)
                    audio_found = True
            elif ele["type"] == "image":
                # Fixed: the URL form of an image lives under "image_url", not "audio_url".
                if "image" in ele or "image_url" in ele:
                    image_found = True

        # Decide whether to keep the sample before doing the expensive media
        # loading and WAV encoding below.
        if not (image_found and audio_found):
            continue

        audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)

        current_object = {
            'messages': [
                {"role": item["role"], "content": _first_text(item["content"])}
                for item in conversation
            ]
        }
        if images is not None:
            current_object['images'] = images
        # Truthiness also guards against an empty list, which would make audios[0] raise.
        if audios:
            # Only the first clip is kept; 16000 is the sample rate written to the WAV header.
            current_object['audios'] = [numpy_to_wav_bytes(audios[0], 16000)]

        datasets.append(current_object)
    return datasets

commons.pretty_status("📦 Loading Dataset...")
# Only the first 10 records of each split are used here to keep iteration fast.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
# Fixed: the training split is read from the `.train` pickle (it previously re-read the
# `.dev` file, making train and validation identical).
with open('datas/instruct_grpo_balance.pkl.train', 'rb') as f:
    train_instruct = commons.load_image_PIL(pickle.load(f)[0:10])

with open('datas/instruct_grpo_balance.pkl.dev', 'rb') as f:
    dev_instruct = commons.load_image_PIL(pickle.load(f)[0:10])

# `tokenizer` only fills the `processor` slot, which grpo_build_datasets does not read.
train_dataset = Dataset.from_list(grpo_build_datasets(train_instruct, tokenizer))
# Fixed: validation is built from dev_instruct (it was built from train_instruct, leaving
# dev_instruct unused).
val_dataset = Dataset.from_list(grpo_build_datasets(dev_instruct, tokenizer))


| 📦 Loading Dataset... |



  0%|                                                                                                                                                                                                                 | 0/10 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 72.07it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 118.34it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 106.73it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 104.14it/s]


In [93]:
from swift.llm import (
    get_model_tokenizer, load_dataset, get_template, EncodePreprocessor, get_model_arch,
    get_multimodal_target_regex, LazyLLMDataset
)

In [94]:
# Wrap both splits in LazyLLMDataset, handing it the template's encode function and the
# shared data seed. Re-running this cell wraps the already-wrapped datasets again.
train_dataset, val_dataset = (
    LazyLLMDataset(split, template.encode, random_state=data_seed)
    for split in (train_dataset, val_dataset)
)

In [1]:
%cd /home/is/dwipraseetyo-a/NAS_HAI/Project/Qwen2.5-Omni
import pickle, os, re, random, torch
from peft import get_peft_model, LoraConfig, TaskType, get_peft_model_state_dict
from qwen_omni_utils import process_mm_info
from trl import SFTTrainer, SFTConfig, GRPOConfig, GRPOTrainer
from my_qwenwrapper import get_OmniModel
from sentence_transformers import SentenceTransformer, util

import commons
import const_variable
from my_datasets import QwenOmniFinetuneDataset

import logging, warnings
class SuppressMultipleWarnings(logging.Filter):
    """Filter out log records that start with one of a fixed set of known-noisy messages."""

    def filter(self, record):
        """Tell the logging machinery whether to keep ``record`` (True) or drop it (False)."""
        rendered = record.getMessage()
        for noisy_prefix in (
            "Trainer.tokenizer is now deprecated",
            "System prompt modified, audio output may not work as expected",
        ):
            if rendered.startswith(noisy_prefix):
                return False
        return True
# Register the noise filter on the root logger.
# NOTE(review): per the `logging` docs a logger-level filter is only applied to records
# created on that logger, not to records propagated from child loggers, and messages emitted
# via `warnings.warn` bypass it unless warnings are routed to logging — confirm this actually
# silences the two targeted messages.
logging.getLogger().addFilter(SuppressMultipleWarnings())

# Earlier attempt at silencing the same messages through the `warnings` module (disabled).
# warnings.filterwarnings("ignore", category=DeprecationWarning)
# warnings.filterwarnings("ignore", message="System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'")
# warnings.filterwarnings("ignore", message=r"Trainer\.tokenizer.*deprecated")
# Allocator setting for PyTorch's CUDA caching allocator.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


/home/ldap-users-2/dwipraseetyo-a/Project/Qwen2.5-Omni


In [2]:
# Load model and processor from the same checkpoint id. The call requests left padding,
# flash attention, 4-bit quantization and an offload folder.
omni_checkpoint = "Qwen/Qwen2.5-Omni-3B"
model, processor = get_OmniModel(
    model_path=omni_checkpoint,
    processor_path=omni_checkpoint,
    padding_side="left",
    use_flash_attention=True,
    only_processor=False,
    quantize_4bit=True,
    offload_folder="offload",
    set_eval=False,
)

Loading Processsor.... Using Left padding Side


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


Loading Model.... Using BitsAndBytesConfig
Loading Model.... Using Offload Folder
Loading Model.... Using Flash Attention


Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Read the pickled dev records, then pass them through commons.load_image_PIL.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('datas/instruct_grpo_balance.pkl.dev', 'rb') as f:
    dev_records = pickle.load(f)
dev_instruct = commons.load_image_PIL(dev_records)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 439/439 [00:02<00:00, 153.29it/s]


In [26]:
# Inspect the content parts of the user turn.
# NOTE(review): `conversation` is only assigned inside function bodies in the visible cells,
# so this relies on leftover kernel state — confirm where it is defined.
conversation[1]['content']

[{'type': 'text',
  'text': 'The patient symptoms are body weight is 59.0, shortbreath is no. Given the cough audio and symptoms, is this case Positive Tuberculosis or Negative Tuberculosis?. '},
 {'type': 'audio', 'audio_start': 2.6, 'audio_end': 5.6}]

In [31]:
# Peek at the first prepared dev record.
# NOTE(review): `dev_datasets` is not assigned in any visible cell — this relies on leftover
# kernel state; confirm where it is built.
dev_datasets[0]

{'prompt': '<|im_start|>system\nA conversation between User and Advanced medical assistant specialized in analyzing and diagnosing clinical conditions. and the Assistant determines whether the case is Positive or Negative. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer><|im_end|>\n<|im_start|>user\nThe patient symptoms are shortbreath is no, tobacco use is never, height is 152.0, chestpain is no, night sweets is no, fever is no, hiv status is neg, sex is f, cough duration is no cough. The chest x-ray metadata are Modality is CR, ImagerPixelSpacing is [0.175, 0.175], Sensitivity is 250.0, PhotometricInterpretation is MONOCHROME2, Rows is 2446, Columns is 2010, WindowCenter is 2038.0, WindowWidth is 4096.0. Considering all inputs (image, so

In [16]:
# Show the rendered chat prompt.
# NOTE(review): `prompt` is only assigned as a local inside multimodal_collate_fn in the
# visible cells — this relies on leftover kernel state.
print(prompt)

<|im_start|>system
A conversation between User and Advanced medical assistant specialized in analyzing and diagnosing clinical conditions. and the Assistant determines whether the case is Positive or Negative. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer><|im_end|>
<|im_start|>user
The patient symptoms are body weight is 59.0, shortbreath is no. Given the cough audio and symptoms, is this case Positive Tuberculosis or Negative Tuberculosis?. <|audio_bos|><|AUDIO|><|audio_eos|><|im_end|>
<|im_start|>assistant



In [21]:
# Show the reference solution text.
# NOTE(review): `solution` is not assigned at top level in any visible cell — this relies on
# leftover kernel state.
print(solution)

<think>From the given symptoms and cough sound, Let me Analyze your regrading your questions.

## ⚠️ Points to Review and Disclaimer
* No X-ray Image Provided

This is a preliminary interpretation based on given data and does not replace a comprehensive clinical evaluation. A definitive diagnosis requires a additional clinical evaluation, including the physical examination findings, Cough Sound, Auscultation Sound, and imaging studies.
## 📋 Observations
**Symptoms:**
*   The symptoms provided do not support a diagnosis of Tuberculosis (TB). The patient has a productive cough lasting only 1-2 weeks, which is relatively short for TB, as the cough typically persists for months in TB cases. Additionally, the absence of hemoptysis, chest pain, shortness of breath, fever, night sweats, and weight loss further reduces the likelihood of TB, which often presents with these symptoms. The patient's negative HIV status and a healthy body mass index also support the absence of TB, given the higher 

In [None]:
def multimodal_collate_fn(batch):
    """Render each sample's conversation into a chat prompt string.

    For every sample, each audio element of the user turn (``messages[1]``)
    that carries a path gets a randomly chosen 3-second window written into
    ``audio_start``/``audio_end``; the conversation is then rendered with
    ``processor.apply_chat_template``. The window is written to a deep copy,
    so the samples in ``batch`` are left untouched.

    Args:
        batch: Iterable of samples, each a mapping with a ``"messages"`` list.

    Returns:
        A dict with keys ``"image"``, ``"prompts"`` and ``"solution"``, all of
        which currently refer to the same list of rendered prompts.
    """
    import copy  # local import: `copy` is not imported by this cell's import block

    prompts = []
    for sample in batch:
        conversation = copy.deepcopy(sample["messages"])
        for ele in conversation[1]['content']:
            if ele["type"] == "audio" and ("audio" in ele or "audio_url" in ele):
                path = ele.get("audio", ele.get("audio_url"))
                start_sec, end_sec = commons.random_3sec_segment(path, segment_duration=3.0)
                ele["audio_start"] = float(start_sec)
                ele["audio_end"] = float(end_sec)
        # Fixed: the rendered prompt used to be computed and then discarded, so the
        # function always returned empty lists.
        prompts.append(processor.apply_chat_template(conversation, add_generation_prompt=True))
    # NOTE(review): "image" and "solution" still alias the prompt list, as in the original
    # placeholder; wire in the real image and solution fields once the sample schema is settled.
    return {
        "image": prompts,
        "prompts": prompts,
        "solution": prompts,
    }

import re
from typing import Optional

def format_reward(completions, **kwargs):
    """Reward completions that follow the ``<think>…</think><answer>…</answer>`` layout.

    Args:
        completions: Generated completions, one per sample. Each item is either
            a plain string or a conversational completion, i.e. a list of
            message dicts whose first entry holds the text under ``"content"``.
        **kwargs: Ignored; accepted so callers may pass extra columns.

    Returns:
        A list of floats: ``1.0`` where the entire completion (ignoring
        surrounding whitespace) matches the layout, ``0.0`` otherwise.
    """
    # Whitespace around the tags is optional: the system prompt used in this notebook shows
    # the tags inline (`<think> ... </think><answer> ... </answer>`), which the previous
    # newline-only pattern scored as 0. Newline-separated tags still match.
    pattern = re.compile(r"<think>.*?</think>\s*<answer>.*?</answer>", re.DOTALL)

    rewards = []
    for completion in completions:
        text = completion if isinstance(completion, str) else completion[0]["content"]
        # fullmatch (without re.MULTILINE) rejects trailing text after </answer>, which the
        # old `$` anchor let through whenever it followed a line break.
        rewards.append(1.0 if pattern.fullmatch(text.strip()) else 0.0)
    return rewards

def accuracy_reward(completions: list[list[dict[str, str]]], solution: list[str], **kwargs) -> list[Optional[float]]:
    """Reward completions whose final answer equals the reference answer.

    Both the completion and the reference are reduced to the text inside their
    first ``<answer>…</answer>`` block when one is present (otherwise the whole
    text is used), then compared after stripping whitespace and lower-casing.
    Comparing only the answer matters because the reference solutions also
    contain ``<think>`` reasoning, which a generated completion would never
    reproduce verbatim.

    Args:
        completions: Generated completions, one per sample. Each item is either
            a plain string or a conversational completion (a list of message
            dicts whose first entry holds the text under ``"content"``).
        solution: Reference texts, aligned with ``completions``.
        **kwargs: Ignored; accepted so callers may pass extra columns.

    Returns:
        A list of floats, ``1.0`` for a match and ``0.0`` otherwise. Pairs are
        formed with ``zip``, so surplus items on either side are ignored.
    """
    answer_re = re.compile(r"<answer>(.*?)</answer>", re.DOTALL | re.IGNORECASE)

    def _normalized_answer(item) -> str:
        # Accept both plain strings and conversational completions.
        text = item if isinstance(item, str) else item[0]["content"]
        found = answer_re.search(text)
        if found:
            text = found.group(1)
        return text.strip().lower()

    return [
        float(_normalized_answer(completion) == _normalized_answer(sol))
        for completion, sol in zip(completions, solution)
    ]

##########################################################################################################
commons.pretty_status("🧠 Loading Model...")

# LoRA settings: rank 8, alpha 32, dropout 0.05, applied to the q_proj and v_proj modules only.
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, 
                        inference_mode=False, 
                        r=8, 
                        lora_alpha=32, 
                        lora_dropout=0.05, 
                        target_modules=["q_proj", "v_proj"])
                        #target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"])

# Load model and processor from the same checkpoint id; the call requests left padding,
# flash attention, 4-bit quantization and an offload folder.
model, processor = get_OmniModel(model_path="Qwen/Qwen2.5-Omni-3B", processor_path="Qwen/Qwen2.5-Omni-3B", padding_side="left",
                                use_flash_attention=True, only_processor=False, quantize_4bit=True, 
                                offload_folder="offload", set_eval=False)

###
# TODO: consider fine-tuning the audio and image encoders directly (without PEFT), or
# extending the LoRA target modules to cover those encoders.

# NOTE(review): the model is loaded with quantize_4bit=True but the k-bit preparation call
# below is disabled — confirm that is intended.
#model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
# Wrap the model with the LoRA config and print the trainable parameter count.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()
# NOTE(review): per the PEFT docs, unload() returns the base model with the LoRA modules
# removed, so `model` no longer carries the adapter whose parameter count was just printed.
# Confirm this is intended (e.g. because peft_config is handed to the trainer separately).
model = peft_model.unload()
del peft_model

##########################################################################################################
commons.pretty_status("📦 Loading Dataset...")


def _read_instruct(pickle_path):
    """Unpickle the records at ``pickle_path`` and pass them through commons.load_image_PIL."""
    # NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
    with open(pickle_path, 'rb') as f:
        return commons.load_image_PIL(pickle.load(f))


train_instruct = _read_instruct('datas/instruct_grpo_balance.pkl.train')
dev_instruct = _read_instruct('datas/instruct_grpo_balance.pkl.dev')

# Build one fine-tuning dataset per split with the shared processor.
train_dataset, dev_dataset = (
    QwenOmniFinetuneDataset(records, processor, use_audio_in_video=False)
    for records in (train_instruct, dev_instruct)
)
print(train_dataset[0])
# NOTE(review): `instruct_array` is not assigned in any visible cell, and `Dataset` is not in
# this cell group's imports — confirm where both come from.
dataset = Dataset.from_list(instruct_array)