In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
print("CUDA_VISIBLE_DEVICES:", os.environ["CUDA_VISIBLE_DEVICES"])

import torch
device_name = torch.cuda.get_device_name(0)
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"Logical index: {i}, Name: {props.name}")

CUDA_VISIBLE_DEVICES: 1
Logical index: 0, Name: NVIDIA GeForce RTX 3090


In [None]:
import pickle, os
from PIL import Image
from tqdm import tqdm
import numpy as np
import pickle
import librosa

import torch
from torch.utils.data import Dataset
from transformers import TrainingArguments, Qwen2_5OmniThinkerForConditionalGeneration, Qwen2_5OmniProcessor
from peft import get_peft_model, LoraConfig, TaskType
from trl import SFTTrainer
from qwen_omni_utils import process_mm_info

from transformers import Trainer, TrainingArguments
from datetime import datetime
from trl import SFTTrainer

import torch
import torch.nn.functional as F

def load_image_PIL(loaded_object):
    for obj in tqdm(loaded_object):
        for message in obj.get("messages", []):
            for content in message.get("content", []):
                if isinstance(content, dict) and "image" in content:
                    img_data = content["image"]
                    content["image"] = Image.fromarray(np.load(img_data)).convert("L")
    return loaded_object

class QwenOmniFinetuneDataset(Dataset):
    def __init__(self, data, processor, use_audio_in_video=False):
        self.data = data
        self.processor = processor
        self.use_audio_in_video = use_audio_in_video

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        conversation = self.data[idx]["messages"]
        return conversation

def collate_fn(conversation):
    text = self.processor.apply_chat_template(conversation, add_generation_prompt=False, tokenize=False) # add_generation_prompt?
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=self.use_audio_in_video)
    batch = self.processor(text=text, audio=audios, images=images, videos=videos, 
                            return_tensors="pt", padding=True, use_audio_in_video=self.use_audio_in_video)
    

        # for k in inputs:
        #     inputs[k] = inputs[k][0]
        inputs
        inputs['use_audio_in_video'] = self.use_audio_in_video
        inputs["labels"] = inputs["input_ids"].clone()
# def default_collator(batch):
#     return {k: torch.nn.utils.rnn.pad_sequence([x[k] for x in batch], batch_first=True) if isinstance(batch[0][k], torch.Tensor) else [x[k] for x in batch] for k in batch[0]}


In [None]:
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    torch_dtype=None,
    device_map="auto",
    quantization_config=bnb_config,
    attn_implementation="flash_attention_2",
)
model = prepare_model_for_kbit_training(model) 

peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=128, lora_alpha=256, lora_dropout=0, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"])
model.enable_input_require_grads()
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:12<00:00,  2.56s/it]


trainable params: 412,172,288 || all params: 9,343,986,176 || trainable%: 4.4111


In [5]:
with open('/home/is/dwipraseetyo-a/NAS_HAI/Datasets/cidrz/instruct.pkl.train', 'rb') as f:
    train_instruct = load_image_PIL(pickle.load(f))

with open('/home/is/dwipraseetyo-a/NAS_HAI/Datasets/cidrz/instruct.pkl.dev', 'rb') as f:
    dev_instruct = load_image_PIL(pickle.load(f))

with open('/home/is/dwipraseetyo-a/NAS_HAI/Datasets/cidrz/instruct.pkl.test', 'rb') as f:
    test_instruct = load_image_PIL(pickle.load(f))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3952/3952 [00:27<00:00, 141.64it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 496/496 [00:03<00:00, 138.30it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 496/496 [00:03<00:00, 128.93it/s]


In [49]:
train_dataset = QwenOmniFinetuneDataset(train_instruct, processor, use_audio_in_video=False)
dev_dataset = QwenOmniFinetuneDataset(dev_instruct, processor, use_audio_in_video=False)
test_dataset = QwenOmniFinetuneDataset(test_instruct, processor, use_audio_in_video=False)

In [54]:
training_args = TrainingArguments(
    output_dir="./outputs/try1",
    num_train_epochs=3,
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    gradient_accumulation_steps = 4,
    warmup_ratio = 0.1,
    logging_dir='./outputs/try1/logs',
    learning_rate = 1e-5,
    logging_steps = 1,
    eval_steps=75,
    save_strategy="steps",
    save_steps=100,
    max_grad_norm=10.0,
    fp16 = not torch.cuda.is_bf16_supported(),
    gradient_checkpointing=True,
    bf16 = torch.cuda.is_bf16_supported(),
    optim = "adamw_8bit",
    weight_decay = 0.001,
    #seed = 3407,
    lr_scheduler_type = "cosine",
    remove_unused_columns=False,  # needed for multimodal input
    #load_best_model_at_end=True,
    report_to=["tensorboard"],
    run_name=f"{datetime.now().strftime('%m-%d-%H-%M')}"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=collate_fn,
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.23 GiB. GPU 0 has a total capacity of 23.57 GiB of which 2.00 GiB is free. Including non-PyTorch memory, this process has 21.52 GiB memory in use. Of the allocated memory 17.91 GiB is allocated by PyTorch, and 3.30 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)