In [1]:
import os

# Prepend the CUDA 12.1 and system library directories to LD_LIBRARY_PATH,
# keeping whatever value was already present in the environment.
# NOTE(review): the dynamic linker may only read LD_LIBRARY_PATH at process
# start-up — confirm that changing it from inside the kernel has the intended effect.
existing_library_path = os.environ.get('LD_LIBRARY_PATH', '')
os.environ['LD_LIBRARY_PATH'] = (
    "/share/softwares/cuda_cudnn/cuda-12.1/lib64:/lib/x86_64-linux-gnu:"
    + existing_library_path
)


# Dataset

In [2]:
import numpy as np

In [3]:
import datasets
import os

# Root directory of the PixMo datasets that are read with `load_from_disk` below.
PIXMO_DATASETS = '/share/users/shehan/workspace_pointing_lmm/data/torch_datasets/pixmo_datasets'
split = 'train'

# Load the on-disk "points-pointing" dataset and keep only the requested split.
points_pointing_dir = os.path.join(PIXMO_DATASETS, "points-pointing")
pixmo_points_dataset = datasets.load_from_disk(points_pointing_dir)[split]

In [4]:
pixmo_points_dataset

Dataset({
    features: ['image_url', 'image', 'points', 'count', 'label', 'collection_method'],
    num_rows: 127405
})

In [5]:
len(pixmo_points_dataset)

127405

In [6]:
# Pick a single raw PixMo record; the cells below prototype the conversion on it.
idx = 0
ex = pixmo_points_dataset[idx]

In [7]:
ex

{'image_url': 'http://fahum.umsu.ac.id/wp-content/uploads/2019/04/KARATE.jpg',
 'image': '../data/torch_datasets/pixmo_images/c0d2bd4805501b23e1cec10ef72a6047267d997bc3d5614892f5827dbb2d173e',
 'points': [[{'x': 34.277329420396185, 'y': 20.7860838929869}],
  [{'x': 92.09791441294121, 'y': 12.201521414953659},
   {'x': 80.82719062114771, 'y': 9.543764275062763},
   {'x': 50.87926740295353, 'y': 66.08150706910546},
   {'x': 83.88638707892022, 'y': 74.29639277422278}],
  [{'x': 17.228106367455783, 'y': 94.59199275157144}],
  [{'x': 42.34571938916702, 'y': 11.718292844064406}],
  [{'x': 23.346499283000828, 'y': 46.99397851897994},
   {'x': 39.447533271277265, 'y': 54.725635653207995},
   {'x': 59.734836096505575, 'y': 48.68527851709232},
   {'x': 74.70879770560266, 'y': 49.16850708798158}],
  [{'x': 20.287302825228306, 'y': 41.67846423919814},
   {'x': 34.61722307479433, 'y': 41.67846423919814},
   {'x': 54.58250522025712, 'y': 39.020707099307245},
   {'x': 71.00555988829908, 'y': 37.57102

In [8]:
# Aliased on purpose: a later cell imports a different `Image` from `datasets`,
# and sharing one bare name between the two makes re-running cells order-dependent.
from PIL import Image as PILImage

# Every label of this record refers to the same picture, so open the file once
# and share the object instead of re-opening it (and holding a separate file
# handle) for each label.
source_image = PILImage.open(ex["image"])

# One training row per (label, points) pair of the raw record.
data_list = [
    dict(
        image=source_image,
        label=label,
        # (num_points, 2) array of [x, y] pairs built from the list of {'x', 'y'} dicts.
        points=np.stack([[p["x"] for p in points], [p["y"] for p in points]], -1),
        # Coordinates in the sample record fall within 0-100; presumably percentages
        # of the image size — TODO confirm.
        point_scale=100,
        style='point_count',
    )
    for label, points in zip(ex["label"], ex["points"])
]
    


In [9]:
# data_list

In [10]:
from datasets import Dataset, Features, Value, Sequence
# Aliased so this import does not rebind the PIL `Image` name used by the
# cell that builds `data_list`.
from datasets import Image as DatasetImage

# Schema of the fine-tuning dataset built from `data_list`.
features = Features({
    'image': DatasetImage(),  # Handles PIL images, numpy arrays, or image file paths.
    'points': Sequence(feature=Sequence(Value('float32'))),  # Nested sequences for (num_points, 2)
    'label': Value('string'),
    'point_scale': Value('int64'),
    'style': Value('string'),
})

# data_list is a list where each element is a dict with the keys
# "image", "label", "points", "point_scale" and "style".
dataset = Dataset.from_list(data_list, features=features)



In [11]:
dataset

Dataset({
    features: ['image', 'label', 'points', 'point_scale', 'style'],
    num_rows: 16
})

# Model

In [12]:

import os

# GPU selection and debugging switches for this kernel.
# NOTE(review): these presumably must be set before CUDA is first initialised
# to take effect — confirm that nothing above this cell touches the GPU.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '3',
    # same as `export CUDA_LAUNCH_BLOCKING=1` in the shell
    'CUDA_LAUNCH_BLOCKING': '1',
})

In [13]:
import torch

from molmo.model import MolmoForCausalLM
from molmo.preprocessor import MolmoProcessor

In [14]:
# Checkpoint identifier passed to every `from_pretrained` call below (processor, model, tokenizer).
model_id = "allenai/Molmo-7B-D-0924"

In [15]:

# Load the processor with the MolmoProcessor class imported from `molmo.preprocessor`.
# (An earlier variant used transformers.AutoProcessor with the same model id and
# torch_dtype='auto'.)
processor = MolmoProcessor.from_pretrained(
    model_id,
    device_map='auto',
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

In [16]:
USE_QLORA = True

from transformers import BitsAndBytesConfig


# Arguments shared by the quantized and the non-quantized load.
# NOTE(review): weights/compute use float16 here while the processor is loaded
# with bfloat16 and training is configured with bf16=True — confirm the mix is intended.
model_load_kwargs = dict(
    torch_dtype=torch.float16,
    device_map="auto",
)

if USE_QLORA:
    # 4-bit NF4 quantization with float16 compute.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    model_load_kwargs["quantization_config"] = bnb_config

# load the model
model = MolmoForCausalLM.from_pretrained(model_id, **model_load_kwargs)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [17]:
from peft import LoraConfig, get_peft_model

# LoRA hyper-parameters. Adapters are attached to modules named "att_proj" and
# "ff_proj"; per the original author's note this limits LoRA to the
# transformer (LLM) portion of the model.
lora_config = LoraConfig(
    target_modules=["att_proj", "ff_proj"],
    r=8,                    # LoRA rank (adjust as needed)
    lora_alpha=8,           # scaling factor
    lora_dropout=0.1,       # dropout probability for LoRA layers
    task_type="CAUSAL_LM",  # causal language modeling
)

# NOTE(review): this rebinds `model`, so re-running the cell would wrap an
# already-wrapped model a second time — reload the base model before re-running.
model = get_peft_model(model, lora_config)

# Report the trainable-parameter count to verify which part of the model is adapted.
model.print_trainable_parameters()


trainable params: 11,124,736 || all params: 8,032,150,016 || trainable%: 0.1385


In [18]:
from transformers import AutoTokenizer, TrainingArguments, Trainer
# Stand-alone tokenizer for the same checkpoint; used below for its chat template
# (get_prompt) and for padding (collate_fn).
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Training

In [19]:
# # preprocess function

# def process(examples):
#     texts = [f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<|image|>{example['question']} Answer briefly. <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{example['multiple_choice_answer']}<|eot_id|>" for example in examples]
#     images = [[example["image"].convert("RGB")] for example in examples]

#     batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
#     labels = batch["input_ids"].clone()
#     labels[labels == processor.tokenizer.pad_token_id] = -100 
#     labels[labels == 128256] = -100 # image token index
#     batch["labels"] = labels
#     batch = batch.to(torch.bfloat16).to("cuda")

#     return batch

In [20]:
def get_prompt(label, points):
    """Render one user/assistant exchange as a single training prompt.

    The user turn asks the model to point to `label`; the assistant turn is
    `str(points)`. Formatting is delegated to `processor.apply_chat_template`
    with the tokenizer's chat template and no trailing generation prompt.

    Args:
        label: Name of the object to point to.
        points: Target points; inserted verbatim through `str()`.

    Returns:
        The value returned by `processor.apply_chat_template` for the
        two-turn conversation.
    """
    user_turn = {
        "role": "user",
        "content": f"Point to {label}.",  # TODO
    }
    assistant_turn = {
        "role": "assistant",
        "content": str(points),  # TODO
    }

    return processor.apply_chat_template(
        [user_turn, assistant_turn],
        chat_template=tokenizer.chat_template,
        add_generation_prompt=False,
    )

In [21]:
get_prompt("car", [[34.27732849121094, 20.786083221435547]])

'User: Point to car. Assistant: [[34.27732849121094, 20.786083221435547]]'

In [22]:
tokenizer.chat_template

"{% for message in messages -%}\n        {%- if (loop.index % 2 == 1 and message['role'] != 'user') or \n          (loop.index % 2 == 0 and message['role'].lower() != 'assistant') -%}\n        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n        {%- endif -%}\n        {{ message['role'].capitalize() + ': ' + message['content'] }}\n        {%- if not loop.last -%}\n        {{ ' ' }}\n        {%- endif %}\n        {%- endfor -%}\n        {%- if add_generation_prompt -%}\n        {{ ' Assistant:' }}\n        {%- endif %}"

In [23]:
# Special-token strings; used below as keys into `processor.special_token_ids`
# when masking labels.
IMAGE_PROMPT = "<|image|>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_COL_TOKEN = "<im_col>"

In [24]:
def process(example):
    """Convert one dataset row into model inputs plus training labels.

    Builds the prompt with `get_prompt`, runs `processor.process` on the prompt
    (and the image, when one is present), adds a leading batch dimension of
    size 1 to every returned tensor, and derives `labels` from `input_ids`.

    Args:
        example: Row with at least the keys "label", "points" and "image".

    Returns:
        dict of tensors as produced by `processor.process`, each with an extra
        leading dimension, plus a "labels" entry. Labels are a copy of
        `input_ids` in which pad tokens and the image special tokens are
        replaced by -100.

    Notes:
        Tensors are left on the device `processor.process` returns them on.
        This function is applied through `Dataset.map`, which stores results
        on the host, so moving them to the GPU here only costs GPU memory and
        an extra round trip; the mapped dataset is moved to CUDA afterwards
        via `with_format("torch", device="cuda")`.
    """
    prompt = get_prompt(example["label"], example["points"])
    image = example["image"]

    # Explicit None check: only a missing image should select the text-only path.
    if image is not None:
        inputs = processor.process(images=[image], text=prompt)
    else:
        inputs = processor.process(text=prompt)

    # Add the batch dimension; collate_fn relies on it (it indexes
    # input_ids[0] and concatenates the image tensors along dim 0).
    inputs = {k: v.unsqueeze(0) for k, v in inputs.items()}

    labels = inputs["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    # Image-related special tokens must not contribute to the loss.
    for special_token in (
        IMAGE_PROMPT,
        DEFAULT_IMAGE_PATCH_TOKEN,
        DEFAULT_IM_START_TOKEN,
        DEFAULT_IM_END_TOKEN,
        DEFAULT_IM_COL_TOKEN,
    ):
        labels[labels == processor.special_token_ids[special_token]] = -100
    inputs["labels"] = labels

    return inputs

In [25]:
processor.special_token_ids

{'<im_start>': 152064,
 '<im_end>': 152065,
 '<im_patch>': 152066,
 '<im_col>': 152067,
 '<|image|>': 152068}

In [26]:
def collate_fn(examples):
    '''
    Collate pre-processed examples into one training batch on the GPU.

    Each element of `examples` has the keys 'input_ids', 'images',
    'image_input_idx', 'image_masks' and 'labels', and each tensor carries a
    leading batch dimension of size 1 (added by `process`).

    Shapes of the returned batch as recorded by the original author:

        input_ids torch.Size([B, 1249])
        images torch.Size([B, 13, 576, 588])
        image_input_idx torch.Size([B, 13, 144])
        image_masks torch.Size([B, 13, 576])
        labels torch.Size([B, 1249])

    The per-example 'labels' are not used: labels are rebuilt from the padded
    input_ids so that padding positions are masked as well.

    NOTE(review): image_input_idx presumably holds positions into input_ids;
    if the tokenizer pads on the left those positions would shift for the
    shorter sequences in a batch — verify the padding side before using B > 1.
    '''
    # Pad the token sequences to a common length ([0] strips the batch dim).
    padded_inputs = tokenizer.pad(
        {
            "input_ids": [example["input_ids"][0] for example in examples],
        },
        padding=True,
        return_tensors="pt",
    )

    # Labels mirror input_ids, with padding and image special tokens ignored (-100).
    labels = padded_inputs["input_ids"].clone()
    labels[labels == tokenizer.pad_token_id] = -100

    # TODO
    for special_token in (
        IMAGE_PROMPT,
        DEFAULT_IMAGE_PATCH_TOKEN,
        DEFAULT_IM_START_TOKEN,
        DEFAULT_IM_END_TOKEN,
        DEFAULT_IM_COL_TOKEN,
    ):
        labels[labels == processor.special_token_ids[special_token]] = -100

    padded_inputs["labels"] = labels

    # Image tensors already have a leading batch dim of 1, so concatenating
    # along dim 0 stacks them into a batch.
    for key in ("images", "image_input_idx", "image_masks"):
        padded_inputs[key] = torch.cat([example[key] for example in examples], 0)

    # Only a device move here: casting a BatchEncoding to a dtype is not
    # supported (the earlier `.to(torch.bfloat16)` call just logged a warning
    # and changed nothing), and the integer id/index tensors must keep their dtype.
    return padded_inputs.to("cuda")
    

In [27]:
# Run `process` on every row (one example at a time), drop the raw columns,
# and have rows returned as torch tensors on the GPU.
dataset_mapped = (
    dataset
    .map(process, batched=False, batch_size=4, remove_columns=dataset.column_names)
    .with_format("torch", device="cuda")
)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [28]:
# process(dataset[0])['input_ids'].to(int)

In [29]:
# dataset_mapped

In [30]:
dataset_mapped[0]

{'input_ids': tensor([[151643, 152064, 152066,  ...,   5053,  21388,     25]],
        device='cuda:0'),
 'images': tensor([[[[-1.7923, -1.7521, -1.4802,  ..., -1.7923, -1.7521, -1.4802],
           [-1.7923, -1.7521, -1.4802,  ..., -1.7923, -1.7521, -1.4802],
           [-1.7923, -1.7521, -1.4802,  ..., -1.7923, -1.7521, -1.4802],
           ...,
           [-1.7923, -1.7521, -1.4802,  ..., -1.7923, -1.7521, -1.4802],
           [-1.7923, -1.7521, -1.4802,  ..., -1.7923, -1.7521, -1.4802],
           [-1.7923, -1.7521, -1.4802,  ..., -1.7923, -1.7521, -1.4802]],
 
          [[-1.7923, -1.7521, -1.4802,  ..., -1.7923, -1.7521, -1.4802],
           [-1.7923, -1.7521, -1.4802,  ..., -1.7923, -1.7521, -1.4802],
           [-1.7923, -1.7521, -1.4802,  ..., -1.7923, -1.7521, -1.4802],
           ...,
           [ 1.8865, -0.2513, -0.7550,  ...,  1.9019, -0.1943, -0.7327],
           [ 1.8772, -0.2609, -0.7641,  ...,  1.9119, -0.2261, -0.7241],
           [ 1.8865, -0.2513, -0.7550,  ...,  1

In [31]:
# Labels of the first processed example; their value range is inspected in the next cell.
labels = dataset_mapped[0]['labels']


In [32]:
# Sanity check: are the label ids of one example inside the tokenizer's vocabulary?

# Overall range of the labels tensor (includes the -100 ignore markers).
min_label = labels.min().item()
max_label = labels.max().item()
print("Overall minimum label value:", min_label)
print("Overall maximum label value:", max_label)

# Upper bound used for the check.
# NOTE(review): the image special tokens listed by processor.special_token_ids
# have ids above tokenizer.vocab_size, so this may not be the model's full
# output size — confirm it is the bound you want to check against.
vocab_size = tokenizer.vocab_size
print("Vocabulary size:", vocab_size)

# Drop the ignore_index values (-100); only the rest contribute to the loss.
valid_labels = labels[labels != -100]

if valid_labels.numel() == 0:
    # Nothing to range-check; do not also report "all within range".
    print("No valid labels found (all labels might be set to ignore index).")
else:
    valid_min = valid_labels.min().item()
    valid_max = valid_labels.max().item()
    print("Minimum valid label value:", valid_min)
    print("Maximum valid label value:", valid_max)

    # Flag any remaining label outside [0, vocab_size - 1].
    has_out_of_range = bool(((valid_labels < 0) | (valid_labels >= vocab_size)).any())
    if has_out_of_range:
        print("There are label values out of the valid range [0, {}].".format(vocab_size - 1))
    else:
        print("All label values are within the valid range [0, {}].".format(vocab_size - 1))

Overall minimum label value: -100
Overall maximum label value: 21388
Vocabulary size: 151643
Minimum valid label value: 11
Maximum valid label value: 21388
All label values are within the valid range [0, 151642].


# Training Loop

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./lora-molmo-pixmo",

    # --- schedule / optimisation ---
    num_train_epochs=10,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    warmup_steps=1,
    weight_decay=1e-6,
    adam_beta2=0.999,
    optim="adamw_torch",  # alternative tried earlier: "adamw_hf"
    bf16=True,

    # --- data handling ---
    # collate_fn reads the image columns, so they must reach it untouched.
    remove_unused_columns=False,
    # presumably disabled because the dataset already yields CUDA tensors — TODO confirm
    dataloader_pin_memory=False,

    # --- checkpoints / hub ---
    # NOTE(review): push_to_hub=True is combined with save_strategy="no" —
    # confirm that uploading from this run is intended.
    save_strategy="no",  # TODO: change to 'steps' or 'epoch'
    save_total_limit=1,
    push_to_hub=True,

    # --- logging ---
    logging_strategy="steps",
    logging_steps=10,
    report_to="wandb",
)

In [34]:
from transformers import Trainer

# Wire the LoRA-wrapped model, the processed dataset and the custom collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=collate_fn,
    train_dataset=dataset_mapped,
)

[2025-02-24 20:58:38,601] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [35]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mshehanmunasinghe[0m ([33mshehanmunasinghe-mbzuai[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111295639226834, max=1.0)…

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Attempting to cast a BatchEncoding to type torch.bfloat16. This is not supported.
Attempting to cast a BatchEncoding to type torch.bfloat16. This is not supported.


>> images.shape torch.Size([4, 13, 576, 588])
>> image_features.shape torch.Size([4, 13, 576, 2048])
>> cls_embed.shape torch.Size([4, 13, 2048])


OutOfMemoryError: CUDA out of memory. Tried to allocate 754.00 MiB. GPU 0 has a total capacity of 39.50 GiB of which 697.88 MiB is free. Including non-PyTorch memory, this process has 38.79 GiB memory in use. Of the allocated memory 38.01 GiB is allocated by PyTorch, and 292.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# model.dtype