In [None]:
import sys
from pathlib import Path
import os

%cd ./nanovlm-lab/
project_root = Path.cwd()

nanovlm_root = project_root / "nanovlm"

# Add paths in the correct order
if str(nanovlm_root) not in sys.path:
    sys.path.insert(0, str(nanovlm_root))  # Add nanoVLM FIRST
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))  # Then add project root

# Change working directory to nanovlm so relative imports work
os.chdir(nanovlm_root)

print(f"Project root: {project_root}")
print(f"NanoVLM root: {nanovlm_root}")
print(f"Current directory: {os.getcwd()}")

/workspace/RL-VLM-Lab
Project root: /workspace/RL-VLM-Lab
NanoVLM root: /workspace/RL-VLM-Lab/nanovlm
Current directory: /workspace/RL-VLM-Lab/nanovlm


In [2]:
from nanovlm.data.datasets import VQADataset
from nanovlm.data.collators import VQACollator
from nanovlm.data.data_utils import synchronized_dataloader_step
from nanovlm.data.advanced_datasets import ConstantLengthDataset
from nanovlm.data.processors import get_image_processor, get_tokenizer

import nanovlm.models.config as config
from nanovlm.models.vision_language_model import VisionLanguageModel

# Libraries
import math
import time
import torch
from tqdm import tqdm
import torch.optim as optim
import matplotlib.pyplot as plt
from dataclasses import dataclass, field
from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets, get_dataset_config_names

# Setting this before any tokenizer work avoids the tokenizers parallelism warning.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Pick the best available backend: CUDA first, then Apple MPS, then plain CPU.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using device: {device}")

# Fixed seeds so repeated runs start from the same RNG state.
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

# Enable IPython's autoreload extension in mode 2 so that edited modules are
# re-imported before each cell runs; no kernel restart is needed after changing
# project code.
%reload_ext autoreload
%autoreload 2

Using device: cuda


In [3]:
from datasets import load_dataset
# Only the first 5% of the train split is loaded to keep the experiment small.
dataset_id = 'lmms-lab/multimodal-open-r1-8k-verified'
dataset = load_dataset(dataset_id, split='train[:5%]')

# Hold out 20% of that slice for evaluation; the fixed seed keeps the split stable.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = split_dataset['train'], split_dataset['test']

In [4]:
# Instruction prefix placed in front of every question (one sentence per literal).
user_text = (
    "A conversation between User and Assistant. "
    "The user asks a question, and the Assistant solves it. "
    "The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. "
    "The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
    "<think> reasoning process here </think><answer> answer here </answer>"
)
# Preview the full user turn for the first training example.
f"{user_text}. {train_dataset[0]['problem']}"

'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. Based on the image, determine the constant term after combining all the polynomial expressions representing the side lengths of the triangle. Choose the correct answer from the options provided.\n\nChoices:\nA. 3\nB. 5\nC. 8\nD. 13'

In [5]:
# Then convert to RGB
def convert_to_rgb(example):
    """Ensure the example's "image" entry is in RGB mode.

    Args:
        example: mapping with an "image" entry that exposes ``.mode`` and
            ``.convert(mode)``.

    Returns:
        The same ``example`` object. Its "image" entry is left as-is when the
        mode is already "RGB", otherwise replaced by ``image.convert("RGB")``.
    """
    image = example["image"]
    example["image"] = image if image.mode == "RGB" else image.convert("RGB")
    return example

# Normalise images to RGB in BOTH splits. The held-out split is used for
# evaluation and for the inference demo, so it should get the same image
# normalisation as the training split.
train_dataset = train_dataset.map(convert_to_rgb)
test_dataset = test_dataset.map(convert_to_rgb)

def convert_fn(batch):
    """Build the "texts" column for a batch of examples.

    Args:
        batch: mapping with parallel "problem" and "solution" sequences of
            strings (the batched form passed by ``Dataset.map``).

    Returns:
        ``{"texts": [...]}`` with one entry per example. Each entry is a
        single-element list holding a dict with a "user" message (instruction
        prefix + ". " + problem) and an "assistant" message
        ("Answer: " + solution).

    NOTE(review): the instruction asks for <think>/<answer> tags while the
    target is prefixed with "Answer: " - confirm this is the intended format.
    """
    instruction = (
        "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
        "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
        "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
        "<think> reasoning process here </think><answer> answer here </answer>"
    )

    return {
        "texts": [
            # One conversation per example: a single dict carrying both turns.
            [{"user": instruction + ". " + question, "assistant": f"Answer: {solution}"}]
            for question, solution in zip(batch["problem"], batch["solution"])
        ]
    }
# Add the "texts" column to both splits with identical map settings
# (batched calls, 4 worker processes).
train_dataset, test_dataset = (
    split.map(convert_fn, batched=True, num_proc=4)
    for split in (train_dataset, test_dataset)
)

In [7]:
from nanovlm.data.processors import get_tokenizer, get_image_processor
from nanovlm.models.vision_language_model import VisionLanguageModel
from rlvlm.nanovlm_sft_trainer import create_sft_dataset, NanoVLMSFTConfig, NanoVLMSFTTrainer, NanoVLMSFTDataCollator
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

# Pretrained nanoVLM checkpoint that the LoRA adapters are attached to below.
model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-230M-8k")

# Tokenizer and image processor are both built from values stored on model.cfg.
tokenizer = get_tokenizer(
    model.cfg.lm_tokenizer,
    model.cfg.vlm_extra_tokens,
    model.cfg.lm_chat_template,
)
image_processor = get_image_processor(
    model.cfg.max_img_size,
    model.cfg.vit_img_size,
    model.cfg.resize_to_max_side_len,
)

# LoRA (DoRA disabled) on the listed projection modules, Gaussian-initialised.
peft_config = LoraConfig(
    target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    init_lora_weights="gaussian",
    use_dora=False,
)
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()


Resize to max side len: True
trainable params: 4,331,520 || all params: 232,395,456 || trainable%: 1.8639


In [8]:
# Wrap the prepared splits for SFT. The results get their own names so that
# (a) re-running this cell does not feed an already-wrapped dataset back into
# create_sft_dataset, and (b) the original splits stay available to later cells
# that index them for "image" / "problem".
sft_train_dataset = create_sft_dataset(train_dataset, tokenizer, image_processor)
sft_eval_dataset = create_sft_dataset(test_dataset, tokenizer, image_processor)

# Configure training.
# Named `sft_config` rather than `config` so it does not rebind the
# `nanovlm.models.config` module that an earlier cell imports as `config`.
sft_config = NanoVLMSFTConfig(
    output_dir="./sft-output",
    lr_mp=5e-3,
    lr_vision_backbone=0,  # Freeze vision
    lr_language_backbone=5e-5,
    bf16=True,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=10,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
)

# Create trainer and train
trainer = NanoVLMSFTTrainer(
    model=peft_model,
    args=sft_config,
    train_dataset=sft_train_dataset,
    eval_dataset=sft_eval_dataset,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
import argparse
import torch
from PIL import Image
from peft import PeftModel

from models.vision_language_model import VisionLanguageModel
from data.processors import get_tokenizer, get_image_processor, get_image_string
# Seed and pick the inference device (CUDA, then MPS, then CPU).
torch.manual_seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(0)
    device = torch.device("cuda")
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Directory holding the LoRA adapter weights.
# NOTE(review): the training cell sets output_dir="./sft-output"; confirm the
# adapter actually ends up under this path before loading.
source = "./checkpoints"
print(f"Loading weights from: {source}")

# LoRA fine-tuned variant: load the base model, then attach the adapter from
# `source` so the path that is printed is the path that is loaded.
base_model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-230M-8k")
model = PeftModel.from_pretrained(base_model, source).to(device)
model.eval()  # Set to evaluation mode

# Build the tokenizer and image processor from the model config.
tokenizer = get_tokenizer(model.cfg.lm_tokenizer, model.cfg.vlm_extra_tokens, model.cfg.lm_chat_template)
# Fall back to False when the config has no `resize_to_max_side_len` attribute.
resize_to_max_side_len = getattr(model.cfg, "resize_to_max_side_len", False)
image_processor = get_image_processor(model.cfg.max_img_size, model.cfg.vit_img_size, resize_to_max_side_len)

In [None]:
# NOTE(review): this cell reads "image" and "problem" from test_dataset[0];
# confirm test_dataset still exposes those fields when this cell runs.
sample = test_dataset[0]
img = sample["image"]
prompt_text = user_text + ". " + sample["problem"]

processed_image, splitted_image_ratio = image_processor(img)
if not hasattr(tokenizer, "global_image_token"):
    if splitted_image_ratio[0] * splitted_image_ratio[1] == len(processed_image) - 1:
        # The processor produced one extra leading entry that this tokenizer
        # has no global image token for, so drop it.
        processed_image = processed_image[1:]

image_string = get_image_string(tokenizer, [splitted_image_ratio], model.cfg.mp_image_token_length)

# Single-turn chat: image placeholder string followed by the text prompt.
messages = [{"role": "user", "content": image_string + prompt_text}]
encoded_prompt = tokenizer.apply_chat_template([messages], tokenize=True, add_generation_prompt=True)
tokens = torch.tensor(encoded_prompt).to(device)
img_t = processed_image.to(device)

print("\nInput:\n ", prompt_text, "\n\nOutput:")
# Five generations from the same prompt.
for attempt in range(1, 6):
    gen = model.generate(tokens, img_t, max_new_tokens=1024)
    out = tokenizer.batch_decode(gen, skip_special_tokens=True)[0]
    print(f"  >> Generation {attempt}: {out}")


In [None]:
# Shell escape: recursively archive the checkpoints/ directory (relative to the
# current working directory) into checkpoints.zip.
!zip -r checkpoints.zip checkpoints/