In [1]:
import os
import PIL.Image
import torch
import numpy as np
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from torch.quantization import quantize_dynamic  # Import quantize_dynamic


Python version is above 3.10, patching the collections module.




In [2]:
# 1. Load the model (as you already do), but DO NOT move it to bfloat16 or CUDA yet
model_path = "deepseek-ai/Janus-Pro-7B"

In [3]:

# vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
# tokenizer = vl_chat_processor.tokenizer

# vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
#     model_path, trust_remote_code=True
# )

# # 2. Dynamic Quantization (Easiest, but less effective)
# # Create a *new* instance of the model for quantization
# vl_gpt_quantized_dynamic = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) # New instance

# vl_gpt_quantized_dynamic = quantize_dynamic(
#     vl_gpt_quantized_dynamic, {torch.nn.Linear}, dtype=torch.qint8
# )

# # Move the quantized model to CUDA and eval mode *after* quantization
# vl_gpt_quantized_dynamic = vl_gpt_quantized_dynamic.to('cuda').eval()

# # 3. (Optional, but highly recommended) Post-Training Static Quantization (More effective)
# # ... (Your existing static quantization code, but create a new model instance for it too)
# vl_gpt_quantized_static = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) # New instance


# # ... (Rest of your code for static quantization, making sure to use vl_gpt_quantized_static)

# # 4. Use the quantized model for inference
# vl_gpt_inference = vl_gpt_quantized_dynamic  # Or vl_gpt_quantized_static, as appropriate

# # ... (Your generation loop using vl_gpt_inference)

# # ... (Saving and loading - remember to create a new model instance before loading the state_dict)

In [4]:
# import torch
# from torch.quantization import quantize_dynamic

# # ... (Load model and processor as before)

# # 2. Dynamic Quantization (or Static, choose one)
# vl_gpt_quantized_dynamic = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# vl_gpt_quantized_dynamic = quantize_dynamic(vl_gpt_quantized_dynamic, {torch.nn.Linear}, dtype=torch.qint8)
# vl_gpt_quantized_dynamic.to('cuda').eval() # Move quantized model to CUDA

# # OR (if you have calibration data)

# # 3. Static Quantization (Recommended if you have calibration data)
# vl_gpt_quantized_static = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# # ... (Your static quantization code, using vl_gpt_quantized_static)
# vl_gpt_quantized_static.to('cuda').eval() # Move quantized model to CUDA

# # 4. Save the quantized model (choose dynamic or static)
# # Save the *entire* model (architecture and weights) – easier to load
# # Choose the appropriate quantized model to save
# # torch.save(vl_gpt_quantized_dynamic, "quantized_model_dynamic.pth")  # For dynamic quantization
# torch.save(vl_gpt_quantized_static, "quants/quantized_model_static.pth")  # For static quantization

In [5]:
# --- Later, when loading on a machine with less RAM/VRAM ---

# Load the processor (same as before)
vl_chat_processor = VLChatProcessor.from_pretrained(model_path)

# Load the quantized model (choose dynamic or static, matching what you saved)
# IMPORTANT: Load on CPU first, then move to GPU if available.

# Dynamic
# vl_gpt_loaded_dynamic = torch.load("quantized_model_dynamic.pth", map_location=torch.device('cpu'))

# Static
if torch.cuda.is_available():
    vl_gpt_loaded_static = torch.load("quants/quantized_model_static.pth", map_location=torch.device('cpu'))
else:
    print("No CUDA GPU found, goodbye.")

# ... (Your generation loop using vl_gpt_inference)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
Some kwargs in processor config are unused and will not have any effect: sft_format, num_image_tokens, image_tag, ma

In [6]:
conversation = [
    {
        "role": "<|User|>",
        "content": "Dinosaurs portrait in blue clothes, and smiling with black eyes",
    },
    {"role": "<|Assistant|>", "content": ""},
]

sft_format = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(
    conversations=conversation,
    sft_format=vl_chat_processor.sft_format,
    system_prompt="",
)
prompt = sft_format + vl_chat_processor.image_start_tag

In [8]:
import time

start= time.time()
@torch.inference_mode()
def generate(
    mmgpt: MultiModalityCausalLM,
    vl_chat_processor: VLChatProcessor,
    prompt: str,
    temperature: float = 1,
    parallel_size: int = 16,
    cfg_weight: float = 5,
    image_token_num_per_image: int = 576,
    img_size: int = 384,
    patch_size: int = 16,
):
    input_ids = vl_chat_processor.tokenizer.encode(prompt)
    input_ids = torch.LongTensor(input_ids)

    tokens = torch.zeros((parallel_size*2, len(input_ids)), dtype=torch.int).cpu()
    for i in range(parallel_size*2):
        tokens[i, :] = input_ids
        if i % 2 != 0:
            tokens[i, 1:-1] = vl_chat_processor.pad_id

    inputs_embeds = mmgpt.language_model.get_input_embeddings()(tokens)

    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).cpu()

    for i in range(image_token_num_per_image):
        outputs = mmgpt.language_model.model(inputs_embeds=inputs_embeds, use_cache=True, past_key_values=outputs.past_key_values if i != 0 else None)
        hidden_states = outputs.last_hidden_state
        
        logits = mmgpt.gen_head(hidden_states[:, -1, :])
        logit_cond = logits[0::2, :]
        logit_uncond = logits[1::2, :]
        
        logits = logit_uncond + cfg_weight * (logit_cond-logit_uncond)
        probs = torch.softmax(logits / temperature, dim=-1)

        next_token = torch.multinomial(probs, num_samples=1)
        generated_tokens[:, i] = next_token.squeeze(dim=-1)

        next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
        img_embeds = mmgpt.prepare_gen_img_embeds(next_token)
        inputs_embeds = img_embeds.unsqueeze(dim=1)


    dec = mmgpt.gen_vision_model.decode_code(generated_tokens.to(dtype=torch.int), shape=[parallel_size, 8, img_size//patch_size, img_size//patch_size])
    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)

    dec = np.clip((dec + 1) / 2 * 255, 0, 255)

    visual_img = np.zeros((parallel_size, img_size, img_size, 3), dtype=np.uint8)
    visual_img[:, :, :] = dec

    os.makedirs('dinozauri_1', exist_ok=True)
    for i in range(parallel_size):
        save_path = os.path.join('dinozauri_1', "img_{}.jpg".format(i))
        PIL.Image.fromarray(visual_img[i]).save(save_path)


generate(
    vl_gpt_loaded_static,
    vl_chat_processor,
    prompt,
)
time.time() - start

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)