In [18]:
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria
import torch
from pathlib import Path
import json
import os
from PIL import Image
import requests
import matplotlib.pyplot as plt

In [3]:
def load_config():
    config_path = "../config.json"
    with open(config_path, 'r') as f:
        return json.load(f)

config = load_config()
model_path = Path(config['paths']['models']['pretrained']) / "xgen-mm"
model_name = "Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5"
print(model_path)

/home/akhassan/manipulation_uncertainty/models/pretrained/xgen-mm


In [10]:
if os.path.isdir(model_path):
    model = AutoModelForVision2Seq.from_pretrained(model_path, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False, legacy=False)
    image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True)
    print(f"Model and processors loaded from {model_path}")
else:
    model_path.parent.mkdir(parents=True, exist_ok=True)
    model = AutoModelForVision2Seq.from_pretrained(model_name, trust_remote_code=True)
    model.save_pretrained(str(model_path))
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False, legacy=False)
    image_processor = AutoImageProcessor.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = model.update_special_tokens(tokenizer)
    tokenizer.save_pretrained(str(model_path))
    image_processor.save_pretrained(str(model_path))
    print(f"Model and processors saved to {model_path}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and processors loaded from /home/akhassan/manipulation_uncertainty/models/pretrained/xgen-mm


In [12]:
def load_demo_image():
    """Load a sample image for testing"""
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    return image

def visualize_image(image):
    """Display the input image"""
    plt.figure(figsize=(10, 8))
    plt.imshow(image)
    plt.axis('off')
    plt.show()

def run_inference(model, tokenizer, image_processor, image, prompt="What do you see in this image?"):
    """Run inference with the model on the given image"""
    # Process the inputs
    inputs = image_processor(images=image, return_tensors="pt")
    text_inputs = tokenizer(prompt, return_tensors="pt")
    
    inputs.update(text_inputs)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate response
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=False,
        num_beams=5,
    )
    
    # Decode and return the response
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [31]:
import json
import PIL
import textwrap
import IPython.display as display
from IPython.display import Image
import os
from PIL import Image

def apply_prompt_template(prompt):
    s = (
                '<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
                "The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
                f'<|user|>\n{prompt}<|end|>\n<|assistant|>\n'
            )
    return s 

model = model.cuda().to(torch.bfloat16)

In [32]:
q = {"question": "What do you see?"}

image_list = []
image_sizes = []
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
img = Image.open(requests.get(url, stream=True).raw)

# Move image to correct dtype and device
image_tensor = image_processor([img], image_aspect_ratio='anyres')["pixel_values"]
image_tensor = image_tensor.cuda().bfloat16()  # Ensure BF16 on GPU
image_list.append(image_tensor)
image_sizes.append(img.size)

inputs = {
    "pixel_values": [image_list]
}

for query in q['question']:
    prompt = apply_prompt_template(query)
    language_inputs = tokenizer([prompt], return_tensors="pt")
    
    # Update inputs and move to CUDA in BF16
    inputs.update({k: v.cuda().bfloat16() if isinstance(v, torch.Tensor) else v for k, v in language_inputs.items()})
    
    # Run model generation
    generated_text = model.generate(
        **inputs, 
        image_size=[image_sizes],
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        temperature=0.05,
        do_sample=False, 
        max_new_tokens=1024, 
        top_p=None, 
        num_beams=1,
    )

    prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True).split("<|end|>")[0]
    print("User: ", query)
    print("Assistant: ", prediction)

print("-" * 120)


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got CUDABFloat16Type instead (while checking arguments for embedding)