## Simulating Inference Code

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from PIL import Image
from IPython.display import Markdown, display

import torch
from transformers import AutoModelForCausalLM, AutoProcessor, AutoModel, AutoImageProcessor

model_path = "./VideoLLaMA3-2B/"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
video_path = "basketball.mp4"

conversation = [
    {        
        "role": "user",
        "content": [
            {
                "type": "video", 
                "video": {"video_path": video_path, "fps": 1, "max_frames": 180}
            },
            {
                "type": "text", 
                "text": "Describe the video's environment."
            },
        ]
    }
]


# Single-turn conversation
inputs = processor(conversation=conversation, return_tensors="pt")
inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

output_ids = model.generate(**inputs, max_new_tokens=256)
response = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
display(Markdown(response))

In [None]:
image_path = "cat.png"

conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image", 
                "image": {"image_path": image_path}
            },
            {
                "type": "text", 
                "text": "What is funny in the image?"},
        ]
    }
]


# Single-turn conversation
inputs = processor(conversation=conversation, return_tensors="pt")
inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

output_ids = model.generate(**inputs, max_new_tokens=256)
response = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
display(Markdown(response))



In [27]:
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image
import json
import io 
def model_fn(model_dir, context=None): 
    print(f"model_fn called with: model_dir={model_dir}, context={context}")

    model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        trust_remote_code=True,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )
    processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True)
    return {"model": model, "processor": processor}


def input_fn(request_body, request_content_type):
    if request_content_type != 'application/json':
        raise ValueError(f"Unsupported content type: {request_content_type}")
    
    # Parse the request
    request = json.loads(request_body)
    conversation = request.get("inputs", {}).get("conversation", [])

    # Process the conversation, extracting and encoding images
    for message in conversation:
        if message['role'] == 'user':
            for content in message['content']:
                if content['type'] == 'image':
                    image_data = content['image'].get('data')
                    if not image_data:
                        raise ValueError("Image data is missing in the request.")
                    
                    # Decode the base64 image data
                    image_bytes = base64.b64decode(image_data)
                    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                    
                    # Save the image to a file path or update to use as expected by the processor
                    image_path = "/tmp/image.png"
                    image.save(image_path)
                    
                    # Update the conversation to use the image_path instead of embedding the image directly
                    content['image']['image_path'] = image_path  # Add the image_path

    return conversation

@torch.inference_mode()
def predict_fn(input_data, model_artifacts):
    model = model_artifacts['model']
    processor = model_artifacts['processor']

    inputs = processor(
        conversation=input_data,
        add_system_prompt=True,
        add_generation_prompt=True,
        return_tensors="pt"
    )
    inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
    if "pixel_values" in inputs:
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
    
    output_ids = model.generate(**inputs, max_new_tokens=1024)
    response = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    return response

# Format output (simulate SageMaker response)
def output_fn(prediction, response_content_type="application/json"):
    return json.dumps({"response": prediction})


In [None]:
import base64
import json

image_path = "cat.png"

# Read and encode image
with open(image_path, "rb") as f:
    image_bytes = f.read()
    image_base64 = base64.b64encode(image_bytes).decode("utf-8")

# Construct JSON payload
data = json.dumps({
    "inputs": {
        "conversation": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image", 
                        "image": {"data": image_base64}  # Send image data instead of file path
                    },
                    {
                        "type": "text", 
                        "text": "What is funny in the image?"
                    },
                ]
            }
        ]
    }
})


In [29]:
# Simulate request body and content type
request_body = data
request_content_type = "application/json"

# Call input_fn
processed_input = input_fn(request_body, request_content_type)
print("Processed Input:", processed_input)

# Load model locally (if not already loaded)
model_artifacts = model_fn("VideoLLaMA3-2B")

Processed Input: [{'role': 'user', 'content': [{'type': 'image', 'image': {'data': 'iVBORw0KGgoAAAANSUhEUgAAAgMAAAFyCAYAAABoTdmuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOz9x5YkyZauiX1ClBl1EhGZ59S551Tde4vci7Uw6LWwGsDb4EkwwwRjvAgeAMC85+gedBeq6pDMYE6MKRGGwRYzN6dh7uFBMtN3rkg3U1NVUSqy5d///rdKKSVe7MVe7MVe7MVe7Ddr+lsfwIu92Iu92Iu92It9W3txBl7sxV7sxV7sxX7j9uIMvNiLvdiLvdiL/cbtxRl4sRd7sRd7sRf7jduLM/BiL/ZiL/ZiL/Ybtxdn4MVe7MVe7MVe7DduL87Ai73Yi73Yi73Yb9xenIEXe7EXe7EXe7HfuL04Ay/2Yi/2Yi/2Yr9xs4eu+H/5v/0/7lyulLr+/Y5ld613128PrXNtfX3bhzl027tM3zyHRxzPQe0qdfi5HdLmoe1+gfUOtcOu3fP6oko/3zk85hp/r5ZS4hB50ZQSB634iHYP2eGh4qfPvd5jbf86PtRGipGUVL6e8mwnICaVPwfZV/6ntUYphVJASvKPiFLIMhLyJG7/qnxZNYlEIn7ymLb20LO8v32K8c7lD9ld69237ObSR7VxY93HtHvT4qHb7l2PTx7fje+fbnf7nkRQ23b0Hdfpdj+5/47FmG4sv23/z//7//VTp3C4M/BiL3aoHTzIP+NYq553dy/2YtdM7X2S7vaOTlepvcXbATx/23bSKZFCJKSYHYH9Pe/9L+21qu7u4HdbKvXFHKEX+1qm7vn89ew37wy8DCCPs2dFOL4BGvFiL/ZY23+udmPu

In [30]:
# Call predict_fn
prediction = predict_fn(processed_input, model_artifacts)
print("Prediction:", prediction)

Prediction: The image is amusing because it features a kitten with a human-like smiley face drawn on its nose. This creates an unexpected and playful interaction between the animal and the drawing, giving the scene a whimsical quality that evokes laughter or amusement in viewers.


## video

In [32]:
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image
import json
import base64
import io
import os

def model_fn(model_dir, context=None): 
    print(f"model_fn called with: model_dir={model_dir}, context={context}")

    model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        trust_remote_code=True,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )
    processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True)
    return {"model": model, "processor": processor}


def input_fn(request_body, request_content_type):
    if request_content_type != 'application/json':
        raise ValueError(f"Unsupported content type: {request_content_type}")
    
    request = json.loads(request_body)
    conversation = request.get("inputs", {}).get("conversation", [])

    # Process the conversation, extracting and saving images and videos
    for message in conversation:
        if message['role'] == 'user':
            for content in message['content']:
                if content['type'] == 'image':
                    image_data = content['image'].get('data')
                    if not image_data:
                        raise ValueError("Image data is missing in the request.")
                    
                    # Decode the base64 image data
                    image_bytes = base64.b64decode(image_data)
                    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                    
                    # Save the image to a file path
                    image_path = "/tmp/image.png"  # Update path if needed based on your environment
                    image.save(image_path)
                    
                    # Update the conversation to use the image_path instead of embedding the image directly
                    content['image']['image_path'] = image_path  # Add the image_path

                elif content['type'] == 'video':
                    video_data = content['video'].get('data')
                    fps = content['video'].get('fps', 1)  # Default fps to 1 if not provided
                    max_frames = content['video'].get('max_frames', 180)  # Default max_frames to 180 if not provided
                    
                    if not video_data:
                        raise ValueError("Video data is missing in the request.")
                    
                    # Decode the base64 video data
                    video_bytes = base64.b64decode(video_data)
                    
                    # Create a temporary path for the video file
                    video_path = "/tmp/video.mp4"  # Update path if needed based on your environment
                    
                    # Save the video data to the video path
                    with open(video_path, "wb") as video_file:
                        video_file.write(video_bytes)
                    
                    # Update the conversation to use the video_path and include fps and max_frames
                    content['video']['video_path'] = video_path  # Add the video_path
                    content['video']['fps'] = fps  # Add the fps
                    content['video']['max_frames'] = max_frames  # Add the max_frames

    return conversation


@torch.inference_mode()
def predict_fn(input_data, model_artifacts):
    model = model_artifacts['model']
    processor = model_artifacts['processor']

    inputs = processor(
        conversation=input_data,
        add_system_prompt=True,
        add_generation_prompt=True,
        return_tensors="pt"
    )
    
    inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
    if "pixel_values" in inputs:
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
    
    output_ids = model.generate(**inputs, max_new_tokens=256)
    response = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    return response


# Format output (simulate SageMaker response)
def output_fn(prediction, response_content_type="application/json"):
    return json.dumps({"response": prediction})


In [35]:
import base64
import json

image_path = "basketball.mp4"

# Read and encode image
with open(image_path, "rb") as f:
    image_bytes = f.read()
    image_base64 = base64.b64encode(image_bytes).decode("utf-8")

# Construct JSON payload
data = json.dumps({
    "inputs": {
        "conversation": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video", 
                        "video": {"data": image_base64}  # 🔥 Send image data instead of file path
                    },
                    {
                        "type": "text", 
                        "text": "What is funny in the image?"
                    },
                ]
            }
        ]
    }
})


In [38]:
# Simulate request body and content type
request_body = data
request_content_type = "application/json"

# Call input_fn
processed_input = input_fn(request_body, request_content_type)
#print("Processed Input:", processed_input)

# Load model locally (if not already loaded)
model_artifacts = model_fn("VideoLLaMA3-2B")

model_fn called with: model_dir=VideoLLaMA3-2B, context=None


Some parameters are on the meta device because they were offloaded to the cpu.
