In [None]:
!pip install huggingface_hub transformers==4.41.0 --upgrade --quiet

#### Deploying a Hugging Face model with vLLM Docker

This follows the official vLLM sample for deploying with Docker. With this command the server downloads the model `Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4` directly from Hugging Face, so no separate download step is needed.

https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html

In [None]:
# Run this in a terminal, not in the notebook kernel: the container stays in the
# foreground and serves on host port 8000. The host Hugging Face cache directory
# is mounted into the container so files downloaded there persist on the host.
# Everything after the image name is passed to the vLLM server itself.
docker run \
    --gpus all \
    --runtime nvidia \
    --ipc=host \
    --publish 8000:8000 \
    --volume ~/.cache/huggingface:/root/.cache/huggingface \
    vllm/vllm-openai:v0.6.3 \
    --model Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4 \
    --max-model-len 8192

#### Deploying a local model with vLLM Docker

##### Download the model from Hugging Face 

In [1]:
from huggingface_hub import snapshot_download
from pathlib import Path

model_name = "Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4"

# Pin the exact commit instead of the moving "main" branch. The snapshot directory
# is named after the commit hash, and both the `docker run --model ...` command and
# the client's `model=` argument later in this notebook hard-code this hash; a new
# upstream commit on "main" would otherwise produce a different directory name.
model_revision = "dec510a35a3e9b6481b6427c7a08984df2402535"

# Download into ./vllm-model, relative to the directory the notebook is running in.
local_model_path = Path("./vllm-model")
local_model_path.mkdir(exist_ok=True)

# Restrict the download to configuration/tokenizer files (*.json, *.txt, *.model)
# and weight files (*.pt, *.bin, *.safetensors); everything else is skipped.
allow_patterns = ["*.json", "*.pt", "*.bin", "*.txt", "*.model", "*.safetensors"]

# Use snapshot_download because the model files are stored in the repository
# with Git LFS; it returns the path of the downloaded snapshot directory.
model_download_path = snapshot_download(
    repo_id=model_name,
    cache_dir=local_model_path,
    allow_patterns=allow_patterns,
    revision=model_revision,  # commit hash; a branch or tag name also works
)

Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 10.5M/3.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 10.5M/2.94G [00:00<?, ?B/s]

##### Open a terminal and run the `docker run` command

In [None]:
# Run this in a terminal.
# MODEL_CACHE_DIR is the host directory created by snapshot_download
# (<download dir>/models--Qwen--Qwen2-VL-7B-Instruct-GPTQ-Int4). Export it before
# running if your notebook lives somewhere else; it defaults to the location used
# when this notebook was written.
MODEL_CACHE_DIR="${MODEL_CACHE_DIR:-/home/ec2-user/SageMaker/docker-server/vllm-model/models--Qwen--Qwen2-VL-7B-Instruct-GPTQ-Int4}"

# The container-side path is deliberately left unchanged: the value given to
# --model is also the model name that clients must send in their requests.
docker run --runtime nvidia --gpus all \
    -v "${MODEL_CACHE_DIR}/:/opt/ml/Qwen2-VL-7B-Instruct-GPTQ-Int4" \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:v0.6.3 \
    --model /opt/ml/Qwen2-VL-7B-Instruct-GPTQ-Int4/snapshots/dec510a35a3e9b6481b6427c7a08984df2402535 \
    --max-model-len 8192

Once the model is loaded, you will see log lines like the following:

INFO 11-18 04:21:11 metrics.py:345] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.

INFO 11-18 04:22:13 metrics.py:345] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 44.5 tokens/s, Running: 1 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 3.9%, CPU KV cache usage: 0.0%.

### Testing the inference

In [2]:
!pip install OpenAI

[0m

In [4]:
from openai import OpenAI

# Point the OpenAI client at the vLLM server published on localhost:8000.
# The server was started without an API key, so a placeholder value is passed.
openai_api_base = "http://localhost:8000/v1"
openai_api_key = "EMPTY"
client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

In [None]:
import base64
import json
import mimetypes

import requests


def encode_image(image_path):
    """Read the file at ``image_path`` and return its contents as a base64 string.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to ``str``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Replace with your image path
test_image = "income_statement.jpg"
base64_image = encode_image(test_image)

# Label the data URL with the MIME type implied by the file extension
# (image/jpeg for a .jpg file) instead of always claiming image/png. Falls back to
# image/png, the previously hard-coded value, when the extension is not recognised.
image_mime_type = mimetypes.guess_type(test_image)[0] or "image/png"

# NOTE(review): `url` and `headers` are not used by the OpenAI client call below,
# which talks to `openai_api_base`; kept in case other cells rely on them.
url = "http://localhost:8000/invocations"
headers = {"Content-Type": "application/json"}

# Chat request with one user turn: a text instruction plus the image,
# embedded inline as a base64 data URL.
request = {
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Please generate accurate HTML code that represents the table structure shown in input image, including any merged cells."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{image_mime_type};base64,{base64_image}"
                    }
                }
            ]
        }
    ],
    "max_tokens": 1024
}

# `model` must match the value passed to `--model` when the server was started.
completion = client.chat.completions.create(
    model="/opt/ml/Qwen2-VL-7B-Instruct-GPTQ-Int4/snapshots/dec510a35a3e9b6481b6427c7a08984df2402535",
    messages=request["messages"],
    max_tokens=request["max_tokens"],
)


print("Completion result:", completion)

Completion result: ChatCompletion(id='chat-305cffe00cc940c29dcde87a9eda181d', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="The image is a Management Discussion and Analysis (MD&A) document comparing the financial performance of a company for the years ended December 31, 2022 and December 31, 2021. The document presents the comparative figures for various financial metrics in millions of RMB. Here is a detailed breakdown of the contents:\n\n## Year Ended December 31, 2022 Compared to Year Ended December 31, 2021\n\n### Financial Metrics\n- **Revenue**: 280,044.0 (2022) vs. 328,309.1 (2021)\n- **Cost of Sales**: (232,466.8) (2022) vs. (270,048.2) (2021)\n- **Gross Profit**: 47,577.2 (2022) vs. 58,260.9 (2021)\n- **Research and Development Expenses**: (16,028.1) (2022) vs. (13,167.1) (2021)\n- **Selling and Marketing Expenses**: (21,323.3) (2022) vs. (20,980.8) (2021)\n- **Administrative Expenses**: (5,113.9) (2022) vs. (4,738.9) (20