In [None]:
!pip install fastmcp

Collecting fastmcp
  Downloading fastmcp-2.10.5-py3-none-any.whl.metadata (17 kB)
Collecting authlib>=1.5.2 (from fastmcp)
  Downloading authlib-1.6.0-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting cyclopts>=3.0.0 (from fastmcp)
  Downloading cyclopts-3.22.2-py3-none-any.whl.metadata (11 kB)
Collecting exceptiongroup>=1.2.2 (from fastmcp)
  Downloading exceptiongroup-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Collecting mcp>=1.10.0 (from fastmcp)
  Downloading mcp-1.11.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.7/44.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openapi-pydantic>=0.5.1 (from fastmcp)
  Downloading openapi_pydantic-0.5.1-py3-none-any.whl.metadata (10 kB)
Collecting python-dotenv>=1.1.0 (from fastmcp)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting rich-rst<2.0.0,>=1.3.1 (from cyclopts>=3.0.0->fastmcp)
  Downloading rich_rst-1.3.1-py3-none-any.whl.metadata (6.0 k

In [None]:
!pip install qwen_vl_utils

Collecting qwen_vl_utils
  Downloading qwen_vl_utils-0.0.11-py3-none-any.whl.metadata (6.3 kB)
Collecting av (from qwen_vl_utils)
  Downloading av-15.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Downloading qwen_vl_utils-0.0.11-py3-none-any.whl (7.6 kB)
Downloading av-15.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (39.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.7/39.7 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av, qwen_vl_utils
Successfully installed av-15.0.0 qwen_vl_utils-0.0.11


In [None]:
pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.12-py3-none-any.whl.metadata (9.4 kB)
Downloading pyngrok-7.2.12-py3-none-any.whl (26 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.12


In [None]:

# 🛠 2. Set up ngrok (the token is read from the environment, never hardcoded)
import os
from getpass import getpass

from pyngrok import ngrok, conf

# Export NGROK_TOKEN before starting the kernel, or type it at the prompt.
# A token that was ever saved inside a notebook should be treated as leaked
# and revoked in the ngrok dashboard.
NGROK_TOKEN = os.environ.get("NGROK_TOKEN") or getpass("ngrok authtoken: ")
conf.get_default().auth_token = NGROK_TOKEN

# 📦 3. Minimal MedVLM singleton + FastMCP server
import io, uuid, base64, threading
from typing import Annotated
from fastmcp import FastMCP, Context
from PIL import Image
import torch
from transformers import (
    AutoProcessor, Qwen2VLForConditionalGeneration, GenerationConfig,
)
from qwen_vl_utils import process_vision_info
# Checkpoint identifier handed to `from_pretrained` for both model and processor.
MODEL_ID = "JZPeterPan/MedVLM-R1"

# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    _DEVICE = torch.device("cuda")
else:
    _DEVICE = torch.device("cpu")

class _MedVLM:
    """Holder for a single shared MedVLM model and its processor.

    All state lives on the class, so the weights are loaded at most once and
    reused by every call to :meth:`infer`.
    """

    # Populated together by `_load`; both stay None until a load fully succeeds.
    _model, _proc = None, None
    # Greedy decoding (do_sample=False), capped at 256 newly generated tokens.
    # NOTE(review): 151_643 is presumably the tokenizer's end-of-text/pad id — confirm against the checkpoint.
    _gen_cfg = GenerationConfig(max_new_tokens=256, do_sample=False,
                                temperature=1.0, pad_token_id=151_643)

    @classmethod
    def _load(cls):
        """Load model and processor once; later calls return immediately."""
        if cls._model is not None and cls._proc is not None:
            return
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto",
            # attn_implementation="flash_attention_2",
        ).eval()
        proc = AutoProcessor.from_pretrained(MODEL_ID)
        # Publish both attributes only after both loads succeeded, so a failed
        # processor load cannot leave a model behind with `_proc` still None.
        cls._model, cls._proc = model, proc

    @classmethod
    def infer(cls, img_bytes: bytes, question: str) -> str:
        """Answer a question about an image with the shared model.

        Args:
            img_bytes: Encoded image file contents; opened with PIL and
                converted to RGB.
            question: Prompt text. When empty, a default request to describe
                the medically relevant findings is used instead.

        Returns:
            The decoded text generated after the prompt, with special tokens
            removed.
        """
        cls._load()
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        payload = [{"role":"user","content":[
            {"type":"image","image":img},
            {"type":"text","text":f"{question or 'Describe the medically relevant findings.'}\n\n"
                                  "1. Think step-by-step inside <think>…</think>.\n"
                                  "2. Put the answer inside <answer>…</answer>."}
        ]}]
        text = cls._proc.apply_chat_template(payload, tokenize=False,
                                             add_generation_prompt=True)
        imgs, vids = process_vision_info(payload)
        # Send the inputs to wherever the model was actually placed by
        # device_map="auto" rather than to a separately computed device.
        inputs = cls._proc(text=text, images=imgs, videos=vids,
                           return_tensors="pt", padding=True).to(cls._model.device)
        generated = cls._model.generate(**inputs, generation_config=cls._gen_cfg,
                                        use_cache=True)
        # The output sequence starts with the prompt tokens; keep only what follows.
        prompt_len = len(inputs.input_ids[0])
        out = generated[0][prompt_len:]
        return cls._proc.decode(out, skip_special_tokens=True,
                                clean_up_tokenization_spaces=False)

# Load the model eagerly, before the server starts accepting requests.
print("Start loading model")
# Call the classmethod on the class itself: rebinding `_MedVLM` to an instance
# would shadow the class name for the rest of the notebook.
_MedVLM._load()
print("Finished loading model")
# ---------- Fast-MCP wiring ----------
# In-memory store of uploaded payloads, keyed by the generated upload id.
_blob: dict[str, bytes] = dict()
# Server object on which the resource and tools are registered.
mcp = FastMCP(name="MedVLM-Server")

@mcp.resource("upload://{id}")
def read_blob(id: str, ctx: Context) -> bytes:
    """Return the raw bytes stored under ``upload://{id}``.

    Args:
        id: Key of the upload in the in-memory ``_blob`` store.
        ctx: Request context supplied by FastMCP (not used here).

    Raises:
        KeyError: If nothing is stored under ``id``.
    """
    try:
        return _blob[id]
    except KeyError:
        # Same exception type as a plain dict miss, but with a readable message.
        raise KeyError(f"unknown upload id: {id}") from None

@mcp.tool
def upload_image(b64: Annotated[str, "base-64 file bytes"]) -> str:
    """Store a base64-encoded file in memory and return its ``upload://`` URI.

    Args:
        b64: File contents encoded as standard base64; whitespace such as
            line wrapping is ignored.

    Returns:
        A URI of the form ``upload://<uuid>`` that identifies the stored bytes.

    Raises:
        ValueError: If the payload is empty or is not valid base64
            (``binascii.Error`` is a ``ValueError`` subclass).
    """
    # Strip whitespace, then decode strictly so stray characters are rejected
    # instead of being silently skipped and yielding corrupt bytes.
    data = base64.b64decode("".join(b64.split()), validate=True)
    if not data:
        raise ValueError("empty upload")
    bid = str(uuid.uuid4())
    # NOTE: uploads are kept in process memory and are never evicted.
    _blob[bid] = data
    return f"upload://{bid}"



@mcp.tool
def medvlm_infer(image_uri: Annotated[str, "`upload://…`"],
                 question: Annotated[str|None, "optional prompt"]=None) -> str:
    """Run the MedVLM model on a previously uploaded image.

    Args:
        image_uri: The ``upload://…`` URI returned by ``upload_image``.
        question: Optional prompt; when omitted or empty, the model's default
            prompt is used.

    Returns:
        The text generated by the model.

    Raises:
        KeyError: If ``image_uri`` does not refer to a stored upload.
    """
    bid = image_uri.removeprefix("upload://")
    if bid not in _blob:
        # Same exception type as a plain dict miss, but with a readable message.
        raise KeyError(f"unknown image URI: {image_uri}")
    return _MedVLM.infer(_blob[bid], question or "")

# 🔌 4. Run server in background
def _serve():
    """Start the FastMCP server over HTTP on all interfaces, port 8000."""
    mcp.run(transport="http", host="0.0.0.0", port=8000)


# Daemon thread: it does not keep the process alive on its own.
_server_thread = threading.Thread(target=_serve, daemon=True)
_server_thread.start()

# 🌐 5. Expose via ngrok
_tunnel = ngrok.connect(8000, "http")
public_url = _tunnel.public_url
print("🚀 Public MCP endpoint:", public_url)

Start loading model


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/4.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


chat_template.json: 0.00B [00:00, ?B/s]

Finished loading model
Downloading ngrok ...

  from websockets.server import WebSocketServerProtocol
INFO:     Started server process [195]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


🚀 Public MCP endpoint: https://471284188f4a.ngrok-free.app


In [None]:
# !pip uninstall -y flash-attn
# !pip install flash-attn --no-build-isolation --no-clean