<a href="https://colab.research.google.com/github/andli28/SSM_experiment/blob/main/vLLM_Sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install fastapi nest-asyncio pyngrok uvicorn
!pip install vllm



# Load and Run Model in the background

Any model listed at https://docs.vllm.ai/en/latest/models/supported_models.html#hugging-face-hub should work.


In [None]:
# Load and run the model:
import subprocess
import time
import os

# Command line for the vllm server; the `serve` subcommand has to come
# directly after the `vllm` executable name.
vllm_command = [
    'vllm', 'serve', 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
    '--trust-remote-code',
    '--dtype', 'half',
    # Maximum number of tokens (input plus output) per request.
    '--max-model-len', '16384',
    '--enable-chunked-prefill',
    '--tensor-parallel-size', '1',
]

# Start the server in the background, in its own session, capturing its
# output through pipes so a later cell can print it.
# NOTE(review): nothing reads these pipes while the server is running; a
# chatty server may block once the pipe buffer fills - confirm.
vllm_process = subprocess.Popen(
    vllm_command,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    start_new_session=True,
)

In [None]:
import requests

def check_vllm_status():
    """Report whether the local vllm server answers its health check.

    Sends a GET request to ``http://localhost:8000/health`` and prints the
    outcome.

    Returns:
        bool: True if the endpoint replied with HTTP 200; False if the
        request failed (refused connection, timeout, ...) or any other
        status code came back.
    """
    try:
        # Bounded wait so an unresponsive server cannot stall the caller.
        response = requests.get("http://localhost:8000/health", timeout=5)
    except requests.exceptions.RequestException:
        # Base class of the requests errors: covers refused connections as
        # well as timeouts.
        print("vllm server is not running")
        return False

    if response.status_code == 200:
        print("vllm server is running")
        return True

    # The server answered, but not with a healthy status.
    print(f"vllm server is not healthy (HTTP {response.status_code})")
    return False

try:
    # Poll until the server passes its health check or the process exits.
    # Loading a model can take a while, so a failed health check alone does
    # not mean the server has stopped.
    while True:
        if check_vllm_status():
            print("The vllm server is ready to serve.")
            break

        if vllm_process.poll() is not None:
            # The process has exited: collect and show whatever it wrote so
            # the failure can be diagnosed.
            print("The vllm server has stopped.")
            stdout, stderr = vllm_process.communicate(timeout=10)
            print(f"STDOUT: {stdout.decode('utf-8')}")
            print(f"STDERR: {stderr.decode('utf-8')}")
            break

        time.sleep(5)  # Check every 5 seconds
except KeyboardInterrupt:
    # Interrupting the cell only stops the polling, not the server.
    print("Stopping the check of vllm...")

vllm server is running
The vllm server is ready to serve.


# Create functions for interacting with the model

In [None]:
import requests
import json
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from fastapi.responses import StreamingResponse
import requests

# Request schema for input: the JSON body accepted by the generation endpoints.
class QuestionRequest(BaseModel):
    # Prompt text that is forwarded to the model as the user message.
    question: str
    # Name of the model to query; the value below is used when it is omitted.
    model: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # Default model


def ask_model(question: str, model: str):
    """
    Ask the local model server one question and return its full reply.

    Args:
        question: Text sent as the single user message.
        model: Name of the model the server should use.

    Returns:
        The decoded JSON body of the chat-completions response.

    Raises:
        An exception from ``raise_for_status`` when the server replies with
        an HTTP error status.
    """
    # Chat-completions endpoint of the local server; adjust if it differs.
    endpoint = "http://localhost:8000/v1/chat/completions"
    user_message = {"role": "user", "content": question}
    payload = {"model": model, "messages": [user_message]}

    reply = requests.post(
        endpoint,
        headers={"Content-Type": "application/json"},
        json=payload,
    )
    # Turn HTTP error statuses into exceptions before decoding the body.
    reply.raise_for_status()
    return reply.json()

def stream_llm_response(question: str, model: str):
    """
    Stream a chat completion from the local model server line by line.

    Args:
        question: Text sent as the single user message.
        model: Name of the model the server should use.

    Yields:
        str: Each non-empty line of the streamed response, with a leading
        "data: " marker removed and a newline appended.
    """
    url = "http://localhost:8000/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": model,
        "messages": [{"role": "user", "content": question}],
        "stream": True  # Enable streaming
    }
    # OpenAI-style streaming responses are prefixed with "data: "
    sse_prefix = "data: "

    with requests.post(url, headers=headers, json=data, stream=True) as response:
        for line in response.iter_lines():
            if not line:
                continue
            decoded_line = line.decode("utf-8")
            # Strip only the leading marker. A blanket replace() would also
            # delete "data: " wherever it occurs inside the generated text.
            if decoded_line.startswith(sse_prefix):
                decoded_line = decoded_line[len(sse_prefix):]
            yield decoded_line + "\n"

{
  "id": "chatcmpl-38d1c99084e4493284636e71cb372cc1",
  "object": "chat.completion",
  "created": 1762304565,
  "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Okay, so I need to figure out the capital of France. I remember that France is a country in Europe, but I'm not exactly sure where its capital is. I think it's somewhere in the north, maybe in France itself? Or is it a city in another country? I've heard terms like \"Paris\" before, but I think that's the capital of another country. Let me think... Paris is the capital of France, right? So maybe the user is confused because they heard of Paris as the capital. But the question is about the capital of France, so that's a bit conflicting.\n\nWait, no, the capital of France is Paris. So if someone is asking about the capital of France, the answer is Paris. But if they're referring to Paris as the capital, then they might

In [None]:
# Test Usage: send one question through ask_model and pretty-print the JSON reply.
result = ask_model("What is the capital of France?", "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
print(json.dumps(result, indent=2))

# Create FastAPI Application Endpoints

In [None]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import nest_asyncio
from pyngrok import ngrok
import uvicorn
import getpass

from pyngrok import ngrok, conf

# Application object that the route decorators in this notebook attach to.
app = FastAPI()

# Let browser clients from any origin call the API.
# Credentials stay disabled: FastAPI's CORS documentation states that
# allow_origins/allow_methods/allow_headers must not be ['*'] when
# allow_credentials is True, and none of the routes in this notebook rely on
# cookies or other credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=False,
    allow_methods=['*'],
    allow_headers=['*'],
)

@app.get('/')
async def root():
    """Return a fixed greeting payload for GET requests to the root path."""
    greeting = {'hello': 'world'}
    return greeting
@app.post("/api/v1/generate-response")
def generate_response(request: QuestionRequest):
    """
    Answer a question with a single, complete model reply.

    Passes the question and model name from the request body to ask_model
    and returns the result under the "response" key. Any exception raised
    along the way is reported to the client as HTTP 500, with the exception
    text as the detail.
    """
    try:
        answer = ask_model(request.question, request.model)
    except Exception as err:
        raise HTTPException(status_code=500, detail=str(err))
    else:
        return {"response": answer}

@app.post("/api/v1/generate-response-stream")
def stream_response(request: QuestionRequest):
    """
    API endpoint that streams the model's reply as it is generated.

    The question and model name from the request body are passed to
    stream_llm_response and its output is relayed as a
    "text/event-stream" response. A failure while obtaining the first chunk
    (for example, the model server being unreachable) is reported as
    HTTP 500 with the exception text as the detail.
    """
    chunks = stream_llm_response(request.question, request.model)
    try:
        # stream_llm_response is a generator, so no request is made until
        # the first item is pulled. Pull it here, while an error can still
        # become a proper HTTP 500 instead of a broken 200 response.
        first_chunk = next(chunks, None)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    def replay():
        # Re-emit the chunk consumed above, then hand over to the generator.
        if first_chunk is not None:
            yield first_chunk
        yield from chunks

    return StreamingResponse(replay(), media_type="text/event-stream")

# Ngrok auth

In [None]:
# Ask for the ngrok authtoken at run time instead of storing it in the
# notebook: a token saved in plain text can be used by anyone who can read
# the file.
# NOTE: the token that used to be hard-coded in this cell should be treated
# as leaked and revoked in the ngrok dashboard.
ngrok.set_auth_token(getpass.getpass("ngrok authtoken: "))

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


# Run the ngrok reverse proxy and the uvicorn service

In [None]:
# Local port that the FastAPI app is served on.
port = 8081
# Expose that port through an ngrok tunnel and keep its public address.
tunnel = ngrok.connect(port)
public_url = tunnel.public_url
print(f" * ngrok tunnel \"{public_url}\" -> \"http://127.0.0.1:{port}\"")

 * ngrok tunnel "https://horary-vivacious-miah.ngrok-free.dev" -> "http://127.0.0.1:8081"


In [None]:
# Patch asyncio so that nested event loops are allowed.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop when uvicorn is started - confirm.
nest_asyncio.apply()

In [None]:
import threading

def run_uvicorn():
    """Run the FastAPI app under uvicorn on the configured ``port``."""
    uvicorn.run(app, port=port)

# Start the server on a separate thread.
# NOTE(review): presumably so this cell returns and the notebook stays usable
# while the server is running - confirm.
thread = threading.Thread(target=run_uvicorn)
thread.start()

# Example CURL requests

## Streaming



In [None]:
%%bash
curl --location 'https://horary-vivacious-miah.ngrok-free.dev/api/v1/generate-response-stream' \
--header 'Content-Type: application/json' \
--data '{
    "question": "where is paris?",
    "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
}'

INFO:     34.142.188.160:0 - "POST /api/v1/generate-response-stream HTTP/1.1" 200 OK
{"id":"chatcmpl-5d3ce83acdd34cff95ebdc34342db5bd","object":"chat.completion.chunk","created":1762305307,"model":"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}],"prompt_token_ids":null}
{"id":"chatcmpl-5d3ce83acdd34cff95ebdc34342db5bd","object":"chat.completion.chunk","created":1762305307,"model":"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B","choices":[{"index":0,"delta":{"content":"Okay"},"logprobs":null,"finish_reason":null,"token_ids":null}]}
{"id":"chatcmpl-5d3ce83acdd34cff95ebdc34342db5bd","object":"chat.completion.chunk","created":1762305307,"model":"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B","choices":[{"index":0,"delta":{"content":","},"logprobs":null,"finish_reason":null,"token_ids":null}]}
{"id":"chatcmpl-5d3ce83acdd34cff95ebdc34342db5bd","object":"chat.completion.chunk","created":1762305307,"model

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100    95    0     0  100    95      0    457 --:--:-- --:--:-- --:--:--   456100 19780    0 19685  100    95  16892     81  0:00:01  0:00:01 --:--:-- 16963100 65086    0 64991  100    95  30019     43  0:00:02  0:00:02 --:--:-- 30062100  108k    0  108k  100    95  35182     30  0:00:03  0:00:03 --:--:-- 35203100  154k    0  154k  100    95  37914     22  0:00:04  0:00:04 --:--:-- 37935100  199k    0  199k  100    95  39595     18  0:00:05  0:00:05 --:--:-- 41250100  205k    0  205k  100    95  39765     17  0:00:05  0:00:05 --:--:-- 46211


## Full (non-streaming) response

In [None]:
%%bash
curl --location 'https://horary-vivacious-miah.ngrok-free.dev/api/v1/generate-response' \
--header 'Content-Type: application/json' \
--data '{
    "question": "What is the best food around Columbia University?",
    "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
}'

INFO:     34.142.188.160:0 - "POST /api/v1/generate-response HTTP/1.1" 200 OK
{"response":{"id":"chatcmpl-5508caf91fb641aeaa1903e64bb2aa87","object":"chat.completion","created":1762305439,"model":"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B","choices":[{"index":0,"message":{"role":"assistant","content":"Okay, so I need to figure out the best food around Columbia University. I'm not super familiar with the area, but I'll try to break it down step by step.\n\nFirst, I know that Columbia is a big city, and it's in the Upper West Side. I've heard the University Center is a major food hub. So, maybe some of the top spots there are the main places to go for food.\n\nI remember hearing about Joe's in the University Center. It's a pretty famous spot with a lot of people, especially students. It's known for its quick service and good food. I think they have a variety of subs, like pizza, pasta, and sandwiches. That sounds like a good option.\n\nThen there's Joe's French. I think that's a place yo

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   129    0     0  100   129      0    106  0:00:01  0:00:01 --:--:--   106100   129    0     0  100   129      0     58  0:00:02  0:00:02 --:--:--    58100   129    0     0  100   129      0     40  0:00:03  0:00:03 --:--:--    40100   129    0     0  100   129      0     30  0:00:04  0:00:04 --:--:--    30100   129    0     0  100   129      0     24  0:00:05  0:00:05 --:--:--    25100   129    0     0  100   129      0     20  0:00:06  0:00:06 --:--:--     0100  5728  100  5599  100   129    807     18  0:00:07  0:00:06  0:00:01  1185


# Kill vLLM

In [None]:
import signal

# Only signal the server if it is still running; poll() also reaps a process
# that has already exited.
if vllm_process.poll() is None:
    # The server was started with start_new_session=True, so it leads its
    # own process group. Signal the whole group so that any processes it
    # spawned are asked to stop as well.
    vllm_pgid = os.getpgid(vllm_process.pid)
    os.killpg(vllm_pgid, signal.SIGTERM)
    try:
        # Give the server a bounded amount of time to shut down cleanly.
        vllm_process.wait(timeout=30)
    except subprocess.TimeoutExpired:
        # Still alive after the grace period: force the group to stop.
        os.killpg(vllm_pgid, signal.SIGKILL)
        vllm_process.wait()

print(f"vllm server exited with code {vllm_process.returncode}")

0