<a href="https://colab.research.google.com/github/aknip/Modal/blob/main/vllm_llama-2-13b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# VLLM_Llama-2-13b

- REST API via FastAPI

The app then can be automatically deployed to Modal.com.
- Export Notebook to .py-file automatically (via nbdev)
- Serve or deploy .py-file to Modal.com automatically

Code blocks which are needed for the final .py-file (for Modal.com) are marked with `#|export`

Sources:  
- https://github.com/jxnl/fastllm/blob/main/applications/vllm-struct/main.py


In [1]:
import psutil
# Flag for an interactive notebook session: true when any argument on the
# parent process's command line contains "jupyter-notebook".
IN_NOTEBOOK = any("jupyter-notebook" in arg for arg in psutil.Process().parent().cmdline())

In [2]:
%%capture --no-stderr
!pip install modal nbdev -q

In [3]:
import json
import os
from getpass import getpass
if IN_NOTEBOOK:
  # Interactive session: prompt for the secrets without echoing them, check
  # that they parse as JSON, and store them in the environment.
  os.environ['CREDS'] = json.dumps(json.loads(getpass("Secrets (JSON string): ")))

# Always read the credentials back from the environment, so that a run outside
# the notebook (where CREDS was exported beforehand) also defines `CREDS`
# instead of failing later with a NameError.
if 'CREDS' not in os.environ:
  raise RuntimeError(
      "CREDS is not set: run this cell in a notebook to enter the secrets, "
      "or export CREDS as a JSON string before running."
  )
CREDS = json.loads(os.environ['CREDS'])

Secrets (JSON string): ··········


In [4]:
import os
# Copy both Modal token values out of the CREDS structure into environment
# variables of the same name.
for token_var in ("MODAL_TOKEN_ID", "MODAL_TOKEN_SECRET"):
    os.environ[token_var] = CREDS['MODAL'][token_var]['credential']

# The app - runs in Modal

In [8]:
# To export this cell to a file, make the following line the FIRST line of the
# cell (cell magics must come first) and uncomment it:
# %%writefile vllm_llama-2-13b.py

from modal import Stub, Image, Secret, method, asgi_app
from typing import List
import os
import json

import fastapi
from pydantic import BaseModel

# FastAPI application object; the POST "/" route below is registered on it and
# `fastapi_app` returns it to Modal for serving.
app = fastapi.FastAPI(
    title="vLLM",
)


def download_model_to_folder():
    """Download the model snapshot from the Hugging Face Hub into MODEL_DIR.

    Takes no arguments so it can be passed directly to `Image.run_function`.
    The Hub access token is read from the HUGGINGFACE_TOKEN environment
    variable; a missing variable raises KeyError.
    """
    # Imported inside the function: presumably huggingface_hub is only
    # guaranteed to be installed inside the container image.
    from huggingface_hub import snapshot_download

    # Alternative checkpoint, kept for reference: "meta-llama/Llama-2-13b-chat-hf"
    snapshot_download(
        "mistralai/Mistral-7B-Instruct-v0.1",
        # Use the shared constant so the download location and the path the
        # Model class loads from cannot drift apart.
        local_dir=MODEL_DIR,
        token=os.environ["HUGGINGFACE_TOKEN"],
    )


# Path of the model snapshot inside the container; `Model.__enter__` passes it
# to vLLM when loading the model.
MODEL_DIR = "/model"

# Container image definition. The chained calls are applied in the order written.
image = (
    # Base image: NVIDIA's PyTorch container pulled from the nvcr.io registry.
    Image.from_registry("nvcr.io/nvidia/pytorch:22.12-py3")

    # Install torch 2.0.1 from the cu118 wheel index.
    .pip_install("torch==2.0.1", index_url="https://download.pytorch.org/whl/cu118")
    # Disabled alternative with an explicit "+cu118" local version tag:
    #.pip_install("torch==2.0.1+cu118", index_url="https://download.pytorch.org/whl/cu118")

    # NOT WORKING (kept for reference):
    # vLLM pinned to the commit from 07/19/2023
    #.pip_install(
    #    "vllm @ git+https://github.com/vllm-project/vllm.git@bda41c70ddb124134935a90a0d51304d2ac035e8"
    #)

    # vLLM pinned to the commit from 10/16/23.
    .pip_install(
        "vllm @ git+https://github.com/vllm-project/vllm.git@651c614aa43e497a2e2aab473493ba295201ab20"
    )
    # hf-transfer together with HF_HUB_ENABLE_HF_TRANSFER=1 -- presumably to
    # speed up the Hub download in the next step; TODO confirm.
    .pip_install("hf-transfer~=0.1")
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    # Run the model download as part of the image definition, with the
    # "huggingface" secret attached so HUGGINGFACE_TOKEN is available to it.
    .run_function(download_model_to_folder, secret=Secret.from_name("huggingface"))
)

# Modal stub named "vllm"; the Model class and the web endpoint below are
# registered on it, and it uses the image defined above.
stub = Stub("vllm", image=image)


# ## The model class
#
# The inference function is best represented with Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions) and the `__enter__` method.
# This enables us to load the model into memory just once every time a container starts up, and keep it cached
# on the GPU for each subsequent invocation of the function.
#
# The `vLLM` library allows the code to remain quite clean.
@stub.cls(gpu="A100", secret=Secret.from_name("huggingface"))
class Model:
    """vLLM-backed JSON generator registered on the Modal stub with an A100 GPU.

    `__enter__` loads the model and builds the prompt template; `generate`
    reuses both for every call made against the same instance.
    """

    def __enter__(self):
        """Load the model from MODEL_DIR and define the prompt template."""
        from vllm import LLM

        # Tip: MPT models may require `trust_remote_code=true`.
        self.llm = LLM(MODEL_DIR)
        # The fixed system preamble and the already-opened ```json fence at the
        # end of the template steer the model towards emitting JSON directly.
        self.template = """SYSTEM: Always correctly output response data as correctly formatted json in a codeblock\n{system}
USER: {input}
ASSISTANT: ```json\n"""

    @method()
    def generate(
        self,
        system: str,
        inputs: List[str],
        max_tokens: int = 800,
        temperature: float = 0.1,
        presence_penalty: float = 1.15,
    ):
        """Generate one completion per entry of `inputs`.

        Args:
            system: Text inserted into the template's {system} slot.
            inputs: User texts; one prompt is built for each entry.
            max_tokens: Forwarded to `SamplingParams(max_tokens=...)`.
            temperature: Forwarded to `SamplingParams(temperature=...)`.
            presence_penalty: Forwarded to `SamplingParams(presence_penalty=...)`.

        Returns:
            A `(results, num_tokens)` tuple: the text of the first completion
            of every prompt, and the total number of token ids across those
            completions. Empty `inputs` yields `([], 0)`.
        """
        # Nothing to generate: skip the engine call entirely.
        if not inputs:
            return [], 0

        from vllm import SamplingParams

        prompts = [self.template.format(system=system, input=text) for text in inputs]
        sampling_params = SamplingParams(
            temperature=temperature,
            # The prompt already opens a ```json block, so stopping at the next
            # ``` ends generation where the model closes that code block.
            stop="```",
            top_p=1,
            max_tokens=max_tokens,
            presence_penalty=presence_penalty,
        )
        # Only the first completion of each request is used.
        completions = [
            request.outputs[0]
            for request in self.llm.generate(prompts, sampling_params)
        ]
        results = [completion.text for completion in completions]
        num_tokens = sum(len(completion.token_ids) for completion in completions)
        return results, num_tokens


class InputModel(BaseModel):
    # Request body of the POST "/" endpoint.

    # Passed to Model.generate as `system`.
    system: str
    # Passed to Model.generate as `inputs`.
    data: List[str]
    # Sampling settings, passed to Model.generate under the same names; the
    # defaults here repeat that method's own defaults.
    max_tokens: int = 800
    temperature: float = 0.1
    presence_penalty: float = 1.15


@app.post("/")
def main(input: InputModel):
    """Handle POST "/": run the model on `input.data` and return the results.

    Every completion is decoded as JSON where possible; a completion that
    cannot be decoded is returned as raw text in the same position.

    Returns:
        A dict with "data" (one entry per completion) and "num_tokens" (the
        token count reported by `Model.generate`).
    """

    def parse_or_raw(text):
        # Best effort by design: report the decoding problem and fall back to
        # the unparsed text rather than failing the whole request.
        try:
            parsed = json.loads(text)
        except Exception as err:
            print(err)
            return text
        return parsed

    # Invoked through `.remote`; the original source keeps a disabled `.call`
    # spelling of this same invocation.
    completions, token_count = Model().generate.remote(
        system=input.system,
        inputs=input.data,
        max_tokens=input.max_tokens,
        temperature=input.temperature,
        presence_penalty=input.presence_penalty,
    )
    decoded = list(map(parse_or_raw, completions))
    return {"data": decoded, "num_tokens": token_count}


@stub.function(image=image)
@asgi_app()
def fastapi_app():
    """Return the module-level FastAPI `app`; registered on the stub as an ASGI app."""
    return app

Overwriting vllm_llama-2-13b.py


# Run in Modal

- might throw errors, e.g. while installing some packages
- solution: run it again

In [None]:
# Serve
!modal serve vllm_llama-2-13b.py

# Check URL in Browser
https://aknip--vllm-fastapi-app-dev.modal.run/docs

# Send POST via Terminal
```
curl -X 'POST' \
  'https://aknip--vllm-fastapi-app-dev.modal.run' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
   "system":"Extract Users from `Interface Users { users: Array<{name: string, age:number}>}`",
   "data":[
      "James, 33, and Isabella, 23, are among the users with Benjamin, who is 34, Mia, 30, and Ethan, 28.",
      "Evelyn, 25, and Jacob, 29, are part of the records, along with Abigail, 27, Liam, 32, and Harper, 26."
   ]
}'
```

In [None]:
# Local run
# 1st run / initial run:
# 2nd run:
# 3rd run:
!modal run vllm_llama-2-13b.py

In [None]:
# Deploy server permanently
!modal deploy vllm_llama-2-13b.py