<a href="https://colab.research.google.com/drive/158zwSM__zs0caehysLinxLkjY7_naqcK?usp=sharing" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Environment Setup
!pip install openllm[llama] bentoml vllm --upgrade >/dev/null 2>&1

In [2]:
#@title [optional] Check the memory, and gpu info you have
import psutil
import torch

ram = psutil.virtual_memory()
ram_total = ram.total / (1024 ** 3)
print("MemTotal: %.2f GB" % ram_total)

print("=============GPU INFO=============")
if torch.cuda.is_available():
    !/opt/bin/nvidia-smi || ture
else:
    print("GPU NOT available")
    #print("RUN `openllm models` to find modles which can runable on CPU")

MemTotal: 12.68 GB
Sat Sep 16 03:15:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8    10W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+--------------------------------------------------------------------

In [4]:
#@title Define the llama service, modify this if you want to customize
%%file service.py
import bentoml
import openllm
import openllm_core
import os
import typing as t

# Run `openllm models` to find more Llama 2 model IDs.
# The `#@param` annotations below render as Colab form fields; keep them on the assignment lines.
MODEL_ID = "NousResearch/llama-2-7b-chat-hf"  #@param ["NousResearch/llama-2-7b-chat-hf", "NousResearch/llama-2-13b-chat-hf","NousResearch/llama-2-70b-chat-hf"]
BACKEND = "vllm"  #@param ["pt", "vllm"]


# Export the selections as environment variables before the runner is created.
# NOTE(review): presumably OpenLLM reads these when building the runner - confirm.
os.environ['OPENLLM_MODEL_ID'] = MODEL_ID
os.environ['OPENLLM_BACKEND'] = BACKEND

# Model family name passed to both AutoConfig and Runner.
model = "llama"

llm_config = openllm.AutoConfig.for_model(model)
llm_runner = openllm.Runner(model, llm_config=llm_config)

# The BentoML service that `bentofile.yaml` and `bentoml serve` refer to as `service:svc`.
svc = bentoml.Service(name="llama-service", runners=[llm_runner])

# Shared JSON input descriptor for both API endpoints, built from a sample payload with
# a prompt, the flattened LLM config, and an optional adapter name.
_JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config.model_dump(flatten=True), 'adapter_name': None})


@svc.on_startup
def download(_: bentoml.Context):
  """Startup hook registered on the service: ask the runner to download its model.

  The BentoML context argument is accepted but not used.
  """
  llm_runner.download_model()

@svc.api(
    route='/v1/generate',
    input=_JsonInput,
    output=bentoml.io.JSON.from_sample({'responses': [], 'configuration': llm_config.model_dump(flatten=True)}),
)
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
  """Handle `/v1/generate`: run one generation request and return all responses at once.

  Args:
    input_dict: Request payload; it is unpacked into a `GenerationInput` built from
      the service's `llm_config`.

  Returns:
    A `GenerationOutput` holding the runner's responses and the dumped generation
    configuration that was used for the call.
  """
  request = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
  generation_config = request.llm_config.model_dump()
  if llm_runner.backend != 'vllm':
    responses = await llm_runner.generate.async_run(
        request.prompt,
        adapter_name=request.adapter_name,
        **generation_config)
  else:
    # Only the vLLM runner method is given a per-call request id.
    responses = await llm_runner.vllm_generate.async_run(
        request.prompt,
        adapter_name=request.adapter_name,
        request_id=openllm_core.utils.gen_random_uuid(),
        **generation_config)
  return openllm.GenerationOutput(responses=responses, configuration=generation_config)

@svc.api(
    route='/v1/generate_stream',
    input=_JsonInput,
    output=bentoml.io.Text(content_type='text/event-stream'),
)
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
  """Handle `/v1/generate_stream`: return the runner's stream for one generation request.

  Args:
    input_dict: Request payload. An optional 'echo' key (default False) is removed
      from it; the remaining keys are unpacked into a `GenerationInput` built from
      the service's `llm_config`.

  Returns:
    The object returned by the runner's `async_stream` call for the active backend.
  """
  # 'echo' is popped first so it is not passed to the GenerationInput constructor.
  echo = input_dict.pop('echo', False)
  request = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
  if llm_runner.backend != 'vllm':
    return llm_runner.generate_iterator.async_stream(
        request.prompt,
        adapter_name=request.adapter_name,
        echo=echo,
        **request.llm_config.model_dump())
  # Only the vLLM runner method is given a per-call request id.
  return llm_runner.vllm_generate_iterator.async_stream(
      request.prompt,
      adapter_name=request.adapter_name,
      echo=echo,
      request_id=openllm_core.utils.gen_random_uuid(),
      **request.llm_config.model_dump())


Overwriting service.py


In [5]:
#@title Define the bentofile.yaml (modify it following https://docs.bentoml.com/en/latest/concepts/bento.html)
%%file bentofile.yaml
# Entry point: the `svc` object defined in service.py.
service: 'service:svc'
# Files from the build directory to include in the bento.
include:
  - '*.py'
# Python packages for the bento's environment. These mirror the notebook's setup cell:
# OpenLLM with its llama extra, plus vLLM, which service.py selects as its default backend.
python:
  packages:
    - 'openllm[llama]'
    - vllm

Writing bentofile.yaml


In [6]:
#@title Build the llama bento using bentoml (service.py bentofile.yaml)
!bentoml build -f bentofile.yaml

Downloading (…)lve/main/config.json: 100% 583/583 [00:00<00:00, 3.30MB/s]
Downloading (…)okenizer_config.json: 100% 746/746 [00:00<00:00, 3.94MB/s]
Downloading tokenizer.model: 100% 500k/500k [00:00<00:00, 17.9MB/s]
Downloading (…)/main/tokenizer.json: 100% 1.84M/1.84M [00:00<00:00, 6.88MB/s]
Downloading (…)in/added_tokens.json: 100% 21.0/21.0 [00:00<00:00, 95.7kB/s]
Downloading (…)cial_tokens_map.json: 100% 435/435 [00:00<00:00, 2.22MB/s]
Fetching 11 files:   0% 0/11 [00:00<?, ?it/s]
Downloading (…)neration_config.json: 100% 179/179 [00:00<00:00, 835kB/s]
Fetching 11 files:  27% 3/11 [00:00<00:02,  3.59it/s]
Downloading (…)fetensors.index.json: 100% 26.8k/26.8k [00:00<00:00, 82.6MB/s]

Downloading (…)model.bin.index.json: 100% 26.8k/26.8k [00:00<00:00, 80.9MB/s]

Downloading (…)of-00002.safetensors:   0% 0.00/3.50G [00:00<?, ?B/s][A

Downloading (…)of-00002.safetensors:   0% 0.00/9.98G [00:00<?, ?B/s][A[A
Downloading (…)of-00002.safetensors:   1% 21.0M/3.50G [00:00<00:26, 131MB/s]

In [None]:
#@title Or build bentos using Openllm
# Run `openllm build -h` to see the available options.
# Builds for the llama model with the given model ID, using the "pt" backend.
!openllm build llama --model-id NousResearch/llama-2-7b-chat-hf --backend pt

In [None]:
#@title Check the bentos you just build, and push them to bentocloud if you want
! bentoml list
endpoint = input("input endpoint (like https://xxx.cloud.bentoml.com): ")
token = input("input token (please follow https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html#creating-an-api-token):")

#!bentoml cloud login --api-token {token} --endpoint {endpoint} --context colab-user
!bentoml push xxxx    --context colab-user

[1m [0m[1mTag                                           [0m[1m [0m[1m [0m[1mSize     [0m[1m [0m[1m [0m[1mCreation Time      [0m[1m [0m
 llama-service:lqwm2tctp2d3masc                  16.42 KiB  2023-09-15 04:17:08 
 nousresearch--llama-2-7b-hf-service:dacdfcde3…  34.43 KiB  2023-09-15 03:39:54 
[
  "colab-user"
]


In [None]:
#@title [optional] Start the llama server locally using `bentoml` command
from google.colab.output import eval_js
print("try it out in %s" % eval_js("google.colab.kernel.proxyPort(3000)"))
!bentoml serve service:svc

try it out in https://reb1lygcbue-496ff2e9c6d22116-3000-colab.googleusercontent.com/
2023-09-16T03:41:15+0000 [INFO] [cli] Prometheus metrics for HTTP BentoServer from "service:svc" can be accessed at http://localhost:3000/metrics.
2023-09-16T03:41:16+0000 [INFO] [cli] Starting production HTTP BentoServer from "service:svc" listening on http://0.0.0.0:3000 (Press CTRL+C to quit)
INFO 09-16 03:41:28 llm_engine.py:72] Initializing an LLM engine with config: model='/root/bentoml/models/vllm-nousresearch--llama-2-7b-chat-hf/37892f30c23786c0d5367d80481fa0d9fba93cf8', tokenizer='hf-internal-testing/llama-tokenizer', tokenizer_mode=auto, trust_remote_code=False, dtype=torch.float16, download_dir=None, load_format=auto, tensor_parallel_size=1, seed=0)
Downloading (…)okenizer_config.json: 100% 700/700 [00:00<00:00, 4.11MB/s]
Downloading tokenizer.model: 100% 500k/500k [00:00<00:00, 17.6MB/s]
Downloading (…)/main/tokenizer.json: 100% 1.84M/1.84M [00:00<00:00, 5.48MB/s]
Downloading (…)cial_tokens