<a href="https://colab.research.google.com/github/bentoml/OpenLLM/blob/feat%2Fllama-example/example/llama/openllm_llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Environment Setup
# Install or upgrade OpenLLM (with its `llama` extra), BentoML and vLLM.
# Both stdout and stderr are redirected to /dev/null, so a failed install is
# silent; drop the redirection when debugging dependency problems.
!pip install openllm[llama] bentoml vllm --upgrade >/dev/null 2>&1

In [None]:
#@title [optional] Check the memory, and gpu info you have
import psutil
import torch

ram = psutil.virtual_memory()
ram_total = ram.total / (1024 ** 3)
print("MemTotal: %.2f GB" % ram_total)

print("=============GPU INFO=============")
if torch.cuda.is_available():
    !/opt/bin/nvidia-smi || ture
else:
    print("GPU NOT available")
    #print("RUN `openllm models` to find modles which can runable on CPU")

MemTotal: 12.68 GB
GPU NOT available


In [None]:
#@title Define the llama service, modify this if you want to customize
%%file service.py
import bentoml
import openllm
import openllm_core
import os
import typing as t

# Run `openllm models` to find more Llama 2 model IDs.
MODEL_ID = "NousResearch/llama-2-7b-chat-hf"  #@param ["NousResearch/llama-2-7b-chat-hf", "NousResearch/llama-2-13b-chat-hf","NousResearch/llama-2-70b-chat-hf"]
BACKEND = "pt"  #@param ["pt", "vllm"]


# Export the selections as environment variables before the runner is created.
# NOTE(review): presumably OpenLLM reads OPENLLM_MODEL_ID / OPENLLM_BACKEND when
# the runner is built -- confirm against the installed OpenLLM version.
os.environ['OPENLLM_MODEL_ID'] = MODEL_ID
os.environ['OPENLLM_BACKEND'] = BACKEND

# Model family name passed to both AutoConfig and Runner below.
model = "llama"

# Configuration object for the model family, and the runner built from it.
llm_config = openllm.AutoConfig.for_model(model)
llm_runner = openllm.Runner(model, llm_config=llm_config)

# The BentoML service object; it owns the single LLM runner.
svc = bentoml.Service(name="llama-service", runners=[llm_runner])

# JSON input descriptor shared by both API endpoints. The sample payload carries
# a prompt, the flattened configuration dump and an optional adapter name.
_JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config.model_dump(flatten=True), 'adapter_name': None})


@svc.on_startup
def download(_: bentoml.Context) -> None:
  """Startup hook that asks the runner to download the model.

  Registered through ``svc.on_startup``. The context argument is accepted
  but not used.
  """
  llm_runner.download_model()

@svc.api(
  route='/v1/generate',
  input=_JsonInput,
  output=bentoml.io.JSON.from_sample({'responses': [], 'configuration': llm_config.model_dump(flatten=True)}),
)
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
  """Handle ``/v1/generate``: run one generation and return it with its config.

  ``input_dict`` is the decoded JSON request. It is parsed into a
  ``GenerationInput`` built from ``llm_config``, which supplies the prompt,
  the adapter name and the generation configuration.

  The runner method depends on ``llm_runner.backend``. The ``'vllm'`` backend
  uses ``vllm_generate`` and additionally receives a freshly generated
  request id; every other backend uses ``generate``.
  """
  request = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
  generation_kwargs = request.llm_config.model_dump()

  if llm_runner.backend != 'vllm':
    outputs = await llm_runner.generate.async_run(
      request.prompt,
      adapter_name=request.adapter_name,
      **generation_kwargs,
    )
  else:
    outputs = await llm_runner.vllm_generate.async_run(
      request.prompt,
      adapter_name=request.adapter_name,
      request_id=openllm_core.utils.gen_random_uuid(),
      **generation_kwargs,
    )

  # The same configuration dump that drove the generation is echoed back.
  return openllm.GenerationOutput(responses=outputs, configuration=generation_kwargs)

@svc.api(
  route='/v1/generate_stream',
  input=_JsonInput,
  output=bentoml.io.Text(content_type='text/event-stream'),
)
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
  """Handle ``/v1/generate_stream``: return the runner's streaming result.

  ``echo`` is removed from ``input_dict`` (default ``False``) before the
  remainder is parsed into a ``GenerationInput``; it is then forwarded to the
  runner as a keyword argument.

  The ``'vllm'`` backend streams through ``vllm_generate_iterator`` and
  additionally receives a freshly generated request id; every other backend
  streams through ``generate_iterator``.
  """
  echo = input_dict.pop('echo', False)
  request = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)

  # Non-vLLM backends take the plain iterator and need no request id.
  if llm_runner.backend != 'vllm':
    return llm_runner.generate_iterator.async_stream(
      request.prompt,
      adapter_name=request.adapter_name,
      echo=echo,
      **request.llm_config.model_dump(),
    )

  return llm_runner.vllm_generate_iterator.async_stream(
    request.prompt,
    adapter_name=request.adapter_name,
    echo=echo,
    request_id=openllm_core.utils.gen_random_uuid(),
    **request.llm_config.model_dump(),
  )


Writing service.py


In [None]:
#@title Define the bentofile.yaml (modify it following https://docs.bentoml.com/en/latest/concepts/bento.html)
%%file bentofile.yaml
# Service entry point: the `svc` object defined in service.py.
service: 'service:svc'
# File patterns to include in the bento.
include:
  - '*.py'
python:
  # Python packages required by the service.
  packages:
    - openllm

Writing bentofile.yaml


In [None]:
#@title Build the llama bento using bentoml (service.py bentofile.yaml)
# Build from the bentofile.yaml written by the previous cell.
!bentoml build -f bentofile.yaml

In [None]:
#@title Or build the bento using OpenLLM
# Run `openllm build -h` for help.
# The model ID and backend are hard-coded here; they match the defaults chosen
# in the service.py cell.
!openllm build llama --model-id NousResearch/llama-2-7b-chat-hf --backend pt

In [None]:
#@title Check the bentos you just build, and push them to bentocloud if you want
! bentoml list
endpoint = input("input endpoint (like https://xxx.cloud.bentoml.com): ")
token = input("input token (please follow https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html#creating-an-api-token):")

#!bentoml cloud login --api-token {token} --endpoint {endpoint} --context colab-user
!bentoml push xxxx    --context colab-user

[1m [0m[1mTag                                           [0m[1m [0m[1m [0m[1mSize     [0m[1m [0m[1m [0m[1mCreation Time      [0m[1m [0m
 llama-service:lqwm2tctp2d3masc                  16.42 KiB  2023-09-15 04:17:08 
 nousresearch--llama-2-7b-hf-service:dacdfcde3…  34.43 KiB  2023-09-15 03:39:54 
[
  "colab-user"
]


In [None]:
#@title [optional] Start the llama server locally using `bentoml` command
from google.colab.output import eval_js
# Ask the Colab frontend for the proxy URL of port 3000 and print it before the
# server is started.
# NOTE(review): presumably 3000 is the port `bentoml serve` listens on by
# default -- confirm if a different port is configured.
print("try it out in %s" % eval_js("google.colab.kernel.proxyPort(3000)"))
!bentoml serve service:svc -q

try it out in https://dssd5iaijrq-496ff2e9c6d22116-3000-colab.googleusercontent.com/
