<a href="https://colab.research.google.com/drive/1HOaFA1ogMDPalGw_e55VcfDhiiblzrFO?usp=sharing" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup and Install
- It's best to run fLlama on a GPU, which you can do using a free Colab notebook.
- Check the current runtime type in the top-right corner of Google Colab.
- Or, go to the menu -> Runtime -> Change Runtime Type.
- Select GPU (T4).

In [None]:
##@title Clone the project and install dependencies
UPDATE_OPENLLM = True  #@param {type:"boolean"}
WORKSPACE = 'OpenLLM'
PROJECT_NAME = 'examples/fLlama-demo'

![ ! -d $WORKSPACE ] && echo -= Initial setup OpenLLM =- && git clone https://github.com/bentoml/OpenLLM.git
%cd $WORKSPACE

if UPDATE_OPENLLM:
  !echo -= Updating openllm =-
  !git fetch origin feat/llama-colab && git checkout feat/llama-colab && git pull

%cd {PROJECT_NAME}

!echo -= Install dependencies =-
![ -f requirements.txt ] && pip install -q -r requirements.txt

In [None]:
#@title [optional] Check the memory, gpu
import psutil
import torch

ram = psutil.virtual_memory()
ram_total = ram.total / (1024 ** 3)
print("MemTotal: %.2f GB" % ram_total)

print("=============GPU INFO=============")
if torch.cuda.is_available():
    !/opt/bin/nvidia-smi || ture
else:
    print("GPU NOT available")
    #print("RUN `openllm models` to find modles which can runable on CPU")

## Download the model from Hugging Face
https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2


In [None]:
#@title The bentoml runner is defined in runner.py; running it executes download_model to download the model
# NOTE(review): presumably this fetches the Hugging Face model linked above
# into the local bentoml model store — confirm against runner.py.
!python runner.py

In [4]:
#@title Check the model we just downloaded
# List the models known to bentoml, formatted as JSON.
!bentoml models list -o json

[1m[[0m
  [1m{[0m
    [1;34m"tag"[0m: [32m"vllm-trelis--llama-2-7b-chat-hf-function-calling-v2:7013579ab9c47d409e0c9c02a03de4027a65e566"[0m,
    [1;34m"module"[0m: [32m"openllm.serialisation.transformers"[0m,
    [1;34m"size"[0m: [32m"12.55 GiB"[0m,
    [1;34m"creation_time"[0m: [32m"2023-09-26 11:11:16"[0m
  [1m}[0m
[1m][0m


## Prepare the prompt for fLlama

In [5]:
#@title Build a sample prompt query
# Import only the two names this cell uses instead of `from prompt import *`,
# so it is clear where they come from and the notebook namespace stays clean.
from prompt import HVAC_FUNCS, prompt_template

# prompt_template(HVAC_FUNCS) returns a callable that is applied to the user's
# utterance; as the last expression of the cell, its result is displayed.
prompt_template(HVAC_FUNCS)("please shut down the AC")

'<FUNCTIONS>{\n    "function": "HVAC_CONTROL",\n    "description": "Call an API to adjust the AC setting in the car.",\n    "arguments": [\n        {\n            "name": "action",\n            "description": "The type of action requested, must be one of the following:\\n\'SET_TEMPERATURE\': set, increase, decrease or turn on AC to a desired temperature. Must be used with the temperature argument;\\n\'UP\': increase the temperature from current setting. If a specific temperature is given, use SET_TEMPERATURE instead;\\n\'DOWN\': decrease the temperature from current setting. If a specific temperature is given, use SET_TEMPERATURE instead;\\n\'ON\': turn on the AC;\\n\'OFF\': turn off the AC;\\n            ",\n            "enum": [\n                "ON",\n                "OFF",\n                "UP",\n                "DOWN",\n                "SET_TEMPERATURE"\n            ],\n            "type": "string"\n        },\n        {\n            "name": "temperature",\n            "type": "nu

## [Optional] Start the server

In [None]:
#@title Start the llama server locally using `bentoml` command.
#bentoml serve will use the service.py under this folder to start a llama2 server
import sys
if 'google.colab' in sys.modules:
  #using colab proxy URL
  from google.colab.output import eval_js
  print("you are in colab runtime. please try it out in %s" % eval_js("google.colab.kernel.proxyPort(8001)"))

RUN_IN_BACKGROUND = False #@param {type:"boolean"}
if RUN_IN_BACKGROUND:
  !nohup bentoml serve service:svc -p 8001 &
else:
  !bentoml serve service:svc -p 8001

In [10]:
#@title [Optional] If you run the server in the background, you can test it from the Colab env
# POST a plain-text utterance to the /query endpoint of the server on
# 127.0.0.1:8001 and request a JSON response.
!curl -X 'POST' \
  'http://127.0.0.1:8001/query' \
  -H 'accept: application/json' \
  -H 'Content-Type: text/plain' \
  -d 'i feel a little bit cold'

Not able to process the request in 60 seconds

## Build the Bento

In [7]:
%%file bentofile.yaml
service: 'service:svc'
include:
  - '*.py'
python:
  packages:
    - openllm[llama,vllm]
    - git+https://github.com/huggingface/accelerate.git

Writing bentofile.yaml


In [None]:
#@title Build the llama bento using bentoml on bentofile.yaml
# Builds from the bentofile.yaml written by the previous cell.
!bentoml build -f bentofile.yaml

In [9]:
#@title Check the bentos you just built
# List the bentos known to bentoml, formatted as JSON. Note the "tag" value:
# the push and deployment cells below reference a bento by its tag.
! bentoml list -o json

[1m[[0m
  [1m{[0m
    [32m"tag"[0m: [32m"assistant:nudslos4l632easc"[0m,
    [32m"size"[0m: [32m"19.07 KiB"[0m,
    [32m"creation_time"[0m: [32m"2023-09-26 11:28:36"[0m
  [1m}[0m
[1m][0m


In [None]:
!pip install -U git+https://github.com/bentoml/BentoML.git@fix/push_oom

In [None]:
#@title Push them to bentocloud if you want
! bentoml list -o json
return_code = !bentoml cloud list-context
print(return_code)
if "colab-user" not in ''.join(return_code):
  #login bentocloud
  endpoint = input("input endpoint (like https://xxx.cloud.bentoml.com):")
  token = input("input token (please follow https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html#creating-an-api-token):")
  !bentoml cloud login --api-token {token} --endpoint {endpoint} --context colab-user

#change to your own bentos tag
!bentoml push assistant:nudslos4l632easc  --context colab-user

In [None]:
#@title Follow the [guide](https://www.bentoml.com/blog/deploying-llama-2-7b-on-bentocloud) to deploy this llama model on bentocloud

In [None]:
#@title Or use bentoml client sdk to start a deployment
import bentoml
import json

return_code = !bentoml cloud list-context
if "colab-user" not in ''.join(return_code):
  print("please login first!")
else:
  client = bentoml.cloud.BentoCloudClient()
  #detailed configuration in https://docs.bentoml.com/en/latest/bentocloud/reference/deployment-creation-and-update-info.html
  #runner config
  runner = bentoml.cloud.Resource.for_runner(
      resource_instance="starter-aws-g4dn-xlarge-gpu-t4-xlarge",
      hpa_conf={"min_replicas": 1, "max_replicas": 1},
  )
  #api-server hpa config
  api_server = bentoml.cloud.Resource.for_api_server(
      resource_instance="starter-aws-t3-2xlarge-cpu-small",
  )
  hpa_conf = bentoml.cloud.Resource.for_hpa_conf(min_replicas=1, max_replicas=1)

  res = client.deployment.create(
      deployment_name="test-codellama",
      bento="assistant:nudslos4l632easc",
      context = "colab-user",
      cluster_name = "default",
      mode="deployment",
      kube_namespace='yatai',
      runners_config={"llm-llama-runner": runner},
      api_server_config=api_server,
      hpa_conf=hpa_conf,
  )
  print(json.dumps(res, indent=4))
  #!bentoml deployment create -f deployment.json --context colab-user