<a href="https://colab.research.google.com/drive/1_8CWyOEpMH0eQzY9l1wc1OevTJf0VVXb?usp=sharing" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Environment Setup
!pip install openllm[llama] bentoml vllm accelerate bitsandbytes --upgrade -q
!pip install "openllm[gptq]" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ -q

In [None]:
#@title [optional] Check the memory, and gpu info you have
import psutil
import torch

ram = psutil.virtual_memory()
ram_total = ram.total / (1024 ** 3)
print("MemTotal: %.2f GB" % ram_total)

print("=============GPU INFO=============")
if torch.cuda.is_available():
    !/opt/bin/nvidia-smi || ture
else:
    print("GPU NOT available")
    #print("RUN `openllm models` to find modles which can runable on CPU")

In [None]:
#@title [Optional] start the llama2 server locally using openllm
#RUN `openllm start -h` for help
import sys

#run `openllm models` to find more model IDs of llama2
# MODEL_ID is interpolated into the `openllm start` command at the end of this cell.
MODEL_ID = "TheBloke/Llama-2-13B-chat-GPTQ" #@param ["TheBloke/Llama-2-7b-Chat-GPTQ", "TheBloke/Llama-2-13B-chat-GPTQ", "TheBloke/Llama-2-70B-chat-GPTQ"]

# Only when running inside Colab: print the Colab proxy URL for port 8001,
# which is the same port the server below is started on.
if 'google.colab' in sys.modules:
  #using colab proxy URL
  from google.colab.output import eval_js
  print("you are in colab runtime. please try it out in %s" % eval_js("google.colab.kernel.proxyPort(8001)"))

# Start the llama server for MODEL_ID with the `pt` backend and GPTQ quantization on port 8001.
# NOTE(review): presumably this command keeps running (and the cell stays busy) until the
# server is stopped — confirm before relying on later cells running automatically.
! openllm start llama --model-id {MODEL_ID} --backend pt  --quantize gptq --port 8001

In [None]:
#@title Build bentos locally using Openllm
#RUN `openllm build -h` for help
# The model ID is written out literally here (no `{MODEL_ID}` interpolation),
# so edit this line directly to build a bento for a different model.
!openllm build llama --model-id TheBloke/Llama-2-13B-chat-GPTQ --backend pt --quantize gptq

In [None]:
#@title Check the bentos you just build, and push them to bentocloud
!bentoml list -o json

return_code = !bentoml cloud list-context

if "colab-user" not in ''.join(return_code):
  #login bentocloud
  endpoint = input("input endpoint (like https://xxx.cloud.bentoml.com): ")
  token = input("input token (please follow https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html#creating-an-api-token):")
  !bentoml cloud login --api-token {token} --endpoint {endpoint} --context colab-user

#change to your own bentos tag
!bentoml push thebloke--llama-2-13b-chat-gptq-service:ec124ec7c8f14b67b0808b870b08497ce27634fa --context colab-user

In [None]:
#@title Follow the [guide](https://www.bentoml.com/blog/deploying-llama-2-7b-on-bentocloud) to deploy this llama model on bentocloud

In [None]:
# List the local bentos as JSON; each entry shows the bento's tag, size and creation time.
!bentoml list -o json

[1m[[0m
  [1m{[0m
    [32m"tag"[0m: 
[32m"thebloke--llama-2-13b-chat-gptq-service:ec124ec7c8f14b67b0808b870b08497ce27634f[0m
[32ma"[0m,
    [32m"size"[0m: [32m"34.48 KiB"[0m,
    [32m"creation_time"[0m: [32m"2023-09-20 09:58:35"[0m
  [1m}[0m
[1m][0m


In [None]:
#@title Or use bentoml client to start a deployment
import bentoml
import json

return_code = !bentoml cloud list-context
if "colab-user" not in ''.join(return_code):
  print("please login first!")
else:
  client = bentoml.cloud.BentoCloudClient()
  #detailed configuration in https://docs.bentoml.com/en/latest/bentocloud/reference/deployment-creation-and-update-info.html
  #runner config
  runner = bentoml.cloud.Resource.for_runner(
      resource_instance="starter-aws-g4dn-xlarge-gpu-t4-xlarge",
      hpa_conf={"min_replicas": 1, "max_replicas": 1},
  )
  #api-server hpa config
  api_server = bentoml.cloud.Resource.for_api_server(
      resource_instance="starter-aws-t3-2xlarge-cpu-small",
  )
  hpa_conf = bentoml.cloud.Resource.for_hpa_conf(min_replicas=1, max_replicas=1)

  res = client.deployment.create(
      deployment_name="test-thebloke",
      bento="thebloke--llama-2-13b-chat-gptq-service:ec124ec7c8f14b67b0808b870b08497ce27634fa",
      context = "colab-user",
      cluster_name = "default",
      #mode="function",
      kube_namespace='yatai',
      runners_config={"llm-llama-runner": runner},
      api_server_config=api_server,
      hpa_conf=hpa_conf,
  )
  print(json.dump(res, indent=4))