In [1]:
!nvidia-smi

Wed Jul 12 02:19:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install bitsandbytes==0.39.0
!pip install transformers@git+https://github.com/huggingface/transformers.git
!pip install peft@git+https://github.com/huggingface/peft.git
!pip install accelerate@git+https://github.com/huggingface/accelerate.git
!pip install einops==0.6.1
!pip install sentencepiece==0.1.99
!pip install -q xformers

Collecting bitsandbytes==0.39.0
  Downloading bitsandbytes-0.39.0-py3-none-any.whl (92.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/92.2 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.39.0
Collecting transformers@ git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-install-w2gut_rd/transformers_43234ad0f68146b0a3f3e1e2f0dc172e
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-install-w2gut_rd/transformers_43234ad0f68146b0a3f3e1e2f0dc172e
  Resolved https://github.com/huggingface/transformers.git to commit 45025d92f815675e483f32812caa28cce3a960e7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting huggingface-hub<

In [4]:
!pip install -q "ray[serve]"
!pip install -q starlette
!pip install -q langchain

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
%%writefile test.py
import torch
from transformers import pipeline
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain

import ray
from ray import serve

from starlette.requests import Request


template = """Question: {question}

Answer: Let's think step by step."""

PROMPT = PromptTemplate(template=template, input_variables=["question"])

# ray.init(num_gpus=1)

class LocalLLM(HuggingFacePipeline):

  @classmethod
  def from_model_id(cls):
    model_name = "huggyllama/llama-7b"
    adapters_name = 'timdettmers/guanaco-7b'

    model = AutoModelForCausalLM.from_pretrained(
                                            model_name,
                                            load_in_4bit=True,
                                            torch_dtype=torch.bfloat16,
                                            device_map="auto",
                                            max_memory= {i: '24000MB' for i in range(torch.cuda.device_count())},
                                            quantization_config=BitsAndBytesConfig(
                                                load_in_4bit=True,
                                                bnb_4bit_compute_dtype=torch.bfloat16,
                                                bnb_4bit_use_double_quant=True,
                                                bnb_4bit_quant_type='nf4'
                                            ),
                                            )
    model = PeftModel.from_pretrained(model, adapters_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    pipe =  pipeline(
                      "text-generation",
                      model=model,
                      tokenizer=tokenizer,
                      max_new_tokens=200,
                      do_sample = True,
                      top_k = 10,
                      num_return_sequences=1,
                      eos_token_id=tokenizer.eos_token_id,
                      )
    return cls(pipeline=pipe)


@serve.deployment(
    ray_actor_options={"num_gpus": 1},
    autoscaling_config={"min_replicas": 0, "max_replicas": 2},
)
class GPTModel:
    def __init__(self):
        self.llm = LocalLLM.from_model_id()
        self.chain = LLMChain(llm=self.llm, prompt=PROMPT)


    def _run_chain(self, text: str):
        return self.chain(text)

    async def __call__(self, request: Request):
        # 1. Parse the request
        text = request.query_params["text"]
        # 2. Run the chain
        resp = self._run_chain(text)
        # 3. Return the response
        return resp["text"]

deployment = GPTModel.bind()


Writing test.py


In [6]:
!pip install -q  pyngrok

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/681.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m681.2/681.2 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone


In [7]:
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null && echo "deb https://ngrok-agent.s3.amazonaws.com buster main" | sudo tee /etc/apt/sources.list.d/ngrok.list && sudo apt update && sudo apt install ngrok

deb https://ngrok-agent.s3.amazonaws.com buster main
[33m0% [Working][0m            Hit:1 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease
[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (91.189.91.38)] [Wa[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu focal InRelease
                                                                               Get:3 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
                                                                               Get:4 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
[33m0% [3 InRelease 15.6 kB/114 kB 14%] [Connecting to security.ubuntu.com (91.189.[0m                                                                               Hit:5 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
[33m0% [3 InRelease 15.6 kB/114 kB 14%] [Connecting t

In [8]:
!ngrok authtoken "Ngrok_AUTH_TOKEN"

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [9]:
from pyngrok import ngrok
import subprocess

ngrok_tunnel = ngrok.connect(8000)
if ngrok_tunnel:
  print('Public URL:', ngrok_tunnel.public_url)
  command = ["serve", "run", "test:deployment"]
  subprocess.run(command)





Public URL: https://02e8-35-204-244-165.ngrok-free.app


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-2e030a433d0c>", line 8, in <cell line: 5>
    subprocess.run(command)
  File "/usr/lib/python3.10/subprocess.py", line 505, in run
    stdout, stderr = process.communicate(input, timeout=timeout)
  File "/usr/lib/python3.10/subprocess.py", line 1146, in communicate
    self.wait()
  File "/usr/lib/python3.10/subprocess.py", line 1209, in wait
    return self._wait(timeout=timeout)
  File "/usr/lib/python3.10/subprocess.py", line 1959, in _wait
    (pid, sts) = self._try_wait(0)
  File "/usr/lib/python3.10/subprocess.py", line 1917, in _try_wait
    (pid, sts) = os.waitpid(self.pid, wait_flags)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/

TypeError: ignored