In [1]:
import os
import time
import warnings
from typing import Optional

import ray
from dotenv import load_dotenv

In [2]:
# Populate the process environment via python-dotenv before reading settings.
load_dotenv()

# Cluster address handed to ray.init(address=...) further down.
RAY_CLIENT_URL = os.getenv('ray_client_url')
if not RAY_CLIENT_URL:
    # os.getenv returns None for a missing variable; surface that here instead
    # of letting the unset address flow silently into ray.init.
    warnings.warn(
        "Environment variable 'ray_client_url' is not set; "
        f"ray.init() will be called with address={RAY_CLIENT_URL!r}.",
        stacklevel=2,
    )

In [3]:
# The Python version must be the same on the client and on the server (3.9.15).

# Packages and environment variables requested for the job's runtime environment.
runtime_env = {
    "pip": [
        "accelerate>=0.16.0",
        "transformers>=4.26.0",
        "numpy<1.24",
        "torch",
    ],
    "env_vars": {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        # "RAY_worker_register_timeout_seconds": "600" was tried here as well;
        # leaving it out improves inference time by about 20%.
    },
}

# These _system_config overrides were also tried and had no effect on the
# inference time, so the config is passed empty:
#   "num_heartbeats_timeout": 300
#   "kill_idle_workers_interval_ms": 0
#   "idle_worker_killing_time_threshold_ms": 100000000
#   "kill_idle_workers_of_terminated_job": False
ray.init(
    address=RAY_CLIENT_URL,
    namespace="kuberay",
    runtime_env=runtime_env,
    _system_config={},
)

0,1
Python version:,3.9.15
Ray version:,2.5.0
Dashboard:,http://10.1.64.45:8265


In [4]:
# Switch off the streaming executor on the current Ray Data context.
dataset_context = ray.data.context.DatasetContext.get_current()
dataset_context.use_streaming_executor = False

In [5]:
# Text that the model is asked to continue; the pieces are joined with single spaces.
prompt = " ".join(
    [
        "In a shocking finding, scientists discovered a herd of unicorns living in a remote,",
        "previously unexplored valley, in the Andes Mountains. Even more surprising to the",
        "researchers was the fact that the unicorns spoke perfect English.",
    ]
)

In [6]:
import ray.data
import pandas as pd

# Wrap the single prompt in a one-row, one-column frame and hand it to Ray Data.
prompt_frame = pd.DataFrame({"prompt": [prompt]})
ds = ray.data.from_pandas(prompt_frame)

[2m[33m(raylet)[0m [2023-06-24 13:27:37,095 I 20792 20792] logging.cc:230: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1

Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode[0m


In [7]:
@ray.remote(num_cpus=10)
class PredictCallable:
    """Ray actor that loads a causal language model once and generates text with it.

    The actor requests 10 CPUs from Ray. The model and tokenizer are loaded in
    ``__init__`` so that every later ``__call__`` reuses them.
    """

    def __init__(self, model_id: str, revision: Optional[str] = None):
        """Load the model and tokenizer identified by ``model_id``.

        Args:
            model_id: Identifier passed to both ``from_pretrained`` calls.
            revision: Optional revision passed to the model's ``from_pretrained``
                only; the tokenizer is loaded without a revision.
        """
        print('__init__')
        # Imported here so the import runs inside the actor process.
        from transformers import AutoModelForCausalLM, AutoTokenizer

        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by wall-clock adjustments.
        start = time.perf_counter()
        # torch_dtype and device_map are intentionally not passed (both were
        # disabled in this notebook).
        # NOTE(review): presumably because the actor is CPU-only - confirm.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            revision=revision,
            low_cpu_mem_usage=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Batches containing prompts of different lengths must be padded before
        # they can be stacked into one tensor. Reuse EOS as the pad token when
        # the tokenizer does not define one, and pad on the left so generation
        # continues directly from the end of every prompt.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"
        self.model.tie_weights()
        end = time.perf_counter()
        print(f'model loaded successfully in: {end-start}s')

    def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:
        """Generate one sampled continuation per prompt in ``batch``.

        Args:
            batch: Frame with a ``"prompt"`` column of input texts.

        Returns:
            Frame with a single ``"responses"`` column holding the decoded
            sequences (prompt plus continuation), one row per input prompt.
            Special tokens (padding / end-of-text) are stripped from the text.
        """
        print('__call__')

        prompts = list(batch["prompt"])
        if not prompts:
            # Nothing to generate; keep the output schema stable.
            return pd.DataFrame([], columns=["responses"])

        # padding=True is a no-op for a single prompt and makes multi-prompt
        # batches of unequal length tokenizable.
        tokenized = self.tokenizer(prompts, return_tensors="pt", padding=True)

        input_ids = tokenized.input_ids.to(self.model.device)
        attention_mask = tokenized.attention_mask.to(self.model.device)

        # Sampling is enabled, so repeated calls return different text.
        # max_length bounds the total sequence length (prompt included).
        gen_tokens = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            temperature=0.9,
            max_length=100,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        # Skip special tokens so left-padding does not leak into the responses.
        responses = self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
        return pd.DataFrame(responses, columns=["responses"])

In [8]:
# Constructor arguments for the actor; creating it triggers PredictCallable.__init__ remotely.
actor_kwargs = {
    "model_id": "EleutherAI/gpt-j-6B",
    "revision": "float16",
}
actor = PredictCallable.remote(**actor_kwargs)

In [9]:
# Submit one generation request to the actor; `future` is resolved later with ray.get.
prompt_batch = pd.DataFrame({"prompt": [prompt]})
future = actor.__call__.remote(prompt_batch)

#### Load model time: 23s

- 1 cpu - 230 s / 182 s / model - 42 s
- 5 cpus - 111 s
- 8 cpus - 111 s / 65 s
- 9 cpus - 91 s / 54 s
- 10 cpus - 89 s / 54 s
- 15 cpus - 89 s
- 25 cpus - 89 s


In [10]:
# Measure how long we wait for the already-submitted request to finish.
# perf_counter is monotonic, which makes it the appropriate clock for intervals.
# NOTE: only the wait inside ray.get is timed; for the first request this wait
# may also cover actor start-up and model loading, not just generation.
start = time.perf_counter()
gen = ray.get(future)
end = time.perf_counter()
print(f'inference time: {end-start}s')

[2m[33m(raylet, ip=10.1.232.31)[0m [2023-06-24 13:27:38,528 I 6478 6478] logging.cc:230: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1


[2m[36m(PredictCallable pid=6478, ip=10.1.232.31)[0m __init__
[2m[36m(PredictCallable pid=6478, ip=10.1.232.31)[0m model loaded successfully in: 21.541340827941895s
[2m[36m(PredictCallable pid=6478, ip=10.1.232.31)[0m __call__
inference time: 72.03734827041626s


In [11]:
# First row, first column, addressed purely by position. The chained form
# `.iloc[0][0]` does an integer lookup on a label-indexed Series, which pandas
# has deprecated.
gen.iloc[0, 0]

'In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.\n\n"The most striking thing about the first herd of unicorns we discovered in the Peruvian rainforests is that the animals speak English, and no one knew that unicorns lived in the U.S. until that moment,” said Dr. Frank Z'

In [12]:
# Second request against the same actor: the model is already loaded, so this
# run does not include the one-off load seen by the first request.
future = actor.__call__.remote(pd.DataFrame([prompt], columns=["prompt"]))
# perf_counter is monotonic, which makes it the appropriate clock for intervals.
start = time.perf_counter()
gen = ray.get(future)
end = time.perf_counter()
print(f'inference time: {end-start}s')
# Purely positional access; avoids the deprecated integer lookup on a
# label-indexed Series that `.iloc[0][0]` performs.
gen.iloc[0, 0]

[2m[36m(PredictCallable pid=6478, ip=10.1.232.31)[0m __call__
inference time: 37.454341411590576s


'In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.\n\nThe valley is in the Puna de Atacama region of Chile, an area noted for its high atmospheric salt content, the researchers said, which makes it a perfect climate for the plants and animals in the area.\n\nThis particular valley was known to have'