In [1]:
from rayllm_batch.workload import ChatWorkloadBase
from typing import Optional, Dict, Any
import ray 
from ray.data.dataset import Dataset
from dataclasses import dataclass, field


@dataclass
class CNNDailySummary(ChatWorkloadBase):
    """Summarization workload built on the CNN/DailyMail dataset.

    Every article is converted into a two-message chat prompt (a system
    message plus a user message) asking the model to summarize the
    highlights of the article.
    """

    # No local dataset file is used; `load_dataset` pulls the data directly
    # from Hugging Face.
    dataset_file: Optional[str] = None
    # Only a small portion of the dataset is used so that inference finishes
    # quickly for the tutorial.
    dataset_fraction: float = 0.0005  # 0.05% of the ~300K entries (~150 rows).
    # The sampling params for the LLM inference workload.
    sampling_params: Dict[str, Any] = field(default_factory=lambda: {"max_tokens": 200})

    def load_dataset(self) -> Dataset:
        """Load the CNN/DailyMail ``train`` split into a Ray Dataset.

        Returns:
            A Ray ``Dataset`` created from the Hugging Face ``train`` split
            of ``cnn_dailymail`` (config ``3.0.0``).
        """
        # Imported here so `datasets` is only needed when data is loaded.
        import datasets  # type: ignore

        # Ask for the one split that is used rather than building every
        # split and indexing into the resulting DatasetDict afterwards.
        train_split = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train")
        return ray.data.from_huggingface(train_split)

    def parse_row(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """Turn one dataset row into the chat-message format used for inference.

        Args:
            row: A dataset row; its ``"article"`` value is embedded in the
                user prompt.

        Returns:
            A dict with a single ``"messages"`` key holding the system and
            user messages.
        """
        # The system prompt asks for a summary; the article itself is sent as
        # the user prompt, followed by the instructions.
        return {
            "messages": [
                {
                    "role": "system",
                    "content": "You are a commentator. Your task is to "
                    "summarize highlights from article.",
                },
                {
                    "role": "user",
                    "content": f"# Article:\n{row['article']}\n\n"
                    "#Instructions:\nIn clear and concise language, "
                    "summarize the highlights presented in the article.",
                },
            ]
        }

In [2]:
# Ask for a Hugging Face token in case the model requires one.
from util.utils import prompt_for_hugging_face_token

HF_TOKEN = prompt_for_hugging_face_token(
    "meta-llama/Meta-Llama-3.1-8B-Instruct"
)

Successfully read cached token at huggingface_token.txt.


In [3]:
from rayllm_batch import init_engine_from_config

# Path of the model config file to load.
model_config_path = "configs/llama-3.1-8b-a10g.yaml"

# Engine config values can be overridden by passing in a dictionary. Here the
# override extends Ray's runtime env with the Hugging Face token; Ray is used
# under the hood to orchestrate the inference pipeline.
override = {
    "runtime_env": {
        "env_vars": {"HF_TOKEN": HF_TOKEN},
    },
}
engine_config = init_engine_from_config(
    config=model_config_path,
    override=override,
)


In [4]:
from rayllm_batch import RayLLMBatch


workload = CNNDailySummary()
batch = RayLLMBatch(
    engine_cfg=engine_config,
    workload=workload,
    # Batch size used for inference. Make it as large as possible without
    # running out of memory; if out-of-memory errors occur, a smaller
    # batch_size may help.
    batch_size=None,
    # Number of replicas used for inference. Each replica runs one instance
    # of the inference pipeline.
    num_replicas=1,
)


# Runs the batch job until completion.
ds = batch.run()


# Collect the generated text of every result row and print it.
gen_texts = [row["generated_text"] for row in ds.take_all()]
print(gen_texts)

2024-09-24 00:44:00,222	INFO worker.py:1601 -- Connecting to existing Ray cluster at address: 10.0.27.195:6379...
2024-09-24 00:44:00,230	INFO worker.py:1777 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://session-5abw5nxh2chqdlhxiklnw6cu1k.i.anyscaleuserdata.com [39m[22m
2024-09-24 00:44:00,233	INFO packaging.py:359 -- Pushing file package 'gcs://_ray_pkg_0fa039546672a8668c01da562dca2994d8a900a9.zip' (0.65MiB) to Ray cluster...
2024-09-24 00:44:00,240	INFO packaging.py:372 -- Successfully pushed file package 'gcs://_ray_pkg_0fa039546672a8668c01da562dca2994d8a900a9.zip'.


Parquet Files Sample 0:   0%|          | 0.00/2.00 [00:00<?, ? file/s]

2024-09-24 00:44:05,980	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-24_00-28-42_051202_2516/logs/ray-data
2024-09-24 00:44:05,981	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches(random_sample)->Map(CNNDailySummary.parse_row)] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- ReadParquet->SplitBlocks(11) 1: 0.00 row [00:00, ? row/s]

- MapBatches(random_sample)->Map(CNNDailySummary.parse_row) 2: 0.00 row [00:00, ? row/s]

- limit=1 3: 0.00 row [00:00, ? row/s]

  ds = self.workload.load_dataset_and_process()
2024-09-24 00:44:13,738	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-24_00-28-42_051202_2516/logs/ray-data
2024-09-24 00:44:13,739	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches(random_sample)->Map(CNNDailySummary.parse_row)] -> AllToAllOperator[Repartition] -> ActorPoolMapOperator[MapBatches(ChatTemplateTokenizer)]


Running 0: 0.00 row [00:00, ? row/s]

- ReadParquet->SplitBlocks(11) 1: 0.00 row [00:00, ? row/s]

- MapBatches(random_sample)->Map(CNNDailySummary.parse_row) 2: 0.00 row [00:00, ? row/s]

- Repartition 3: 0.00 row [00:00, ? row/s]

Split Repartition 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- MapBatches(ChatTemplateTokenizer) 5: 0.00 row [00:00, ? row/s]

2024-09-24 00:44:21,002	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-24_00-28-42_051202_2516/logs/ray-data
2024-09-24 00:44:21,003	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Repartition]


Running 0: 0.00 row [00:00, ? row/s]

- Repartition 1: 0.00 row [00:00, ? row/s]

Split Repartition 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

2024-09-24 00:44:21,062	INFO dataset.py:2416 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2024-09-24 00:44:21,064	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-24_00-28-42_051202_2516/logs/ray-data
2024-09-24 00:44:21,065	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- Aggregate 1: 0.00 row [00:00, ? row/s]

Sort Sample 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Map 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 5: 0.00 row [00:00, ? row/s]

2024-09-24 00:44:21,145	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-24_00-28-42_051202_2516/logs/ray-data
2024-09-24 00:44:21,146	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- Aggregate 1: 0.00 row [00:00, ? row/s]

Sort Sample 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Map 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 5: 0.00 row [00:00, ? row/s]

2024-09-24 00:44:21,211	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-24_00-28-42_051202_2516/logs/ray-data
2024-09-24 00:44:21,212	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- Aggregate 1: 0.00 row [00:00, ? row/s]

Sort Sample 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Map 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 5: 0.00 row [00:00, ? row/s]

2024-09-24 00:44:21,284	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-24_00-28-42_051202_2516/logs/ray-data
2024-09-24 00:44:21,285	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- Aggregate 1: 0.00 row [00:00, ? row/s]

Sort Sample 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Map 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 5: 0.00 row [00:00, ? row/s]

#Requests: 129 (1 partitions), Avg Prompt Tokens: 870.75, Max Prompt Tokens: 2180
2024-09-24 00:44:21,354	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-24_00-28-42_051202_2516/logs/ray-data
2024-09-24 00:44:21,354	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> ActorPoolMapOperator[MapBatches(AsyncLLMPredictor)] -> ActorPoolMapOperator[MapBatches(Detokenizer)]


Running 0: 0.00 row [00:00, ? row/s]

[36m(_MapWorker pid=29524)[0m GPU memory used (GB): 0=0.50
[36m(_MapWorker pid=29524)[0m Done waiting for free GPU memory on devices [0] (0.98 GB) 0.01 s
[36m(_MapWorker pid=29524)[0m Max pending requests is set to 71


[36m(_MapWorker pid=29524)[0m INFO 09-24 00:44:25 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post1) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2380, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Meta-Llama-3.1-8B-Instruct, use_v2_block_manager=

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:02,  1.39it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.19it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.57it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.47it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.43it/s]
[36m(_MapWorker pid=29524)[0m 


[36m(_MapWorker pid=29524)[0m INFO 09-24 00:44:30 model_runner.py:1008] Loading model weights took 14.9888 GB
[36m(_MapWorker pid=29524)[0m INFO 09-24 00:44:31 gpu_executor.py:122] # GPU blocks: 2170, # CPU blocks: 2048
[36m(_MapWorker pid=29524)[0m INFO 09-24 00:44:34 model_runner.py:1309] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
[36m(_MapWorker pid=29524)[0m INFO 09-24 00:44:34 model_runner.py:1313] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
[36m(_MapWorker pid=29524)[0m INFO 09-24 00:44:39 model_runner.py:1428] Graph capturing finished in 4 secs.


- MapBatches(AsyncLLMPredictor) 1: 0.00 row [00:00, ? row/s]

- MapBatches(Detokenizer) 2: 0.00 row [00:00, ? row/s]

[36m(MapWorker(MapBatches(AsyncLLMPredictor)) pid=29524)[0m INFO 09-24 00:44:44 metrics.py:351] Avg prompt throughput: 764.3 tokens/s, Avg generation throughput: 1.0 tokens/s, Running: 7 reqs, Swapped: 0 reqs, Pending: 64 reqs, GPU KV cache usage: 17.1%, CPU KV cache usage: 0.0%.
[36m(MapWorker(MapBatches(AsyncLLMPredictor)) pid=29524)[0m INFO 09-24 00:44:49 metrics.py:351] Avg prompt throughput: 4107.7 tokens/s, Avg generation throughput: 5.1 tokens/s, Running: 35 reqs, Swapped: 0 reqs, Pending: 36 reqs, GPU KV cache usage: 80.8%, CPU KV cache usage: 0.0%.
[36m(MapWorker(MapBatches(AsyncLLMPredictor)) pid=29524)[0m INFO 09-24 00:44:54 metrics.py:351] Avg prompt throughput: 1490.4 tokens/s, Avg generation throughput: 534.3 tokens/s, Running: 40 reqs, Swapped: 0 reqs, Pending: 31 reqs, GPU KV cache usage: 98.7%, CPU KV cache usage: 0.0%.
[36m(MapWorker(MapBatches(AsyncLLMPredictor)) pid=29524)[0m INFO 09-24 00:44:59 metrics.py:351] Avg prompt throughput: 564.1 tokens/s, Avg gene

[36m(MapWorker(MapBatches(AsyncLLMPredictor)) pid=29524)[0m Elapsed time for batch a1a53c42d81448b2be2e964a87a3a6c1 with size 129: 67.55436357899987
Total elapsed time: 111.13s
2024-09-24 00:45:51,385	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-24_00-28-42_051202_2516/logs/ray-data
2024-09-24 00:45:51,386	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- Aggregate 1: 0.00 row [00:00, ? row/s]

Sort Sample 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Map 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 5: 0.00 row [00:00, ? row/s]

2024-09-24 00:45:51,451	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-24_00-28-42_051202_2516/logs/ray-data
2024-09-24 00:45:51,452	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- Aggregate 1: 0.00 row [00:00, ? row/s]

Sort Sample 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Map 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 5: 0.00 row [00:00, ? row/s]

Total tokens processed: 134842
Engine throughput (tokens/s): 1993.59
Projected 1M token time (mins): 9.08
2024-09-24 00:45:51,517	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-24_00-28-42_051202_2516/logs/ray-data
2024-09-24 00:45:51,518	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- limit=1 1: 0.00 row [00:00, ? row/s]

"In Kabul, Afghanistan, over 1,000 bottles of alcohol were destroyed as part of a crackdown on smuggling and sales. The bottles were confiscated from Afghan sources over a two-year period, mostly from international hotels and handed over to customs officials who burnt them. This movemade facilitates enforcement of Afghanistan's ban on alcohol due to its largely Muslim population, with only a few specifically designated areas allowing its sale for foreigners."

[36m(autoscaler +4m31s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.


In [7]:
# Gather the generated text from every row of the result dataset.
gen_texts = [row["generated_text"] for row in ds.take_all()]

In [8]:
# Show the collected generations as this cell's output.
gen_texts

["In Kabul, Afghanistan, over 1,000 bottles of alcohol were destroyed as part of a crackdown on smuggling and sales. The bottles were confiscated from Afghan sources over a two-year period, mostly from international hotels and handed over to customs officials who burnt them. This movemade facilitates enforcement of Afghanistan's ban on alcohol due to its largely Muslim population, with only a few specifically designated areas allowing its sale for foreigners.",
 'Here are the article\'s highlights summarized:\n\nPhotographer Eiko Jones was taking underwater pictures of plants near Campbell River in Canada when he was suddenly surrounded by thousands of tadpoles swimming at the Cedar Lake. The 41-year-old photographer was initially startled by the large group of tadpoles, which appeared as a "cloud" swimming around him for about ten minutes. The rare sighting was a unique experience for Jones, who stated that he had never seen or heard of tadpoles schooling before.',
 'Two high-ranking 