In [None]:

%pip install -U anyscale -q
%load_ext autoreload
%autoreload 2

import os
import ray
import datasets
from datasets import DatasetDict, load_dataset
import anyscale
from anyscale.llm.dataset import Dataset as AnyscaleDataset
import yaml
from rich import print
from src.utils import SYSTEM_CONTENT, to_schema, get_dataset_file_path, update_datasets_in_fine_tuning_config

# Initialize HF token
# The token is read from a file so it never has to be hardcoded in the notebook.
HF_TOKEN_PATH = os.path.expanduser('~/default/.HF_TOKEN')
assert os.path.exists(HF_TOKEN_PATH), (
    'Please create ~/default/.HF_TOKEN with your Hugging Face token\n'
    'echo "your_token" > ~/default/.HF_TOKEN'
)
# Use a context manager so the file handle is closed instead of leaked.
with open(HF_TOKEN_PATH) as token_file:
    HF_TOKEN = token_file.read().strip()

# Expose the token to this process and to Ray via the runtime environment.
os.environ['HF_TOKEN'] = HF_TOKEN
ray.shutdown()  # drop any existing Ray connection before re-initializing with the new runtime_env
ray.init(runtime_env={'env_vars': {'HF_TOKEN': HF_TOKEN}})

# Inject the token into the serve config used by the service deployment.
# NOTE(review): this writes the secret into a YAML file on disk; make sure that
# file is never committed or shared.
LLAMA_3_SERVE_CONFIG_PATH = 'deploy/services/model_config/meta-llama--Meta-Llama-3-8B-Instruct.yaml'
with open(LLAMA_3_SERVE_CONFIG_PATH) as f:
    config = yaml.safe_load(f)
# setdefault keeps existing sections untouched and creates them when they are missing,
# instead of failing with a KeyError.
config.setdefault('runtime_env', {}).setdefault('env_vars', {})['HUGGING_FACE_HUB_TOKEN'] = HF_TOKEN
with open(LLAMA_3_SERVE_CONFIG_PATH, 'w') as f:
    yaml.safe_dump(config, f)


# Keep notebook output compact: turn off Ray Data and Hugging Face datasets progress output.
ray.data.DataContext.get_current().enable_progress_bars = False
ray.data.DataContext.get_current().print_on_execution_start = False
datasets.disable_progress_bars()

In [None]:
# Deploy the Anyscale service described by deploy/services/serve.yaml
!anyscale service deploy -f deploy/services/serve.yaml

# End-to-end LLM Workflows

In this guide, we'll learn how to run an end-to-end LLM workflow. We separate this into four steps:

1. **Data preprocessing**
2. **Fine-tuning**
3. **Serving**
4. **(Optional) Evaluation**

**Objective**: Have an LLM convert unstructured text inputs about video games into structured text outputs.

## 1. Data Preprocessing

In [None]:
# Display SYSTEM_CONTENT (imported from src.utils).
# NOTE(review): presumably the system message used when prompting the model — confirm in src/utils.py
print(SYSTEM_CONTENT)

In [None]:
from src.utils import query

# Send one sample video-game description to the base Llama 3 model and show the reply.
sample_description = (
    "Dirt: Showdown from 2012 is a sport racing game for the PlayStation, Xbox, PC "
    "rated E 10+ (for Everyone 10 and Older). It's not available on Steam, Linux, or Mac."
)
response = query("meta-llama/Meta-Llama-3-8B-Instruct", prompt=sample_description)
print(response)

```python
inform(
    name["Dirt: Showdown"],
    release_year[2012],
    esrb["E 10+ (for Everyone 10 and Older)"],
    genres["driving/racing", "sport"],
    platforms["PlayStation", "Xbox", "PC"],
    available_on_steam[False],
    has_linux_release[False],
    has_mac_release[False]
)
```

With Ray, we can use batch processing to preprocess our dataset at scale.

<img src="assets/data-overview.png" width=500>

### Dataset

In [None]:
# Load every split of the GEM/viggo dataset.
# NOTE(review): trust_remote_code=True allows the dataset's own loading script to run locally — only keep it for sources you trust.
dataset: DatasetDict = load_dataset("GEM/viggo", trust_remote_code=True)  # type: ignore

def get_dataset(split: str) -> AnyscaleDataset:
    """Convert one split of the loaded dataset with `to_schema` and upload it.

    Args:
        split: Key of the split to read from the module-level `dataset`.

    Returns:
        The object returned by `anyscale.llm.dataset.upload`, uploaded under
        the name `viggo/<split>`.
    """
    split_rows = ray.data.from_items(dataset[split])
    converted_rows = split_rows.map(to_schema)
    # The path yielded by the helper is only used while the `with` block is open.
    with get_dataset_file_path(converted_rows) as local_path:
        uploaded = anyscale.llm.dataset.upload(local_path, name=f"viggo/{split}")
    return uploaded

# Convert and upload each existing split (train, validation, test) as its own Anyscale dataset
train_dataset = get_dataset("train")
val_dataset = get_dataset("validation")
test_dataset = get_dataset("test")

### Data Preprocessing

We'll use [Ray Data](https://docs.ray.io/) to load our dataset and apply preprocessing to batches of our data at scale.

We want to preprocess our data by converting it into the `system` / `user` / `assistant` conversation format that our LLM will recognize.

To apply our function on our dataset at scale, we can pass it to [`ray.data.Dataset.map`](https://docs.ray.io/en/latest/data/api/doc/ray.data.Dataset.map.html).

<img src="assets/data-detailed.png" width=800>


### Save and load data

Let's save the LLM dataset splits to the cloud for future access. We can do this with an Anyscale Dataset.

## 2. Fine-tuning

Next, let's fine-tune a large language model (LLM) using our dataset from the previous data preprocessing step. We'll be fine-tuning Meta's Llama 3, 8B model.

<img src="assets/train-overview.png" width=500>

### Configurations

Anyscale provides a set of configurations for fine-tuning LLMs. Let's choose the default configuration, but load our dataset from the previous step.

### Fine-tuning

Now, let's kick off fine-tuning with an Anyscale job. By fine-tuning with [`llmforge`](https://docs.anyscale.com/llms/finetuning/intro/) and Ray Train, we'll be able to execute fine-tuning in parallel across multiple devices.

<img src="assets/train-detailed.png" width=550>

In [None]:
from anyscale.job import JobConfig

# Pass the uploaded train/validation datasets to the helper for the LoRA training config.
# NOTE(review): presumably this rewrites the dataset entries in the YAML file in place — confirm in src/utils.py
update_datasets_in_fine_tuning_config("configs/training/lora/llama-3-8b.yaml", train_dataset, val_dataset)
# Load the job definition and submit it; the returned ID is used to look the job up later.
# NOTE(review): nothing in this cell waits for the job to finish — confirm it has completed before loading artifacts
job_config = JobConfig.from_yaml("deploy/jobs/ft.yaml")
job_id = anyscale.job.submit(job_config)  # type: ignore

### Load artifacts

Once fine-tuning is complete, we can load info about the fine-tuned model, and download its model artifacts locally.

In [None]:
from src.utils import download_files_from_remote

# NOTE(review): this hardcoded job ID replaces the job_id returned by anyscale.job.submit —
# substitute your own job's ID (or remove this line) when running in a different account.
job_id = "prodjob_lgcmhahdme45fc4hbyah82m6a7"
# Look up the model info associated with that fine-tuning job
model_info = anyscale.llm.model.get(job_id=job_id)  # type: ignore
print(model_info)

# Download artifacts
local_dir = f'/mnt/cluster_storage/{model_info.id}'  # Storage accessible by head and worker nodes
download_files_from_remote(model_info.storage_uri, local_dir)

## 3. Serving

For model serving, we'll launch an Anyscale service. With `rayllm` and Ray Serve, our service can autoscale to meet any demand.

<img src="assets/online-overview.png" width=500>

Let's also define a function to query our LLM service with.

### Production service

In [None]:
job_id = "prodjob_lgcmhahdme45fc4hbyah82m6a7"  # e2e-llm-workflows
fine_tuned_model = anyscale.llm.model.get(job_id=job_id)  # type: ignore

# Send the same sample description as before, this time to the fine-tuned model.
game_description = (
    "Dirt: Showdown from 2012 is a sport racing game for the PlayStation, Xbox, PC "
    "rated E 10+ (for Everyone 10 and Older). It's not available on Steam, Linux, or Mac."
)
response = query(fine_tuned_model.id, prompt=game_description)
print(response)

```python
inform(
    name["Dirt: Showdown"],
    release_year[2012],
    esrb["E 10+ (for Everyone 10 and Older)"],
    genres["driving/racing", "sport"],
    platforms["PlayStation", "Xbox", "PC"],
    available_on_steam[False],
    has_linux_release[False],
    has_mac_release[False]
)
```

## Dev → Prod

We've now served our model into production via [Anyscale Services](https://docs.anyscale.com/examples/intro-services/), but we can just as easily productionize our other workloads with [Anyscale Jobs](https://docs.anyscale.com/examples/intro-jobs/) (like we did for fine-tuning above) to execute this entire workflow completely programmatically outside of Workspaces.

<img src="assets/jobs.png" width=650>

For example, suppose that we want to preprocess batches of new incoming data, fine-tune a model, evaluate it and then compare it to the existing production version. All of this can be productionized by simply launching the workload as a [Job](https://docs.anyscale.com/examples/intro-jobs), which can be triggered manually, periodically (cron) or event-based (via webhooks, etc.). We also provide integrations with your platform/tools to make all of this connect with your existing production workflows.

<img src="assets/ai-platform.png" width=650>

## 4. (Optional) Evaluation

We can evaluate our fine-tuned LLM to see how well it performs on our task. We'll start by performing offline batch inference where we will use our fine-tuned model to generate the outputs.

<img src="assets/offline-overview.png" width=500>

### Load test data

In [None]:
# Load test set for eval
# Read the uploaded test split back from its storage URI as JSON records
ft_test_ds = ray.data.read_json(test_dataset.storage_uri)
test_data = ft_test_ds.take_all()  # pull every row into local memory
test_data[0]  # preview the first record

In [None]:
# Separate into inputs/outputs
# Inputs keep every non-assistant message; outputs keep only the assistant messages.
test_inputs = [
    [msg for msg in record['messages'] if msg['role'] != 'assistant']
    for record in test_data
]
test_outputs = [
    [msg for msg in record['messages'] if msg['role'] == 'assistant']
    for record in test_data
]

### Tokenizer

We'll also load the appropriate tokenizer to apply to our input data.

In [None]:
from transformers import AutoTokenizer

# Model and tokenizer
# Tokenizer of the base model; it is used later to apply the chat template to the test inputs
HF_MODEL = 'meta-llama/Meta-Llama-3-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)

### Chat template

When we fine-tuned our model, special tokens (e.g., beginning/end of text) were automatically added to our inputs. We want to apply the same special tokens to our inputs prior to generating outputs using our tuned model. Luckily, the chat template to apply to our inputs (and add those tokens) is readily available inside our tuned model's `tokenizer_config.json` file. We can use our tokenizer to apply this template to our inputs.

In [None]:
import json

In [None]:
# Extract chat template used during fine-tuning
# The template is read from tokenizer_config.json inside the downloaded artifact directory.
tokenizer_config_path = os.path.join(local_dir, 'tokenizer_config.json')
with open(tokenizer_config_path) as config_file:
    tokenizer_config = json.load(config_file)
chat_template = tokenizer_config['chat_template']
print(chat_template)

In [None]:
# Apply chat template
# Render every conversation into a single prompt string with the fine-tuning chat template,
# and keep the expected assistant messages next to it for evaluation.
# With tokenize=False the call returns text, so `return_tensors` (which only affects
# tokenized output) is intentionally not passed.
test_input_prompts = [
    {
        'inputs': tokenizer.apply_chat_template(
            conversation=inputs,
            chat_template=chat_template,
            add_generation_prompt=True,  # end the prompt where the assistant's reply should begin
            tokenize=False,
        ),
        'outputs': outputs,
    }
    for inputs, outputs in zip(test_inputs, test_outputs)
]
test_input_prompts_ds = ray.data.from_items(test_input_prompts)
print(test_input_prompts_ds.take(1))

### Batch inference

We will use [vLLM](https://github.com/vllm-project/vllm)'s offline LLM class to load the model and use it for inference. We can easily load our LoRA weights and merge them with the base model (just pass in `lora_path`). And we'll wrap all of this functionality in a class that we can pass to [ray.data.Dataset.map_batches](https://docs.ray.io/en/latest/data/api/doc/ray.data.Dataset.map_batches.html) to apply batch inference at scale.

<img src="assets/offline-detailed.png" width=750>

In [None]:
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

In [None]:
class LLMPredictor:
    """Callable that wraps a vLLM `LLM` so it can be used with `map_batches`."""

    def __init__(self, hf_model, sampling_params, lora_path=None):
        """Create the LLM once per instance.

        Args:
            hf_model: Model identifier passed to `LLM(model=...)`.
            sampling_params: Sampling parameters forwarded to every `generate` call.
            lora_path: Optional path to LoRA weights; LoRA support is enabled
                only when this value is truthy.
        """
        self.lora_path = lora_path
        self.sampling_params = sampling_params
        self.llm = LLM(model=hf_model, enable_lora=bool(lora_path))

    def __call__(self, batch):
        """Generate text for one batch.

        Args:
            batch: Mapping with an 'inputs' entry (the prompts) and an
                'outputs' entry (the expected outputs, passed through unchanged).

        Returns:
            Dict with the keys 'prompt', 'expected_output' and 'generated_text'.
        """
        generate_kwargs = {
            'prompts': batch['inputs'],
            'sampling_params': self.sampling_params,
        }
        # Attach the LoRA request only when a LoRA path was configured.
        if self.lora_path:
            generate_kwargs['lora_request'] = LoRARequest('lora_adapter', 1, self.lora_path)
        results = self.llm.generate(**generate_kwargs)

        prompts = [result.prompt for result in results]
        # Join all completions returned for a prompt into one space-separated string.
        generations = [
            ' '.join([completion.text for completion in result.outputs])
            for result in results
        ]
        return {
            'prompt': prompts,
            'expected_output': batch['outputs'],
            'generated_text': generations,
        }

In the data preprocessing step, we used the default compute strategy with `map`. This time, we'll specify a custom compute strategy (`concurrency`, `num_gpus`, `batch_size` and `accelerator_type`) when calling `map_batches`.

In [None]:
# Fine-tuned model = base model (hf_model) plus the LoRA weights stored in local_dir
hf_model = 'meta-llama/Meta-Llama-3-8B-Instruct'
# temperature=0 and a cap of 2048 generated tokens per prompt
sampling_params = SamplingParams(temperature=0, max_tokens=2048)
# LLMPredictor is passed as a class; fn_constructor_kwargs supplies its constructor arguments
ft_pred_ds = test_input_prompts_ds.map_batches(
    LLMPredictor,
    concurrency=4,  # number of LLM instances
    num_gpus=1,     # GPUs per LLM instance
    batch_size=10,  # maximize until OOM, if OOM then decrease batch_size
    fn_constructor_kwargs={
        'hf_model': hf_model,
        'sampling_params': sampling_params,
        'lora_path': local_dir,
    },
    accelerator_type='A10G',  # A10G or L4
)

In [None]:
# Batch inference will take ~4 minutes
ft_pred = ft_pred_ds.take_all()  # collect every prediction record locally
ft_pred[3]  # spot-check a single prediction record

### Evaluation

In [None]:
# Exact match (strict!)
# A prediction counts as a match only when the text generated before the first
# '<|eot_id|>' marker equals the content of the first expected message exactly.
mismatches = [
    pred for pred in ft_pred
    if pred['expected_output'][0]['content'] != pred['generated_text'].split('<|eot_id|>')[0]
]
matches = len(ft_pred) - len(mismatches)
matches / float(len(ft_pred))

**Note**: you can train for more epochs (`num_epochs: 10`) to further improve the performance.

Even our mismatches are not too far off, and they are sometimes worth a closer look because the dataset itself might contain a few errors that the model may have identified.

In [None]:
# Inspect the first two mismatched predictions
mismatches[0:2]

## Clean up

In [None]:
# Clean up
!python src/clear_cell_nums.py
!find . | grep -E ".ipynb_checkpoints" | xargs rm -rf
!find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf
!rm -rf __pycache__ data .HF_TOKEN deploy/services

## Next steps

We have a lot more guides that address more nuanced use cases:

Fine-tuning:
- [Control over 50+ hyperparameters](https://docs.anyscale.com/llms/finetuning/guides/modify_hyperparams/)
- [Fine-tune any HF model](https://docs.anyscale.com/llms/finetuning/guides/bring_any_hf_model/)
- [Full-parameter or LoRA fine-tuning](https://docs.anyscale.com/llms/finetuning/guides/lora_vs_full_param/)
- [Classification fine-tuning / Routing](https://www.anyscale.com/blog/building-an-llm-router-for-high-quality-and-cost-effective-responses)
- [Function calling fine-tuning](https://github.com/anyscale/templates/blob/main/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb)
- [Longer context fine-tuning](https://www.anyscale.com/blog/fine-tuning-llms-for-longer-context-and-better-rag-systems)
- [Continued fine-tuning from checkpoint](https://github.com/anyscale/templates/tree/main/templates/fine-tune-llm_v2/cookbooks/continue_from_checkpoint)
- Training on more available hardware (ex. A10s) with model parallelism
- [End-to-end LLM workflows (including batch data processing, batch inference)](https://www.anyscale.com/blog/end-to-end-llm-workflows-guide)
- Distillation (Coming in <2 weeks)

Serving:
- [Deploy with autoscaling + optimize for latency vs. throughput](https://docs.anyscale.com/examples/deploy-llms/)
- [Serving multiple LoRA adapters](https://docs.anyscale.com/llms/serving/guides/multi_lora/)
- [Migration from OpenAI](https://docs.anyscale.com/llms/serving/guides/openai_to_oss/)
- [Spot to on-demand fallback (and vice versa)](https://docs.anyscale.com/1.0.0/configure/compute-configs/ondemand-to-spot-fallback/)
- [Batch inference with vLLM](https://docs.anyscale.com/examples/batch-llm/)

And more!
- [Batch text embeddings with Ray data](https://github.com/anyscale/templates/tree/main/templates/text-embeddings)
- [Production RAG applications](https://www.anyscale.com/blog/a-comprehensive-guide-for-building-rag-based-llm-applications-part-1)
- [Router](https://github.com/anyscale/llm-router) between different models (base, fine-tuned, closed-source) to optimize for cost and quality
- Stable diffusion [fine-tuning](https://github.com/anyscale/templates/tree/main/templates/fine-tune-stable-diffusion) and [serving](https://github.com/anyscale/templates/tree/main/templates/serve-stable-diffusion)

And if you're interested in using our hosted Anyscale or connecting it to your own cloud, reach out to us at [Anyscale](https://www.anyscale.com/get-started?utm_source=goku). And follow us on [Twitter](https://x.com/anyscalecompute) and [LinkedIn](https://www.linkedin.com/company/joinanyscale/) for more real-time updates on new features!