In [None]:

# Install (or upgrade, -U) the anyscale package into the kernel's environment; -q keeps pip's output quiet.
%pip install -U anyscale -q
# Reload imported modules before each cell executes, so edits to local modules
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

import os
import ray
import datasets
from datasets import DatasetDict, load_dataset
import anyscale
from anyscale.llm.dataset import Dataset as AnyscaleDataset
from vllm import SamplingParams
import yaml
from rich import print
from src.utils import SYSTEM_CONTENT, to_llm_schema, get_dataset_file_path, update_datasets_in_fine_tuning_config, get_test_prompts, get_num_matches_and_mismatches
from src.vllm_util import LLMPredictor

# Initialize HF token
# The token is read from a file so it never has to be typed into the notebook.
HF_TOKEN_PATH = os.path.expanduser('~/default/.HF_TOKEN')
assert os.path.exists(HF_TOKEN_PATH), (
    'Please create ~/default/.HF_TOKEN with your Hugging Face token\n'
    'echo "your_token" > ~/default/.HF_TOKEN'
)
# Use a context manager so the file handle is closed right after reading.
with open(HF_TOKEN_PATH) as token_file:
    HF_TOKEN = token_file.read().strip()

# Expose the token to this process and, through runtime_env, to Ray.
os.environ['HF_TOKEN'] = HF_TOKEN
# Shut down first, presumably so that re-running this cell re-initializes Ray
# with the runtime_env below instead of reusing an existing session.
ray.shutdown()
ray.init(runtime_env={'env_vars': {'HF_TOKEN': HF_TOKEN}})

# Inject the token into the serve config of the Llama 3 model.
LLAMA_3_SERVE_CONFIG_PATH = 'deploy/services/model_config/meta-llama--Meta-Llama-3-8B-Instruct.yaml'
with open(LLAMA_3_SERVE_CONFIG_PATH) as f:
    config = yaml.safe_load(f)
# NOTE(review): this stores the token in plain text in the YAML file on disk;
# make sure that file is not committed or shared.
config['runtime_env']['env_vars']['HUGGING_FACE_HUB_TOKEN'] = HF_TOKEN
with open(LLAMA_3_SERVE_CONFIG_PATH, 'w') as f:
    yaml.safe_dump(config, f)

# Keep the notebook output short: no Ray Data / datasets progress output.
ray.data.DataContext.get_current().enable_progress_bars = False
ray.data.DataContext.get_current().print_on_execution_start = False
datasets.disable_progress_bars()

In [None]:
# Deploy Service
# Uses the Anyscale CLI to deploy the service described in deploy/services/serve.yaml.
# NOTE(review): presumably serve.yaml references the model config that the setup
# cell rewrote with the HF token — confirm.
!anyscale service deploy -f deploy/services/serve.yaml

# End-to-end LLM Workflows

In this guide, we'll learn how to run an end-to-end LLM workflow. We separate this into four steps:

1. **Data preprocessing**
2. **Fine-tuning**
3. **Serving**
4. **Evaluation**

**Objective**: Have an LLM convert unstructured text inputs about video games into structured text outputs.

## 0. Problem Statement

Imagine we are trying to convert an unstructured sentence into structured output. Take the problem statement below.

In [None]:
# Display SYSTEM_CONTENT, which the surrounding text presents as the problem statement.
# `print` here is rich's print (see the imports in the setup cell), not the builtin.
print(SYSTEM_CONTENT)

Let's first query a base model, Meta's Llama 3-8B model, to see how it performs on this task.

In [None]:
from src.utils import query

# Send one sample sentence to the base (not yet fine-tuned) Llama 3 8B Instruct model.
# The two adjacent string literals are concatenated into a single prompt.
response = query(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    prompt="Dirt: Showdown from 2012 is a sport racing game for the PlayStation, Xbox, PC "
    "rated E 10+ (for Everyone 10 and Older). It's not available on Steam, Linux, or Mac."
)
print(response)

Not great, right? It's slow and verbose. We were looking for an output like the one below:

```python
inform(
    name["Dirt: Showdown"],
    release_year[2012],
    esrb["E 10+ (for Everyone 10 and Older)"],
    genres["driving/racing", "sport"],
    platforms["PlayStation", "Xbox", "PC"],
    available_on_steam[False],
    has_linux_release[False],
    has_mac_release[False]
)
```

## 1. Data Preprocessing

We can use Ray Data and Anyscale Datasets to transform a dataset we have about video games (VIGGO) into an LLM conversation format (`system` / `user` / `assistant`) that the model can understand.

<img src="assets/data-overview.png" width=500>

### Dataset

In [None]:
# Load every split of the GEM/viggo dataset through Hugging Face `datasets`.
# NOTE(review): trust_remote_code=True allows the dataset repository's loading
# script to run locally — keep it only for sources you trust.
dataset: DatasetDict = load_dataset("GEM/viggo", trust_remote_code=True)  # type: ignore

def get_dataset(split: str) -> AnyscaleDataset:
    """Upload one split of the loaded dataset as an Anyscale dataset.

    The rows of ``dataset[split]`` are loaded into Ray Data and mapped through
    ``to_llm_schema``; the file path yielded by ``get_dataset_file_path`` is
    then uploaded under the name ``viggo/<split>``.

    Args:
        split: Key of the split to read from the module-level ``dataset``.

    Returns:
        The object returned by ``anyscale.llm.dataset.upload``.
    """
    converted_rows = ray.data.from_items(dataset[split]).map(to_llm_schema)
    upload_name = f"viggo/{split}"
    # Upload while the context is open; presumably the yielded path is only
    # valid inside it — TODO confirm in src.utils.
    with get_dataset_file_path(converted_rows) as local_path:
        uploaded = anyscale.llm.dataset.upload(local_path, name=upload_name)
    return uploaded

# Convert and upload the dataset's existing train, validation, and test splits (in that order)
train_dataset, val_dataset, test_dataset = (
    get_dataset(split_name) for split_name in ("train", "validation", "test")
)

## 2. Fine-tuning

Next, we'll fine-tune a large language model (LLM) using our dataset with LLMForge, Ray Train, and an Anyscale Job.

We'll be fine-tuning Meta's Llama 3-8B model, which is the model we queried in the problem statement.

<img src="assets/train-overview.png" width=500>

In [None]:
from anyscale.job import JobConfig

# Point the fine-tuning config at the uploaded train/validation datasets
# (presumably rewrites the YAML file in place — confirm in src.utils).
update_datasets_in_fine_tuning_config("configs/training/lora/llama-3-8b.yaml", train_dataset, val_dataset)
# Build the job definition from deploy/jobs/ft.yaml and submit it as an Anyscale Job;
# the returned id is kept in job_id.
job_config = JobConfig.from_yaml("deploy/jobs/ft.yaml")
job_id = anyscale.job.submit(job_config)  # type: ignore

## 3. Serving

Now, let's query our fine-tuned model. Our fine-tuned model is hosted on an Anyscale Service that uses RayLLM and Ray Serve.

<img src="assets/online-overview.png" width=500>

In [None]:
# NOTE(review): this hardcoded id replaces the job_id returned by the submit cell,
# presumably pointing at an already finished fine-tuning run — swap in your own
# job id to query the model you just trained.
job_id = "prodjob_lgcmhahdme45fc4hbyah82m6a7"  # e2e-llm-workflows

# Look up the model produced by that job and send it the same sentence that was
# sent to the base model earlier.
fine_tuned_model = anyscale.llm.model.get(job_id=job_id)  # type: ignore
response = query(
    fine_tuned_model.id,
    prompt="Dirt: Showdown from 2012 is a sport racing game for the PlayStation, Xbox, PC "
    "rated E 10+ (for Everyone 10 and Older). It's not available on Steam, Linux, or Mac."
)
print(response)

```python
inform(
    name["Dirt: Showdown"],
    release_year[2012],
    esrb["E 10+ (for Everyone 10 and Older)"],
    genres["driving/racing", "sport"],
    platforms["PlayStation", "Xbox", "PC"],
    available_on_steam[False],
    has_linux_release[False],
    has_mac_release[False]
)
```

See how much better the output is?

## 4. Evaluation

Finally, we can evaluate our fine-tuned LLM to see how well it did. We'll perform batch inference using vLLM and Ray Data to see the percentage of exact matches.

<img src="assets/offline-overview.png" width=500>

In [None]:
# Batch inference will take ~4 minutes
# Build the prompts for the test dataset and wrap them in a Ray Data dataset.
test_prompts = get_test_prompts(fine_tuned_model, test_dataset)
test_prompts_ds = ray.data.from_items(test_prompts)

# Fine-tuned model
# LLMPredictor is passed as a class, so Ray Data constructs it with
# fn_constructor_kwargs; take_all() collects every prediction into a list.
ft_pred = test_prompts_ds.map_batches(
    LLMPredictor,
    concurrency=4,  # number of LLM instances
    num_gpus=1,     # GPUs per LLM instance
    batch_size=10,  # maximize until OOM, if OOM then decrease batch_size
    fn_constructor_kwargs={
        'fine_tuned_model': fine_tuned_model,
        # temperature=0 selects greedy decoding in vLLM
        'sampling_params': SamplingParams(temperature=0, max_tokens=2048),
    },
    accelerator_type='A10G',  # A10G or L4
).take_all()

# Accuracy = # of exact matches / # of predictions
num_matches, mismatches = get_num_matches_and_mismatches(ft_pred)

# Report the measured accuracy instead of a fixed number.
# Assumes num_matches is a count over ft_pred — TODO confirm in src.utils.
exact_match_pct = 100 * num_matches / len(ft_pred) if ft_pred else 0.0
print(f"Percentage of exact matches: {exact_match_pct:.2f}%")

# Show the first mismatch as an example; skipped when every prediction matched.
if mismatches:
    print("Expected: ", mismatches[0]["expected_output"][0]["content"])
    print("Actual:", mismatches[0]["generated_text"])

Expected:  request(specifier[weirdest])
Actual: request(specifier[weird])


## End-to-End Integration

<img src="assets/ai-platform.png" width=650>

## Next steps

We have a lot more guides that address more nuanced use cases:

Fine-tuning:
- [Control over 50+ hyperparameters](https://docs.anyscale.com/llms/finetuning/guides/modify_hyperparams/)
- [Fine-tune any HF model](https://docs.anyscale.com/llms/finetuning/guides/bring_any_hf_model/)
- [Full-parameter or LoRA fine-tuning](https://docs.anyscale.com/llms/finetuning/guides/lora_vs_full_param/)
- [Classification fine-tuning / Routing](https://www.anyscale.com/blog/building-an-llm-router-for-high-quality-and-cost-effective-responses)
- [Function calling fine-tuning](https://github.com/anyscale/templates/blob/main/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb)
- [Longer context fine-tuning](https://www.anyscale.com/blog/fine-tuning-llms-for-longer-context-and-better-rag-systems)
- [Continued fine-tuning from checkpoint](https://github.com/anyscale/templates/tree/main/templates/fine-tune-llm_v2/cookbooks/continue_from_checkpoint)
- Training on more available hardware (e.g., A10s) with model parallelism
- [End-to-end LLM workflows (including batch data processing, batch inference)](https://www.anyscale.com/blog/end-to-end-llm-workflows-guide)
- Distillation (Coming in <2 weeks)

Serving:
- [Deploy with autoscaling + optimize for latency vs. throughput](https://docs.anyscale.com/examples/deploy-llms/)
- [Serving multiple LoRA adapters](https://docs.anyscale.com/llms/serving/guides/multi_lora/)
- [Migration from OpenAI](https://docs.anyscale.com/llms/serving/guides/openai_to_oss/)
- [Spot to on-demand fallback (and vice versa)](https://docs.anyscale.com/1.0.0/configure/compute-configs/ondemand-to-spot-fallback/)
- [Batch inference with vLLM](https://docs.anyscale.com/examples/batch-llm/)

And more!
- [Batch text embeddings with Ray Data](https://github.com/anyscale/templates/tree/main/templates/text-embeddings)
- [Production RAG applications](https://www.anyscale.com/blog/a-comprehensive-guide-for-building-rag-based-llm-applications-part-1)
- [Router](https://github.com/anyscale/llm-router) between different models (base, fine-tuned, closed-source) to optimize for cost and quality
- Stable diffusion [fine-tuning](https://github.com/anyscale/templates/tree/main/templates/fine-tune-stable-diffusion) and [serving](https://github.com/anyscale/templates/tree/main/templates/serve-stable-diffusion)

And if you're interested in using our hosted Anyscale or connecting it to your own cloud, reach out to us at [Anyscale](https://www.anyscale.com/get-started?utm_source=goku). And follow us on [Twitter](https://x.com/anyscalecompute) and [LinkedIn](https://www.linkedin.com/company/joinanyscale/) for more real-time updates on new features!

## Clean up

In [None]:
# Clean up
# Project script; presumably resets the notebook's cell execution counts — see src/clear_cell_nums.py.
!python src/clear_cell_nums.py
# Remove notebook checkpoints and Python bytecode caches.
# Matching by exact name (instead of grepping for substrings) avoids deleting
# unrelated paths that merely contain ".pyc"/".pyo", and NUL-delimited paths
# keep file names with spaces intact.
!find . -type d -name ".ipynb_checkpoints" -prune -print0 | xargs -0 rm -rf
!find . \( -name "__pycache__" -o -name "*.pyc" -o -name "*.pyo" \) -prune -print0 | xargs -0 rm -rf
# NOTE(review): this also deletes .HF_TOKEN in the current directory and
# deploy/services, whose model config the setup cell reads and rewrites —
# restore them before re-running the notebook.
!rm -rf __pycache__ data .HF_TOKEN deploy/services