# GPT-J-6B Serving with Ray AIR

In this example, we showcase how to use Ray AIR for GPT-J serving (online inference). GPT-J is a GPT-2-like causal language model that was trained in our previous step.


In [None]:
! pip install "ray[air]" boto3 "ray"
! pip install "datasets" "evaluate" "accelerate==0.20.3" "transformers>=4.26.0" "torch>=1.12.0" "deepspeed==0.8.3"
! pip install -U protobuf==3.19.6 xgboost==1.3.3 xgboost-ray==0.1.15 pandas==1.5.3 tensorboard
! pip install pandas --upgrade

In [None]:
import os
import ray
import boto3
from ray import serve
from ray.serve.http_adapters import pandas_read_json
from ray.train.huggingface import TransformersPredictor
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Address of the remote Ray cluster this notebook connects to.
cluster_address = "ray://ray-cluster-serve-kuberay-head-svc.ray-cluster-serve.svc.cluster.local:10001"

# Packages requested through the pip runtime environment of this connection.
worker_pip_packages = [
    "datasets",
    "evaluate",
    # accelerate and transformers are pinned on purpose: the combination of
    # accelerate==0.19.0 and transformers==4.29.0 seems to have issues with
    # DeepSpeed process group initialization and results in a batch_size
    # validation problem.
    # TODO(jungong) : get rid of the pins once the issue is fixed.
    "accelerate==0.20.3",
    "transformers==4.26.0",
    "torch>=1.12.0",
]

ray.init(
    address=cluster_address,
    namespace="serve",
    runtime_env={"pip": worker_pip_packages},
)

In [None]:
# Start Serve on the connected cluster.
# NOTE(review): detached=True is presumably meant to keep the Serve instance
# alive independently of this notebook session -- confirm against the Ray
# Serve documentation for the installed Ray version.
serve.start(detached=True)

In [None]:
# S3 client plus the location of the checkpoint artifacts to fetch.
s3 = boto3.client("s3")
bucket = "fm-ops-datasets"

# All three artifacts live under the same checkpoint directory, so the shared
# prefix is spelled out once and each object key is derived from it.
checkpoint_prefix = (
    "checkpoints/TransformersTrainer_2023-09-05_12-25-24/"
    "TransformersTrainer_f638a_00000_0_2023-09-05_12-25-24/"
    "checkpoint_000000"
)
model_key = f"{checkpoint_prefix}/pytorch_model.bin"
tokenizer_key = f"{checkpoint_prefix}/tokenizer.json"
config_json_key = f"{checkpoint_prefix}/config.json"

In [None]:
# Create the local directory that receives the downloaded checkpoint files;
# exist_ok=True makes this cell safe to re-run.
os.makedirs("local_model", exist_ok=True)

In [None]:
# Download each checkpoint artifact from S3 into local_model/, in the order
# weights, tokenizer, model config.
for s3_key, local_name in (
    (model_key, "pytorch_model.bin"),
    (tokenizer_key, "tokenizer.json"),
    (config_json_key, "config.json"),
):
    s3.download_file(bucket, s3_key, f"local_model/{local_name}")

In [None]:
# Directory holding the files downloaded from S3.
model_dir = "local_model"

# Build the tokenizer and the causal LM from the local checkpoint files, then
# place the model on the GPU.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir).cuda()

In [None]:
# Close the Ray connection that was opened with ray.init() earlier in this
# notebook.
ray.shutdown()

In [None]:
# Tear down Serve: delete the "default" entry, then shut Serve down.
# NOTE(review): in notebook order this cell runs after ray.shutdown(); confirm
# that these Serve calls can still reach the intended cluster at that point.
serve.delete("default")
serve.shutdown()

In [None]:
from ray.job_submission import JobSubmissionClient

In [None]:
# Endpoint handed to the job submission client.
jobs_endpoint = "http://ray-cluster-serve-kuberay-head-svc.ray-cluster-serve.svc.cluster.local:8265"
ray_client = JobSubmissionClient(jobs_endpoint)

# Shell entrypoint executed by the job, assembled from two parts:
#   1. remove any previous checkout and clone the repository again
#      (a failing clone is tolerated because of `|| true`);
#   2. mark the serving script executable and run it with python.
fetch_repo_cmd = "rm -rf fm-ops-eks && git clone https://github.com/lusoal/fm-ops-eks || true;"
launch_script_cmd = "chmod +x fm-ops-eks/scripts/serve_gptj.py && python fm-ops-eks/scripts/serve_gptj.py"
ray_serving = fetch_repo_cmd + launch_script_cmd

# The job only requests boto3 through its pip runtime environment.
job_runtime_env = {"pip": ["boto3"]}

submission_id = ray_client.submit_job(
    entrypoint=ray_serving,
    runtime_env=job_runtime_env,
)
