# LLM Domain Adaptation with ORPO, AWS Trainium and AWS Inferentia2

**SageMaker Studio**: Jupyter Lab  
**Kernel**: Python3  

This exercise is divided into 2 parts:
 - Data prep + model alignment
 - **Model deployment + tests**

In this notebook you'll run the second part. You start by running a SageMaker job that compiles the model you aligned in the previous notebook for AWS Inferentia2.
Then you deploy a SageMaker endpoint with the compiled model and run some tests.

In [None]:
%pip install -U sagemaker

In [None]:
import os
import re
import boto3
import sagemaker

# Minimum SageMaker SDK version this notebook expects.
MIN_SAGEMAKER_VERSION = (2, 146, 0)


def _version_tuple(version, width=3):
    """Return the first `width` numeric components of a version string as ints.

    Comparing version strings directly is lexicographic ("2.20.0" >= "2.146.0"
    is True), so the components are compared numerically instead. Missing
    components are padded with zeros, e.g. "2.146" -> (2, 146, 0).
    """
    numbers = [int(n) for n in re.findall(r"\d+", version)[:width]]
    return tuple(numbers + [0] * (width - len(numbers)))


print(sagemaker.__version__)
if _version_tuple(sagemaker.__version__) < MIN_SAGEMAKER_VERSION:
    print("You need to upgrade or restart the kernel if you already upgraded")

sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
region = sess.boto_region_name

# training_job_name.txt is expected to hold the training job name on its first
# line and, optionally, the Hugging Face token on its second line.
training_job_name = ""
HF_TOKEN = ""
if os.path.isfile("training_job_name.txt"):
    # Use a context manager so the file handle is always closed.
    with open("training_job_name.txt", "r") as job_file:
        lines = [line.strip() for line in job_file]
    # Tolerate an empty file or a file without the token line.
    if lines:
        training_job_name = lines[0]
    if len(lines) > 1:
        HF_TOKEN = lines[1]
assert len(training_job_name)>0, "Please copy the name of the training_job you ran in the previous notebook and set training_job_name"

# S3 prefix built from the training job name; it is passed to the compile job
# as its "checkpoint" input channel.
checkpoint_s3_uri = f"s3://{bucket}/output/{training_job_name}/output/model"

# Local folder that receives requirements.txt and compile.py (written below).
os.makedirs("src", exist_ok=True)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {bucket}")
print(f"sagemaker session region: {region}")
print(f"Training job name: {training_job_name}")
print(f"Model S3 URI: {checkpoint_s3_uri}")
print(f"HF Token found? {HF_TOKEN != ''}")

## 1) Create compile/deploy artifacts
### 1.1) Dependencies descriptor
SageMaker installs the libraries listed in this file before it runs your script.

In [None]:
%%writefile src/requirements.txt
--extra-index-url https://pip.repos.neuron.amazonaws.com
git+https://github.com/huggingface/optimum-neuron@02c331d
trl==0.11.4
peft==0.13.2
neuronx-cc==2.15.128.0+56dc5a86
transformers-neuronx

### 1.2) Compile and deployment script
The code under the `if __name__ == "__main__":` guard is used to compile the model. The same script is then reused later to deploy a SageMaker endpoint.
For the model deployment, SageMaker only uses the functions defined before the `__main__` guard, for instance: **model_fn**, **predict_fn**, etc.

In [None]:
%%writefile src/compile.py
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

import os
import sys
import glob
import json
import torch
import shutil
import tarfile
import logging
import argparse
import traceback
from peft import PeftModel
from trl import setup_chat_format
from huggingface_hub import login
from optimum.neuron import NeuronModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

def model_fn(model_dir, context=None):
    """Load the tokenizer and the Neuron model stored in `model_dir`.

    Args:
        model_dir: Directory holding the artifacts saved by the compile step.
        context: Optional serving context; not used.

    Returns:
        A `(model, tokenizer)` tuple, which is what `predict_fn` unpacks.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)
    loaded_model = NeuronModelForCausalLM.from_pretrained(model_dir)
    return loaded_model, loaded_tokenizer

def input_fn(input_data, content_type, context=None):
    """Deserialize an inference request into the tuple consumed by `predict_fn`.

    Args:
        input_data: Request body; a JSON object such as
            {"prompt": "...", "temperature": 0.8, "top_p": 0.9}.
        content_type: MIME type of the request; only 'application/json' is supported.
        context: Optional serving context; not used.

    Returns:
        A `(prompt, temperature, top_p)` tuple. `temperature` defaults to 0.8
        and `top_p` to 0.9 when they are absent from the request.

    Raises:
        ValueError: If the payload is not a JSON object, or the prompt is
            missing, not a string, or shorter than 3 characters.
        Exception: If `content_type` is not 'application/json'.
    """
    if content_type != 'application/json':
        raise Exception(f"Unsupported mime type: {content_type}. Supported: application/json")

    req = json.loads(input_data)
    if not isinstance(req, dict):
        raise ValueError("Invalid payload. Provide an input like: {'prompt': 'text text text'}")

    prompt = req.get('prompt')
    temperature = req.get('temperature', 0.8)
    top_p = req.get('top_p', 0.9)
    # Raise a real exception: `raise("...")` with a plain string fails with
    # "TypeError: exceptions must derive from BaseException" and hides the message.
    if not isinstance(prompt, str) or len(prompt) < 3:
        raise ValueError("Invalid prompt. Provide an input like: {'prompt': 'text text text'}")
    return prompt, temperature, top_p

def predict_fn(input_object, model_tokenizer, context=None):
    """Generate a completion for one prompt.

    Args:
        input_object: `(prompt, temperature, top_p)` tuple as returned by `input_fn`.
        model_tokenizer: `(model, tokenizer)` tuple as returned by `model_fn`.
        context: Optional serving context; not used.

    Returns:
        A dict `{"response": <decoded text>}` holding only the tokens that
        follow the prompt in the generated sequence.
    """
    model, tokenizer = model_tokenizer
    prompt, temperature, top_p = input_object

    # Conversation handed to the chat template: the user prompt followed by an
    # empty assistant turn.
    # NOTE(review): the empty assistant turn is combined with
    # add_generation_prompt=True; confirm this matches the format used for alignment.
    messages = [
        {'content': prompt, 'role': 'user'},
        {'content': '', 'role': 'assistant'},
    ]
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

    # The generation budget is read from the MAX_SEQ_LEN environment variable (512 if unset).
    sampling_options = {
        "max_new_tokens": int(os.environ.get("MAX_SEQ_LEN", 512)),
        "eos_token_id": tokenizer.eos_token_id,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
    }
    outputs = model.generate(input_ids, **sampling_options)

    # Drop the prompt tokens so only the newly generated part is decoded.
    prompt_length = input_ids.shape[-1]
    completion = outputs[0][prompt_length:]
    return {"response": tokenizer.decode(completion, skip_special_tokens=True)}

if __name__ == "__main__":
    # Compile-time entry point: merge the adapter found in the checkpoint
    # channel into the base model, export the result with
    # NeuronModelForCausalLM and store everything the endpoint needs in the
    # model directory.
    parser = argparse.ArgumentParser()

    # (flag, type, default) for every supported command-line option.
    # SM_MODEL_DIR and SM_CHANNEL_CHECKPOINT must be present in the environment.
    option_specs = (
        ("--batch_size", int, 1),
        ("--tp_degree", int, 2),
        ("--max_seq_len", int, 512),
        ("--max_prompt_len", int, 256),
        ("--hf_token", str, None),
        ("--model_id", str, "meta-llama/Llama-3.2-1B"),
        ("--model_dir", str, os.environ["SM_MODEL_DIR"]),
        ("--checkpoint_dir", str, os.environ["SM_CHANNEL_CHECKPOINT"]),
    )
    for flag, value_type, default_value in option_specs:
        parser.add_argument(flag, type=value_type, default=default_value)

    # Unknown arguments are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    # Log in to the Hugging Face Hub only when a non-empty token was supplied.
    if args.hf_token:
        print("HF token defined. Logging in...")
        login(token=args.hf_token)

    # Keyword arguments forwarded to the Neuron export: compiler options
    # followed by the input shapes.
    export_options = {
        "num_cores": args.tp_degree,
        "auto_cast_type": 'bf16',
        "batch_size": args.batch_size,
        "sequence_length": args.max_seq_len,
        "prompt_len": args.max_prompt_len,
    }

    # The tokenizer comes from the checkpoint, the weights from the base model id.
    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_dir)
    base_model = AutoModelForCausalLM.from_pretrained(args.model_id).bfloat16()
    # The chat format is applied before the adapter is loaded.
    # NOTE(review): presumably this mirrors the setup used during training - confirm.
    base_model, tokenizer = setup_chat_format(base_model, tokenizer)

    # Load the trained Peft adapter (not trainable) on top of the base model
    # and fold it into the base weights.
    merged_model = PeftModel.from_pretrained(
        base_model,
        args.checkpoint_dir,
        is_trainable=False,
    ).merge_and_unload()
    merged_model.save_pretrained("merged_model")

    # Export the merged model and save it together with the tokenizer.
    neuron_model = NeuronModelForCausalLM.from_pretrained("merged_model", export=True, **export_options)
    neuron_model.save_pretrained(args.model_dir)
    tokenizer.save_pretrained(args.model_dir)

    # Ship this script (as inference.py) and its requirements inside the model
    # artifact under code/, so they can be used when the endpoint is deployed.
    code_dir = os.path.join(args.model_dir, 'code')
    os.makedirs(code_dir, exist_ok=True)
    shutil.copy(__file__, os.path.join(code_dir, "inference.py"))
    shutil.copy('requirements.txt', os.path.join(code_dir, 'requirements.txt'))

## 2) Kick-off the compiling job
First we create a [SageMaker Estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) with all the parameters we need to launch a compiling job.

It takes ~9 mins to compile a Llama3.2-1B model using 1 trn1.2xlarge.

In [None]:
import json
import logging
from sagemaker.pytorch import PyTorch

# Compilation settings; both values are reused when the endpoint is configured.
tp_degree = 2
max_seq_len = 512

# Command-line arguments that compile.py receives.
hyperparameters = dict(
    max_seq_len=max_seq_len,
    max_prompt_len=256,
    tp_degree=tp_degree,
    batch_size=1,
    model_id="meta-llama/Llama-3.2-1B",
)

# Forward the Hugging Face token only when one was loaded.
# NOTE(review): the token travels as a plain hyperparameter - confirm this is
# acceptable for your account.
if len(HF_TOKEN or "") > 3:
    hyperparameters["hf_token"] = HF_TOKEN

# Neuron training container used to run the compile script.
compile_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"

estimator = PyTorch(
    entry_point="compile.py",  # script executed by the job
    source_dir="src",
    role=role,
    sagemaker_session=sess,
    container_log_level=logging.DEBUG,
    instance_count=1,
    instance_type='ml.trn1.2xlarge',
    output_path=f"s3://{bucket}/output",
    disable_profiler=True,
    disable_output_compression=True,
    image_uri=compile_image_uri,
    env={'NEURON_RT_NUM_CORES': str(tp_degree)},
    volume_size=512,
    hyperparameters=hyperparameters,
)

In [None]:
## This will take ~9mins
# The "checkpoint" channel is what compile.py reads through SM_CHANNEL_CHECKPOINT.
input_channels = {"checkpoint": checkpoint_s3_uri}
estimator.fit(input_channels)

## 3) Deploy the compiled model to a SageMaker endpoint

In [None]:
import logging
from sagemaker.utils import name_from_base
from sagemaker.pytorch.model import PyTorchModel

# Artifact produced by the compile job.
model_data = estimator.model_data
print(f"Model data: {model_data}")

instance_type = "ml.inf2.xlarge"
num_workers = 1
print(f"Instance type: {instance_type}. Num SM workers: {num_workers}")

# Neuron inference container used by the endpoint.
inference_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"

# Environment of the serving container; MAX_SEQ_LEN is read by predict_fn.
endpoint_env = {
    'SAGEMAKER_MODEL_SERVER_TIMEOUT': '3600',
    'MAX_SEQ_LEN': str(max_seq_len),
    'NEURON_RT_NUM_CORES': str(tp_degree),
}

pytorch_model = PyTorchModel(
    image_uri=inference_image_uri,
    model_data=model_data,
    role=role,
    name=name_from_base('orpo-llama3'),
    sagemaker_session=sess,
    container_log_level=logging.DEBUG,
    model_server_workers=num_workers,
    framework_version="2.1.2",
    env=endpoint_env
    # for production it is important to define vpc_config and use a vpc_endpoint
    #vpc_config={
    #    'Subnets': ['<SUBNET1>', '<SUBNET2>'],
    #    'SecurityGroupIds': ['<SECURITYGROUP1>', '<DEFAULTSECURITYGROUP>']
    #}
)
# NOTE(review): this sets a private SDK attribute; its exact effect depends on
# the installed sagemaker version - confirm it is still needed after an upgrade.
pytorch_model._is_compiled_model = True

In [None]:
# Generous timeouts: it takes some time to download all the artifacts and load the model.
deploy_settings = {
    "initial_instance_count": 1,
    "instance_type": instance_type,
    "model_data_download_timeout": 3600,
    "container_startup_health_check_timeout": 1800,
}
predictor = pytorch_model.deploy(**deploy_settings)

## 4) Run some tests

In [None]:
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Send requests as JSON and parse JSON responses, matching the
# 'application/json' content type that input_fn accepts.
predictor.serializer, predictor.deserializer = JSONSerializer(), JSONDeserializer()

In [None]:
import time

# Alternative prompt to try:
#prompt = "Explain the traditional techniques involved in cultivating a Bonsai tree."
prompt = "June and Julia live 1 mile apart. It takes June 4 minutes to ride her bike directly to Julia's house. At the same rate, how many minutes would it take June to ride the 3.5 miles from her own house to Bernard's house?"

# Time a single round trip to the endpoint, reported in milliseconds.
request_payload = {"prompt": prompt}
started_at = time.time()
pred = predictor.predict(request_payload)
elapsed = (time.time() - started_at) * 1000

print(f"Elapsed time: {elapsed}ms")
print(f"Pred: {pred['response']}")

Done! :)