# Deploy Llama 3 8B Instruct to AWS Inferentia2 (Inf2) with SageMaker

In [None]:
# Upgrade the SageMaker Python SDK to its latest release and make sure
# huggingface_hub is installed.
# NOTE(review): huggingface_hub is not imported by any cell visible in this
# notebook — confirm it is still needed.
%pip install -U sagemaker
%pip install huggingface_hub

In [None]:
import boto3
import sagemaker
from sagemaker import Model, image_uris, serializers, deserializers
from pathlib import Path

In [None]:
# Deployment settings shared by the cells below.
role_name = "HuggingfaceExecuteRole"  # IAM role whose ARN is handed to SageMaker
region_name = "us-west-2"
djl_framework = "djl-neuronx"
djl_version = "0.27.0"

# One boto3 session, pinned to `region_name`, backs every AWS call. Without the
# explicit region the session falls back to whatever region the profile
# configures, so the S3 bucket and endpoint could end up in a different region
# than the container image looked up with `region_name`.
boto_session = boto3.Session(profile_name="default", region_name=region_name)

# Resolve the execution role ARN with the same credentials the SageMaker
# session uses.
iam_client = boto_session.client("iam")
role = iam_client.get_role(RoleName=role_name)["Role"]["Arn"]
sess = sagemaker.Session(boto_session=boto_session)


In [None]:
%%writefile serving.properties
# DJL Serving configuration for meta-llama/Meta-Llama-3-8B-Instruct on Inferentia2.
engine=Python
option.entryPoint=djl_python.transformers_neuronx
option.model_id=meta-llama/Meta-Llama-3-8B-Instruct
option.max_rolling_batch_size=8
# Must not exceed the number of NeuronCores on the endpoint instance.
# ml.inf2.8xlarge (the instance type this notebook deploys to) has a single
# Inferentia2 chip, i.e. 2 NeuronCores; 24 would require ml.inf2.48xlarge.
option.tensor_parallel_degree=2
option.n_positions=512
option.rolling_batch=auto
option.enable_mixed_precision_accumulation=true
option.enable_streaming=true
option.output_formatter=jsonlines
option.model_loading_timeout=3600

In [None]:
%%bash
# Package serving.properties into mymodel.tar.gz (under a top-level mymodel/
# directory) and remove the staging directory afterwards.
# Abort on the first failing command so that a missing serving.properties does
# not silently produce (and later upload) an empty archive.
set -euo pipefail
# -p: do not fail when a previous, interrupted run left the directory behind.
mkdir -p mymodel
mv serving.properties mymodel/
tar czvf mymodel.tar.gz mymodel/
rm -rf mymodel

In [None]:
# Upload the packaged serving configuration to the SageMaker session's default
# bucket; `code_artifact` receives the S3 URI returned by upload_data.
s3_code_prefix = "llama3-8b-instruct-lmi/code"
bucket = sess.default_bucket()  # bucket to house artifacts
code_artifact = sess.upload_data("mymodel.tar.gz", bucket, s3_code_prefix)
# The original message contained the HTML entity "&gt;" (an escaping artifact)
# instead of the intended ">" arrow.
print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")

In [None]:
# Look up the serving container image for the configured framework, version
# and region.
retrieve_args = {
    "framework": djl_framework,
    "region": region_name,
    "version": djl_version,
}
image_uri = image_uris.retrieve(**retrieve_args)

# Echo the resolved URI as the cell output.
image_uri

In [None]:
# Register the container image and the uploaded configuration as a SageMaker
# model. The session is passed explicitly so the model and endpoint are created
# with the same credentials/region that uploaded `code_artifact`; otherwise the
# SDK builds a fresh default session of its own.
model = Model(
    image_uri=image_uri,
    model_data=code_artifact,
    role=role,
    sagemaker_session=sess,
)

# ml.inf2.8xlarge has one Inferentia2 chip (2 NeuronCores); the
# tensor_parallel_degree written to serving.properties must not exceed that.
instance_type = "ml.inf2.8xlarge"

# Fixed endpoint name: deploying again while this endpoint exists will fail.
# sagemaker.utils.name_from_base("llama3-8b-instruct-model") yields a unique
# name instead.
endpoint_name = "llama3-8b-instruct-model"

# NOTE(review): the model id in serving.properties is a gated Hugging Face
# repository; the container presumably needs a Hugging Face access token
# (e.g. via the Model's `env`) to download it — confirm.

# Model.deploy() has no `region` parameter (the original `region=region_name`
# only fell into **kwargs); the endpoint region comes from the session.
# The 3600 s health-check timeout leaves time for model download/compilation.
model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=3600,
    volume_size=512,
    endpoint_name=endpoint_name,
)
region_name