# Deploy fine-tuned llama-7B model

In [None]:
%pip install -U sagemaker -q

In [58]:
import logging
# Raise the "sagemaker.config" logger to WARNING before the SDK is imported below,
# presumably to keep its INFO-level configuration messages out of the notebook output.
sagemaker_config_logger = logging.getLogger("sagemaker.config")
sagemaker_config_logger.setLevel(logging.WARNING)

# Import the SageMaker SDK and create the session/role shared by the rest of the notebook
import sagemaker
from sagemaker import Model, image_uris, serializers

sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
role = sagemaker.get_execution_role()  # execution role for the endpoint

In [None]:
# Resolve the `djl-neuronx` 0.24.0 container image URI for the region the session is bound to.
aws_region = sess.boto_session.region_name
image_uri = image_uris.retrieve(
    framework="djl-neuronx",
    region=aws_region,
    version="0.24.0",
)
image_uri  # last expression in the cell, so the notebook displays the resolved URI

In [None]:
%%writefile serving.properties
# Serving configuration for the model container; the %%writefile magic above saves
# this cell's content as ./serving.properties.
engine=Python
option.entryPoint=djl_python.transformers_neuronx
# TODO: replace the whole placeholder value below with the S3 URL of the model artifacts.
# NOTE(review): .properties files generally do not support trailing comments, so the
# placeholder text is presumably read as the value if left in place — confirm.
option.model_id=##Paste S3 url of the model artifacts from the previous file
option.batch_size=1
option.tensor_parallel_degree=2
option.load_in_8bit=false
option.n_positions=512
option.rolling_batch=auto
option.dtype=fp16
# Same value (3600) as container_startup_health_check_timeout in the deploy cell.
option.model_loading_timeout=3600

In [None]:
%%sh
# Package serving.properties into mycode.tar.gz (single top-level directory "mycode/"),
# then remove the temporary staging directory.
#
# Abort on the first failing command or unset variable: without this, a missing
# serving.properties would only print an error from `mv`, and `tar` would still
# produce an archive containing an empty directory that gets uploaded later.
set -eu

# -p: do not fail if a previous, interrupted run left the staging directory behind.
mkdir -p mycode
mv serving.properties mycode/
tar czvf mycode.tar.gz mycode/
rm -rf mycode

In [None]:
# Upload the packaged serving configuration to the session's default bucket and
# keep the returned S3 location for the Model definition.
bucket = sess.default_bucket()  # bucket to house artifacts
s3_code_prefix = "inference/large-model-lmi/code"
code_artifact = sess.upload_data(
    path="mycode.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"Code uploaded to --- > {code_artifact}")

In [63]:
# Instance type that will host the endpoint; passed to model.deploy() in a later cell.
instance_type = "ml.inf2.24xlarge"
# Endpoint name derived from a fixed base via name_from_base
# (presumably appends a unique suffix so re-runs do not collide — confirm in the SDK docs).
endpoint_name = sagemaker.utils.name_from_base("llama-finetuned-model")

In [None]:
# Describe the deployable model: the container image plus the uploaded
# configuration archive, run under the notebook's execution role.
model = Model(
    image_uri=image_uri,
    model_data=code_artifact,
    role=role,
)

# NOTE(review): this sets a private SDK attribute by hand; presumably it marks the
# model as already compiled for the target hardware — confirm against the SDK
# version in use before relying on it.
model._is_compiled_model = True

# Create the endpoint. The 3600 health-check timeout matches
# option.model_loading_timeout in serving.properties.
model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    volume_size=256,
    container_startup_health_check_timeout=3600,
)

In [None]:
# Client handle for the deployed endpoint. Requests are serialized as JSON; no
# deserializer is configured here, and the inference cell decodes the response itself.
predictor = sagemaker.Predictor(
    serializer=serializers.JSONSerializer(),
    sagemaker_session=sess,
    endpoint_name=endpoint_name,
)

In [79]:
# Instruction-style prompt made of a preamble followed by "### Instruction",
# "### Input" and "### Response" sections. It ends right after the response
# header so the model's completion is the answer. Adjacent literals are
# concatenated by the parser, so the value is a single string.
prompt = (
    "Below is an instruction that describes a task, paired with an input that "
    "provides further context. Write a response that appropriately completes "
    "the request.\n\n"
    "### Instruction:\nWhat is an egg laying mammal?\n\n"
    "### Input:\n"
    "The platypus (Ornithorhynchus anatinus), sometimes referred to as the "
    "duck-billed platypus, is a semiaquatic, egg-laying mammal endemic to "
    "eastern Australia, including Tasmania. The platypus is the sole living "
    "representative or monotypic taxon of its family (Ornithorhynchidae) and "
    "genus (Ornithorhynchus), though a number of related species appear in "
    "the fossil record.\n\n\n\n"
    "### Response:\n"
)

In [None]:
import json

# Send the prompt to the endpoint, capping the completion at 100 new tokens.
request_payload = {"inputs": prompt, "parameters": {"max_new_tokens": 100}}
response = predictor.predict(request_payload)

# The response is handled as a UTF-8 byte string containing JSON.
decoded_response = json.loads(response.decode("utf-8"))

# Keep only the text before the first newline of the generated output
# (the whole text when it contains no newline).
generated_text = decoded_response["generated_text"]
generated_response = generated_text.partition("\n")[0]

print(generated_response)


In [None]:
# Cleanup: delete the endpoint, then its endpoint configuration, then the model.
# The endpoint configuration is addressed by the same name as the endpoint.
sess.delete_endpoint(endpoint_name=endpoint_name)
sess.delete_endpoint_config(endpoint_config_name=endpoint_name)
model.delete_model()