### 1. Update environment

In [None]:
!pip install --upgrade boto3 
!pip install --upgrade sagemaker 
!pip install nvidia-pyindex && pip install nvidia-cublas
!sudo yum update -y
!sudo yum install git-lfs git -y

In [None]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

# Shared SageMaker/AWS handles used by the rest of the notebook.
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Use the public `boto_region_name` property rather than the private
# `_region_name` attribute, which is an implementation detail of Session.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level boto3 clients: S3, the SageMaker control plane, and the SageMaker runtime.
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

In [None]:

# Choose where the model comes from: regions whose name contains "cn-" use the
# wisemodel.cn git mirror, every other region downloads a single GGUF file
# straight from Hugging Face.
if "cn-" not in region:
    repository = "TheBloke/Firefly-Llama2-13B-v1.2-GGUF"  # todo:find one repo
    repo = "https://huggingface.co/TheBloke/Firefly-Llama2-13B-v1.2-GGUF/resolve/main/firefly-llama2-13b-v1.2.Q2_K.gguf"
else:
    repository = "TheBloke/Llama-2-13B-chat-GGML"  # todo:find one repo
    url_suffix = f"{repository}.git"
    repo = f"https://www.wisemodel.cn/{url_suffix}"

# The last path component of the repository doubles as model id, S3 prefixes
# and local/S3 folder name.
model_id = repository.rsplit("/", 1)[-1]
s3_model_prefix = model_id
s3_code_prefix = model_id
directory_name = model_id  # it's name of your folders

# specify your s3 dir to store model repo example-> s3://your-bucket-name/your-s3-dir
bucket = f"sagemaker-{region}-{account_id}"

# Write an empty "<directory_name>/" key so the folder shows up in the bucket.
s3 = boto3.client("s3")
s3.put_object(Bucket=bucket, Key=f"{directory_name}/")



In [None]:
!mkdir $model_id


In [None]:
# S3 prefix and local folder that both carry the model id as their name.
s3_location = f"s3://{bucket}/{model_id}/"
local_model_dir = f"./{model_id}/"

# Show the resolved values, one per line, so they can be checked before downloading.
print(s3_location, repo, local_model_dir, model_id, sep="\n")

In [None]:
# !git lfs install
# !git clone $repo
#!wget $repo -P $local_model_dir
!wget $repo

In [None]:
# Remove any Jupyter checkpoint folder from the model directory so it is not archived.
!cd $local_model_dir && rm -rf ".ipynb_checkpoints"
# Create an empty placeholder file named `model` inside the model directory.
!touch $local_model_dir/model
# Pack everything matched by the model directory glob into a gzip-compressed model.tar.gz.
!tar czvf model.tar.gz $local_model_dir/*

In [None]:
# Push the archive to s3://<bucket>/<directory_name>/ and keep the value
# returned by upload_data; it is passed as model_data when the model is created.
s3_model = sess.upload_data(
    path="model.tar.gz",
    bucket=bucket,
    key_prefix=directory_name,
)
#!aws s3 sync $local_model_dir $s3_location

### 2. Make docker image
##### Note: you can modify the Dockerfile to ADD different model weights.

In [None]:
# Build the image from the current directory (the build context) and tag it
# locally as llama2-13b-cpp-python-sagemaker.
!docker build -t llama2-13b-cpp-python-sagemaker .

In [None]:
# AWS ECR Login,
!docker login -u AWS -p $(aws ecr get-login-password --region $region) https://{account_id}.dkr.ecr.{region}.amazonaws.com

# Create ECR Repo
!aws ecr create-repository --repository-name llama2-13b-cpp-python-sagemaker --image-scanning-configuration scanOnPush=true --image-tag-mutability MUTABLE

# Tag Image
!docker tag llama2-13b-cpp-python-sagemaker:latest {account_id}.dkr.ecr.{region}.amazonaws.com/llama2-13b-cpp-python-sagemaker:latest

#Push Image to ECR
!docker push {account_id}.dkr.ecr.{region}.amazonaws.com/llama2-13b-cpp-python-sagemaker:latest

### 3. Deploy the SageMaker endpoint (run this Python snippet in a Jupyter notebook or on EC2)


In [None]:
from sagemaker import get_execution_role
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.session import Session
# ECR URI of the serving image (no tag given, so `latest` is used).
# China ("cn-") regions serve ECR from the amazonaws.com.cn domain.
ecr_domain = "amazonaws.com.cn" if "cn-" in region else "amazonaws.com"
image_uri = account_id + '.dkr.ecr.' + region + '.' + ecr_domain + '/llama2-13b-cpp-python-sagemaker'
# This can be dummy model file
model_dir = s3_model

# Create the SageMaker model instance
model = Model(
    image_uri=image_uri,
    role=sagemaker.get_execution_role(),
    model_data=model_dir
)
endpoint_name = 'pytorch-inference-llm-v1'

import boto3

client = boto3.client('sagemaker')
# Best-effort removal of a stale endpoint config with the same name, left over
# from a previous run. Only AWS API errors (e.g. the config does not exist) are
# tolerated; anything else, such as a typo or a network failure, still surfaces.
try:
    response = client.delete_endpoint_config(EndpointConfigName=endpoint_name)
except client.exceptions.ClientError as err:
    print("Skipping endpoint config cleanup:", err)

# Create the endpoint config and endpoint, and start one instance serving the model.
model.deploy(
    instance_type='ml.p3.2xlarge',
    initial_instance_count=1,
    endpoint_name=endpoint_name,
)

#### Invoke SageMaker Endpoint


In [None]:
import boto3
import time

# Build the JSON request body for a single question.
data = {"ask": "how to learn english?"}
body = json.dumps(data)

runtime_sagemaker_client = boto3.client(service_name="sagemaker-runtime")

#endpoint_name = 'pytorch-inference-llm-v1'

# Time only the round trip of the invoke call itself.
start = time.time()
response = runtime_sagemaker_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=body,
)
cost = time.time() - start

# The response body is a stream: read it fully and decode it as UTF-8 text.
result = response['Body'].read().decode('utf-8')

# Report the answer, the latency and a rough characters-per-second throughput.
print('Response: ', result)
print(f"Cost Time:  {cost} seconds")
print('Output Chars :', len(result))
print(f'Speed: {len(result) / float(cost):.2f} Chars/s')

### 4. Clean up the endpoint and endpoint config

In [None]:
# Cleanup is left disabled; uncomment both lines to delete the endpoint and
# the endpoint config that share the name stored in `endpoint_name`.
# !aws sagemaker delete-endpoint --endpoint-name $endpoint_name
# !aws sagemaker delete-endpoint-config --endpoint-config-name $endpoint_name
