In [None]:
!pip install --upgrade --quiet sagemaker boto3 

In [None]:
import sagemaker 
import boto3 
from sagemaker.huggingface import HuggingFaceModel,  get_huggingface_llm_image_uri
import json
# Display the versions of the two SDKs imported above (rendered as the cell output).
sagemaker.__version__, boto3.__version__

In [None]:
# SageMaker session object; a later cell uses it to look up the default bucket.
sess = sagemaker.Session()

In [None]:
# NOTE(review): aws_region is not passed to the SageMaker session or to any boto3
# client in the cells visible here, so those presumably fall back to the
# environment's default region — confirm whether this value is meant to be used.
aws_region='us-west-2'
# Endpoint name shared by the deploy, status, invoke and cleanup cells.
endpoint_name = "mixtral-8x7b-instruct-smep"

In [None]:
# Default bucket reported by the SageMaker session; shown as the cell output.
# NOTE(review): no later cell visible here references this variable.
sagemaker_session_bucket = sess.default_bucket()
sagemaker_session_bucket

In [None]:
# Execution role that is later handed to HuggingFaceModel; shown as the cell output.
role = sagemaker.get_execution_role()
role

In [None]:
# Environment passed to the Hugging Face model container (see `env=hub` in the
# HuggingFaceModel cell). Every value is a string; the GPU count is serialized
# with json.dumps, so the container receives "8".
hub = dict(
    HF_MODEL_ID="mistralai/Mixtral-8x7B-Instruct-v0.1",
    # Temporary measure because of a bug in the upstream version of the model.
    REVISION="e0bbb53cee412aba95f3b3fa4fc0265b1a0788b2",
    SM_NUM_GPUS=json.dumps(8),
)

In [None]:
## Once version 1.3.3 is available, remove the REVISION field in the previous cell.
mixtral_8x7b_image_uri = get_huggingface_llm_image_uri(
    "huggingface",
    version="1.3.1",
)
# Show the resolved container image URI as the cell output.
mixtral_8x7b_image_uri

In [None]:
# Build the Hugging Face model object from the container image, the container
# environment and the execution role prepared in the cells above.
# (Variable name is kept as-is because the deploy cell refers to it.)
mistral_8x7b_model = HuggingFaceModel(
    role=role,
    image_uri=mixtral_8x7b_image_uri,
    env=hub,
)

In [None]:
# Kick off the endpoint deployment without blocking: wait=False lets this cell
# return before the endpoint is up, so the status has to be checked separately.
predictor = mistral_8x7b_model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.g5.48xlarge",
    initial_instance_count=1,
    container_startup_health_check_timeout=300,
    wait=False,
)

In [None]:
# boto3 SageMaker client; the cells below use it to check the endpoint status
# and to delete the endpoint resources during cleanup.
smc = boto3.Session().client('sagemaker')


In [None]:
## One-shot status check: the endpoint is ready once the status is 'InService'.
## Re-run this cell to refresh the status.
response = smc.describe_endpoint(EndpointName=endpoint_name)
print(f"{response['EndpointName']} status is {response['EndpointStatus']}")

## Decouple inference from endpoint deployment by invoking the endpoint through a boto3 client

In [None]:
# Request payload: the prompt under "inputs" plus the generation settings under
# "parameters". Key order matches the serialized request body.
inp_json = dict(
    inputs="[INST] Explain what a Mixture of Experts is in less than 100 words. [/INST]",
    parameters=dict(
        do_sample=True,
        top_p=0.6,
        temperature=0.9,
        top_k=50,
        max_new_tokens=1024,
        repetition_penalty=1.03,
        return_full_text=False,
        stop=["</s>"],
    ),
)
# JSON string that is sent as the request body when invoking the endpoint.
inp_request = json.dumps(inp_json)

In [None]:
# Runtime client used to send the JSON request to the deployed endpoint.
smr = boto3.client('sagemaker-runtime')
response = smr.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='application/json',
    Body=inp_request,
)
# The response body is a stream: read it, decode the bytes, and parse the JSON.
result = json.loads(
    response['Body'].read().decode()
)

In [None]:
# The parsed response is indexed as a list; print the generated text of its first entry.
print(result[0]['generated_text'])

In [None]:
## Clean up after you are done!
# Resolve what the endpoint references *before* deleting it, so that the endpoint
# config and the model(s) behind it are removed too instead of being left behind.
endpoint_config_name = smc.describe_endpoint(
    EndpointName=endpoint_name,
)['EndpointConfigName']
production_variants = smc.describe_endpoint_config(
    EndpointConfigName=endpoint_config_name,
)['ProductionVariants']
# A variant may not carry a ModelName; skip those and de-duplicate the rest.
model_names = sorted({
    variant['ModelName'] for variant in production_variants if variant.get('ModelName')
})

# Delete in dependency order: endpoint -> endpoint config -> model(s).
smc.delete_endpoint(EndpointName=endpoint_name)
smc.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
for model_name in model_names:
    smc.delete_model(ModelName=model_name)