In [10]:
roleName = "SageMakerForSearch"

In [32]:
import json

In [11]:
import sagemaker
import time 
import boto3
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName=roleName)['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

Couldn't call 'get_role' to get Role ARN from role name Administrator to get Role path.


sagemaker role arn: arn:aws:iam::113170463366:role/SageMakerForSearch
sagemaker session region: us-east-1


In [12]:
sagemaker_session_bucket

'sagemaker-us-east-1-113170463366'

In [13]:
from sagemaker.huggingface import get_huggingface_llm_image_uri,HuggingFaceModel

In [14]:
region = "us-east-1"

In [15]:
# deploy the Model.
image_uri = get_huggingface_llm_image_uri(
  backend="huggingface", # or lmi
  region=region
)

model_name = "falcon-7b--" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

hub = {
    'HF_MODEL_ID':'tiiuae/falcon-7b',
    'HF_TASK':'question-answering',
    'SM_NUM_GPUS':'1',
    'HF_MODEL_QUANTIZE':'bitsandbytes'
}

model = HuggingFaceModel(
    name=model_name,
    env=hub,
    role = role,
    image_uri=image_uri
)

In [16]:
model

<sagemaker.huggingface.model.HuggingFaceModel at 0x1d58b030fd0>

In [17]:
instance_type = "ml.g5.4xlarge"
#Other supported 1-GPU instance types: ml.g5.2xlarge, ml.g5.xlarge
#With instances with more GPUs, change the SM_NUM_GPUS value within the hub variable
#For example, using endpoint instance ml.g5.12xlarge, specify 'SM_NUM_GPUS':'4'


llm_predictor = model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  endpoint_name=model_name
)

----------!

In [18]:
llm_predictor.endpoint

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


'falcon-7b--2023-08-12-13-14-29'

In [19]:
predictor_name = llm_predictor.endpoint

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [20]:
deployed_llm_model = sagemaker.predictor.RealTimePredictor(
    endpoint_name=predictor_name)

The class RealTimePredictor has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [21]:
deployed_llm_model

<sagemaker.deprecations.deprecated_class.<locals>.DeprecatedClass at 0x1d5a22282e0>

In [22]:
# define payload
prompt = """You are an helpful Assistant, called Falcon. Knowing everyting about AWS.

User: Can you tell me something about Amazon SageMaker?
Falcon:"""

# hyperparameters for llm
payload = {
  "inputs": prompt,
  "parameters": {
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.8,
    "max_new_tokens": 1024,
    "repetition_penalty": 1.03,
    "stop": ["\nUser:","<|endoftext|>","</s>"]
  }
}

# send request to endpoint
response = llm_predictor.predict(payload)

# print assistant respond
assistant = response[0]["generated_text"][len(prompt):]

In [23]:
assistant

' You have the possibility to train and test your AI models in the cloud. You can choose between pre-trained models or your own models.\nUser:'

In [24]:
llm_predictor.endpoint

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


'falcon-7b--2023-08-12-13-14-29'

In [25]:
def query_endpoint_with_json_payload(encoded_json, endpoint_name, content_type="application/json"):
    client = boto3.client("runtime.sagemaker")
    response = client.invoke_endpoint(
        EndpointName=endpoint_name, ContentType=content_type, Body=encoded_json
    )
    return response

In [29]:
endpoint_name = 'falcon-7b--2023-08-12-13-14-29'

In [30]:
import json

In [35]:
response2 = query_endpoint_with_json_payload(json.dumps(payload).encode("utf-8"), endpoint_name)

In [36]:
model_predictions = json.loads(response2["Body"].read())


[{'generated_text': 'You are an helpful Assistant, called Falcon. Knowing everyting about AWS.\n\nUser: Can you tell me something about Amazon SageMaker?\nFalcon: Amazon SageMaker is a service that makes it easy to build, train, and deploy machine learning models. You can use SageMaker in the AWS Management Console, or using the API, AWS SDKs, and command-line tools.\nUser:'}]

In [39]:
model_predictions[0]['generated_text'][len(prompt):]

' Amazon SageMaker is a service that makes it easy to build, train, and deploy machine learning models. You can use SageMaker in the AWS Management Console, or using the API, AWS SDKs, and command-line tools.\nUser:'

In [None]:
llm_predictor.delete_model()
llm_predictor.delete_endpoint()