In [None]:
import json
import boto3
import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

In [None]:
# Start with a session built from the SDK defaults.
session = sagemaker.Session()

# Set this to a bucket name to use a specific bucket; None means "use the default".
sagemaker_session_bucket = None

# No bucket was chosen above, so fall back to the session's default bucket.
if session is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = session.default_bucket()

In [None]:
# Resolve the IAM role ARN. A ValueError from get_execution_role() triggers the
# fallback, which looks the role up by its fixed name through the IAM API.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_lookup = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_lookup['Role']['Arn']

# Rebuild the session so that it uses the bucket selected earlier.
session = sagemaker.Session(default_bucket=sagemaker_session_bucket)

In [None]:
# Echo the resolved role and region as a sanity check before deploying anything.
print(f"sagemaker role arn: {role}")
# The session object in this notebook is named `session`; `sess` is never defined
# and would raise NameError on a fresh kernel.
print(f"sagemaker session region: {session.boto_region_name}")

In [None]:
# Look up the URI of the Hugging Face LLM inference container, pinned to version 0.8.2.
# The first parameter of get_huggingface_llm_image_uri is `backend`; the function
# has no `framework` keyword, so the backend name is passed positionally.
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="0.8.2"
)

In [None]:
# Show which container image was resolved so it can be checked before deployment.
image_uri_message = f"llm image uri: {llm_image}"
print(image_uri_message)

In [None]:
# Endpoint hardware settings. number_of_gpu is forwarded to the container as
# SM_NUM_GPUS, so it presumably has to match the GPU count of instance_type —
# confirm when changing either value.
instance_type, number_of_gpu = "ml.g5.12xlarge", 4

In [None]:
# Settings handed to the model as its `env` mapping.
# Numeric values are serialized with json.dumps so that every value is a string.
config = dict(
  HF_MODEL_ID="tiiuae/falcon-40b-instruct",  # model id to serve
  SM_NUM_GPUS=json.dumps(number_of_gpu),     # GPU count configured for the instance
  MAX_INPUT_LENGTH=json.dumps(1024),         # presumably the prompt-length cap in tokens — confirm
  MAX_TOTAL_TOKENS=json.dumps(2048),         # presumably the prompt + generation cap in tokens — confirm
  # HF_MODEL_QUANTIZE="bitsandbytes",
)

In [None]:
# Describe the model to deploy: the container image, its environment settings,
# and the IAM role it runs under. Nothing is deployed by this cell.
llm_model = HuggingFaceModel(image_uri=llm_image, env=config, role=role)

In [None]:
# Deploy the model to a single-instance endpoint and keep the returned predictor.
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  # volume_size=400
  # A 40B-parameter model takes a long time to download and load, so give the
  # container up to 10 minutes to pass its startup health check rather than
  # relying on the service default. NOTE(review): tune this value if startup
  # still times out.
  container_startup_health_check_timeout=600,
)

In [None]:
# Chat-style prompt: a system line, one user turn, and an open "Falcon:" turn
# for the model to complete. The empty first and last entries reproduce the
# leading and trailing newlines of the prompt text.
prompt = "\n".join([
  "",
  "You are a helpful Assistant, called Falcon. You know everything about AWS.",
  "User: Can you tell me something about Amazon SageMaker?",
  "Falcon:",
  "",
])

In [None]:
# Request body for the endpoint: the prompt under "inputs" and the generation
# settings under "parameters".
payload = dict(
  inputs=prompt,
  parameters=dict(
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    max_new_tokens=1024,
    repetition_penalty=1.03,
    stop=["\nUser:", "<|endoftext|>", "</s>"],
  ),
)

In [None]:
# Send the request to the endpoint.
response = llm.predict(payload)

# The response is iterated as a sequence of items, each carrying a 'generated_text' entry.
for generation in response:
    print(f"Result: {generation['generated_text']}")

In [None]:
# Question-answering prompt: a context paragraph, a user question about it, and
# an open "Falcon:" turn. Adjacent literals are concatenated into one string that
# starts and ends with a newline.
prompt = (
  "\n"
  "Teplizumab traces its roots to a New Jersey drug company called Ortho Pharmaceutical. There, scientists generated an early version of the antibody, dubbed OKT3. Originally sourced from mice, the molecule was able to bind to the surface of T cells and limit their cell-killing potential. In 1986, it was approved to help prevent organ rejection after kidney transplants, making it the first therapeutic antibody approved for human use.\n"
  "User: What was OKT3 originally sourced from?\n"
  "Falcon:\n"
)

In [None]:
# Request body for the question-answering prompt; the generation settings are
# the same as in the previous request.
payload = dict(
  inputs=prompt,
  parameters=dict(
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    max_new_tokens=1024,
    repetition_penalty=1.03,
    stop=["\nUser:", "<|endoftext|>", "</s>"],
  ),
)

In [None]:
# Ask the endpoint to answer the question about the context paragraph.
response = llm.predict(payload)

# Print the 'generated_text' entry of every item in the response.
for generation in response:
    print(f"Result: {generation['generated_text']}")

In [None]:
# Zero-shot sentiment prompt: one tweet followed by an open "Sentiment:" label.
# The empty first and last entries reproduce the leading and trailing newlines.
prompt = "\n".join([
  "",
  'Tweet: "This new music video was incredible."',
  "Sentiment:",
  "",
])

In [None]:
# Request body for the zero-shot sentiment prompt; the generation settings are
# unchanged from the earlier requests.
payload = dict(
  inputs=prompt,
  parameters=dict(
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    max_new_tokens=1024,
    repetition_penalty=1.03,
    stop=["\nUser:", "<|endoftext|>", "</s>"],
  ),
)

In [None]:
# Run the zero-shot sentiment request against the endpoint.
response = llm.predict(payload)

# Print the 'generated_text' entry of every item in the response.
for generation in response:
    print(f"Result: {generation['generated_text']}")

In [None]:
# Few-shot sentiment prompt: three labelled example tweets separated by "###",
# then the tweet to classify with an open "Sentiment:" label.
# The empty first and last entries reproduce the leading and trailing newlines.
prompt = "\n".join([
  "",
  'Tweet: "I hate it when my phone battery dies."',
  "Sentiment: Negative",
  "###",
  'Tweet: "My day has been 👍."',
  "Sentiment: Positive",
  "###",
  'Tweet: "This is the link to the article."',
  "Sentiment: Neutral",
  "###",
  'Tweet: "This new music video was incredible."',
  "Sentiment:",
  "",
])

In [None]:
# Request body for the few-shot sentiment prompt; the generation settings are
# unchanged from the earlier requests.
payload = dict(
  inputs=prompt,
  parameters=dict(
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    max_new_tokens=1024,
    repetition_penalty=1.03,
    stop=["\nUser:", "<|endoftext|>", "</s>"],
  ),
)

In [None]:
# Run the few-shot sentiment request against the endpoint.
response = llm.predict(payload)

# Print the 'generated_text' entry of every item in the response.
for generation in response:
    print(f"Result: {generation['generated_text']}")

In [None]:
# Tear down what the deployment created: the model first, then the endpoint.
for cleanup_step in (llm.delete_model, llm.delete_endpoint):
    cleanup_step()