# Retrieval augmented generation (RAG) chatbot

## Setup environment
Select the _PyTorch 2.0.0 Python 3.10 CPU Optimized_ image for this notebook.

![](../static/img/notebook-image-kernel.png)

In [None]:
# Upgrade the sagemaker and boto3 packages; --quiet suppresses most of pip's output
!pip install sagemaker boto3 --upgrade --quiet

In [None]:
# Freshly upgraded packages are only picked up by a new kernel process,
# so ask the running kernel to shut down and restart.
import IPython

running_app = IPython.Application.instance()
running_app.kernel.do_shutdown(True)  # True -> restart after shutting down

In [None]:
import sagemaker
import boto3
import os
import json

# Show the installed SageMaker SDK version as the cell output
sagemaker.__version__

In [None]:
# Gather the session objects and identifiers needed to interact with SageMaker.
boto_session = boto3.Session()
region = boto_session.region_name

# A single SageMaker session is enough: reuse it for the default-bucket
# lookup instead of constructing a second, throwaway session.
sm_session = sagemaker.Session()
bucket_name = sm_session.default_bucket()
bucket_prefix = "genai-rag-workshop/knowledge-base"

# Build the service clients from the shared boto session so they all use the
# same region and credentials.
sm_client = boto_session.client("sagemaker")
sm_role = sagemaker.get_execution_role()
account_id = boto_session.client("sts").get_caller_identity()["Account"]

In [None]:
# Read the SageMaker domain id and user profile name from the notebook
# metadata file, when that file is present.
NOTEBOOK_METADATA_FILE = "/opt/ml/metadata/resource-metadata.json"

# Default both values so that later cells (the %store calls and the endpoint
# name f-strings) can reference them even when the metadata file is missing.
domain_id = None
user_profile_name = None

if os.path.exists(NOTEBOOK_METADATA_FILE):
    with open(NOTEBOOK_METADATA_FILE, "rb") as f:
        md = json.load(f)

    domain_id = md.get('DomainId')
    user_profile_name = md.get('UserProfileName')

    print(f"SageMaker domain id: {domain_id}\n"
          f"User profile name: {user_profile_name}")

In [None]:
# Persist these variables with IPython's %store magic so they can be
# restored later (e.g. with `%store -r`)
%store domain_id
%store user_profile_name
%store region
%store account_id
%store bucket_prefix

# With no arguments, %store lists everything currently stored
%store

### Check quotas

In [None]:
# Display the execution role returned by sagemaker.get_execution_role()
sm_role

In [None]:
# Check that the account has a non-zero quota for the instance type that
# will host the LLM endpoint.
quotas_client = boto3.client("service-quotas")
llm_instance_type = "ml.g5.12xlarge"

# Service Quotas codes keyed by instance type.
# NOTE(review): presumably these are the "endpoint usage" quota codes - verify.
quotas = {
    "ml.g5.12xlarge": "L-65C4BD00",
    "ml.g5.48xlarge": "L-0100B823",
    "ml.g4dn.xlarge": "L-B67CFA0C",
}

response = quotas_client.get_service_quota(
    ServiceCode="sagemaker",
    QuotaCode=quotas[llm_instance_type],
)

if response["Quota"]["Value"] == 0.0:
    # Raise a real exception class: raising a bare string is itself a
    # TypeError and would hide the intended message.
    raise RuntimeError(
        f"Please adjust your quota for the LLM Endpoint for type {llm_instance_type}"
    )
else:
    print(
        f"You have {response['Quota']['Value']} instances quotas for the LLM Endpoint of type {llm_instance_type}"
    )

## Host LLM

In [None]:
# Import HuggingFace classes
from sagemaker.huggingface import (
    get_huggingface_llm_image_uri, 
    HuggingFaceModel, 
    HuggingFacePredictor
)

### Option 1: Deploy using SageMaker JumpStart
The easiest option to deploy an LLM is to use [`JumpStartModel`](https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.jumpstart.model.JumpStartModel) Python SDK class. Refer to [Introduction to SageMaker JumpStart - Text Generation with Falcon models](https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_amazon_algorithms/jumpstart-foundation-models/text-generation-falcon.ipynb) for an example.

In [None]:
from sagemaker.jumpstart.model import JumpStartModel

# JumpStart identifier of the model to host
model_id = "huggingface-llm-falcon-40b-instruct-bf16"

# Endpoint name: last path segment of the model id with dots replaced by
# dashes, suffixed with the user profile name
endpoint_name = "{}-{}".format(
    model_id.rsplit("/", 1)[-1].replace(".", "-"),
    user_profile_name,
)

model = JumpStartModel(model_id=model_id)



In [None]:
# Deploy the JumpStart model to an endpoint with the name built above and keep the returned predictor
predictor = model.deploy(endpoint_name=endpoint_name)



Move to the **Test the LLM endpoint** section.

### Option 2: Deploy using HuggingFace TGI

<div class="alert alert-info"> 💡 
Don't deploy the second endpoint if you've already deployed using option 1.
</div>

If you'd like to use a model which is not onboarded to JumpStart, you can use a HuggingFace container.

[HuggingFace LLM DLC](https://huggingface.co/blog/sagemaker-huggingface-llm) is a new purpose-built Inference Container to easily deploy LLMs in a secure and managed environment. The DLC is powered by Text Generation Inference (TGI), an open-source, purpose-built solution for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation using Tensor Parallelism and dynamic batching for the most popular open-source LLMs, including StarCoder, BLOOM, GPT-NeoX, Llama, and T5. Text Generation Inference is already used by customers such as IBM, Grammarly, and the Open-Assistant initiative, and it implements optimizations for all supported model architectures.

We're going to use [Falcon-40B-Instruct](https://huggingface.co/tiiuae/falcon-40b-instruct) LLM, but feel free to try out any other LLM of your choice.

In [None]:
# Retrieve the URI of the HuggingFace LLM inference container image (version 0.8.2)
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.8.2")

# Print the resolved ECR image URI
print(f"llm image uri: {llm_image}")

In [None]:
# Model to host (id from hf.co/models) and the endpoint name derived from it
model_id = "tiiuae/falcon-40b-instruct"
endpoint_name = "{}-{}".format(
    model_id.rsplit("/", 1)[-1].replace(".", "-"),
    user_profile_name,
)

# SageMaker hosting settings
instance_type = llm_instance_type
number_of_gpu = 4
health_check_timeout = 1200

# Environment variables passed to the TGI container; values must be strings
config = {
    "HF_MODEL_ID": model_id,
    "HF_MODEL_REVISION": "1e7fdcc9f45d13704f3826e99937917e007cd975",
    # Number of GPUs used per replica
    "SM_NUM_GPUS": json.dumps(number_of_gpu),
    # Max length of the input text
    "MAX_INPUT_LENGTH": json.dumps(1024),
    # Max length of the generation (including the input text)
    "MAX_TOTAL_TOKENS": json.dumps(2048),
    # Uncomment to quantize the model:
    # "HF_MODEL_QUANTIZE": "bitsandbytes",
}

# Model definition: execution role + container image + container environment
llm_model = HuggingFaceModel(
    env=config,
    image_uri=llm_image,
    role=sm_role,
)



In [None]:
# Create an endpoint backed by a single instance of the chosen type.
# Model.deploy reference:
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
predictor = llm_model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    # Generous startup timeout so the container has time to load the model
    container_startup_health_check_timeout=health_check_timeout,
)



### Test the LLM endpoint

In [None]:
# Display the name of the endpoint that the cells below will call
endpoint_name

In [None]:
# Reuse the predictor created by a deploy cell when there is one; otherwise
# attach a new predictor to the existing endpoint by name.
if "predictor" not in globals():
    predictor = HuggingFacePredictor(endpoint_name=endpoint_name)

In [None]:
# Sample request: a short role-play prompt plus the generation parameters
endpoint_input = {
    'inputs': 'Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:',
    'parameters': {
        'max_new_tokens': 50,
        'top_k': 10,
        'return_full_text': False,
        'do_sample': True,
    },
}

# Send the request to the endpoint and show both the input and the output
response = predictor.predict(endpoint_input)
print(f"Inference:\nInput: {endpoint_input}\nResponse: {response}\n")

## Use Amazon Bedrock to access LLM
You can use Amazon Bedrock to access various LLMs via a single API.

In [None]:
# The Bedrock cells below expect boto3 1.28.57 or newer. Compare the version
# numerically: comparing the raw strings is lexicographic, so '1.9.0' would
# wrongly pass and '1.100.0' would wrongly fail.
min_boto3_version = (1, 28, 57)
installed_boto3_version = tuple(
    int(part) for part in boto3.__version__.split(".")[:3]
)
assert installed_boto3_version >= min_boto3_version, (
    f"boto3 >= 1.28.57 is required, found {boto3.__version__}"
)

In [None]:
# Client for the Bedrock service API
bedrock = boto3.client(service_name='bedrock')

# Display the "modelSummaries" entry of the list_foundation_models response
bedrock.list_foundation_models()["modelSummaries"]

In [None]:
# Look up the 'anthropic.claude-v2' foundation model and keep the response in `llm`
llm = bedrock.get_foundation_model(modelIdentifier='anthropic.claude-v2')

In [None]:
# Client for the Bedrock runtime API, which is used to invoke models
bedrock_runtime = boto3.client(service_name='bedrock-runtime')

# Target model and the content types of the request and the response
modelId = 'anthropic.claude-v2'
accept = 'application/json'
contentType = 'application/json'

# JSON request payload: the prompt plus the sampling parameters
body = json.dumps(
    {
        "prompt": "\n\nHuman:explain black holes to 8th graders\n\nAssistant:",
        "max_tokens_to_sample": 300,
        "temperature": 0.1,
        "top_p": 0.9,
    }
)

response = bedrock_runtime.invoke_model(
    modelId=modelId,
    contentType=contentType,
    accept=accept,
    body=body,
)

# Read the response body, parse it as JSON and print the generated text
response_body = json.loads(response.get('body').read())
print(response_body.get('completion'))

## Clean up

In [None]:
# Remove the SageMaker resources created by this notebook so that nothing is
# left behind: delete the model first, then the endpoint.
# NOTE(review): delete_model() is called first because the predictor is
# expected to resolve the model through the live endpoint - confirm against
# the SDK version in use.
predictor.delete_model()
predictor.delete_endpoint()