# Building Knowledge-Powered Question Answering Applications using Amazon SageMaker and LlamaIndex
When creating LLM applications, customers often need to connect to and query external data sources, and then use the retrieved data as part of the prompt to the model.

## What is LlamaIndex?

[LlamaIndex](https://www.llamaindex.ai/) is a “data framework” that helps you build LLM apps. It provides tools that offer data connectors to ingest your existing data sources and data formats (APIs, PDFs, docs, SQL, etc.). For data stored in various sources and formats, such as databases or PDFs, LlamaIndex makes it easy to bring that data into use for LLMs. The LlamaIndex APIs make data access easier and enable you to create powerful custom LLM applications and workflows.

This notebook has been tested on Data Science 3.0 Kernel.

**In this notebook:**

1- install dependencies

2- deploy an embedding model using SageMaker Jumpstart 

3- deploy an LLM using SageMaker Jumpstart 

4- ingest PDF files and build index using LlamaIndex

5- build a RAG using LlamaIndex query engine

6- build a RAG using LlamaIndex and LangChain agents

Clean up




# 1- Install dependencies

In [None]:
!pip install langchain==0.1.0 llama_index pypdf

In [None]:
!pip install sagemaker --quiet --upgrade --force-reinstall


In [None]:
!pip install llama-index-embeddings-langchain
!pip install llama-index-llms-langchain

#### Load the libraries and session settings used across the notebook

In [None]:
import sagemaker
import boto3
import json
import time
# SageMaker session object created once for the notebook.
sess = sagemaker.Session()
# Default bucket reported by the session (not referenced by the other visible cells).
bucket = sess.default_bucket()  
# Region of the default boto3 session; passed to the LangChain endpoint wrappers
# and to the boto3 client in the clean-up cell.
aws_region = boto3.Session().region_name

In [None]:
import boto3
import botocore

In [None]:
import langchain
import llama_index
print(langchain.__version__)

# 2- Deploy the embedding model on SageMaker Jumpstart

In [None]:
# Run only if you haven't already used the JumpStart UI to deploy the GPT-J embedding model.
from sagemaker import image_uris, model_uris, script_uris, hyperparameters
from sagemaker.model import Model
from sagemaker.predictor import Predictor
import sagemaker

# Execution role handed to the embedding Model in the deployment cell.
role =sagemaker.get_execution_role()
instance_type = "ml.g5.2xlarge" # instance type to use for deployment
# "*" — presumably "any/latest version" for the JumpStart lookups; confirm against the SDK docs.
model_version = "*"
# Environment variables passed to the model container via Model(env=...).
# Both are set to "1" (the names suggest one worker per model server — TODO confirm).
env= {
            "SAGEMAKER_MODEL_SERVER_WORKERS": "1", 
            "TS_DEFAULT_WORKERS_PER_MODEL": "1"
    }
# Bare expression so the notebook displays the role as the cell output.
role

In [None]:
# JumpStart model id of the embedding model.
model_id = "huggingface-textembedding-gpt-j-6b-fp16"
# Retrieve the model uri.
model_uri = model_uris.retrieve(
    model_id=model_id, model_version=model_version, model_scope="inference"
)
# The embedding endpoint is named after the model id. Keep it in its own variable:
# `model_id` is rebound later for the LLM, and the clean-up cell deletes the
# embedding endpoint by this name.
embed_endpoint_name = model_id

In [None]:
# Retrieve the inference container uri. This is the base HuggingFace container image for the default model above.
deploy_image_uri = image_uris.retrieve(
    region=None,
    framework=None,  # automatically inferred from model_id
    image_scope="inference",
    model_id=model_id,
    model_version=model_version,
    instance_type=instance_type,
)

# Combine the container image, model artifacts, execution role and container
# environment into a SageMaker Model named after the model id.
model_inference = Model(
    image_uri= deploy_image_uri,
    model_data=model_uri,
    role=role,
    predictor_cls=Predictor,
    name=model_id,
    env=env,
)
# Deploy the model to an endpoint called `embed_endpoint_name`, requesting one
# instance of `instance_type`.
model_predictor_inference = model_inference.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    predictor_cls=Predictor,
    endpoint_name=embed_endpoint_name,
)
print(f"Model {model_id} has been deployed successfully.")

### Define a handler class to use LangChain with SageMaker-hosted endpoints

In [None]:
from typing import Dict, List
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
import json


class ContentHandler(EmbeddingsContentHandler):
    """Convert between LangChain embedding calls and the endpoint's JSON payloads."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, inputs: List[str], model_kwargs: Dict) -> bytes:
        """Return the UTF-8 encoded JSON request for *inputs*.

        The texts are sent under the "text_inputs" key; every entry of
        *model_kwargs* is merged into the same top-level object.
        """
        request = {"text_inputs": inputs, **model_kwargs}
        return json.dumps(request).encode('utf-8')

    def transform_output(self, output: bytes) -> List[List[float]]:
        """Return the value stored under "embedding" in the endpoint response.

        *output* is consumed through ``.read()``, i.e. it is treated as a
        readable byte stream.
        """
        raw_body = output.read()
        parsed = json.loads(raw_body.decode("utf-8"))
        return parsed["embedding"]

# Handler instance used by the embeddings client below.
emb_content_handler = ContentHandler()

# LangChain embeddings client backed by the SageMaker embedding endpoint.
# Use `embed_endpoint_name` rather than repeating the endpoint name as a string
# literal, so deployment, inference and clean-up all target the same endpoint
# even if the model id is changed.
embeddings = SagemakerEndpointEmbeddings(
    endpoint_name=embed_endpoint_name,
    region_name=aws_region,
    content_handler=emb_content_handler,
)

In [None]:
# Smoke test: embed one query string and show the vector size and first values.
text = "Hi! It's time for the beach"
text_embedding = embeddings.embed_query(text)

print(f"Your embedding is length {len(text_embedding)}")
print(f"Here's a sample: {text_embedding[:5]}...")

In [None]:
# Same check through the document API: embed a one-element list and inspect
# the first (only) returned vector.
doc_embedding = embeddings.embed_documents([text])

print(f"Your embedding is length {len(doc_embedding[0])}")
print(f"Here's a sample: {doc_embedding[0][:5]}...")

# 3- Deploy Llama-2-Chat from SageMaker JumpStart and define a handler for using the hosted endpoint with LangChain

In [None]:
import sagemaker
import json
# Execution role (same call as in the embedding section).
role = sagemaker.get_execution_role()

from sagemaker.jumpstart.model import JumpStartModel

# JumpStart model id and version constraint for the LLM.
# NOTE: this rebinds `model_id` and `model_version` from the embedding section.
model_id, model_version = "meta-textgeneration-llama-2-70b-f", "2.*"
# The LLM endpoint is named after the model id; the clean-up cell deletes it by this name.
endpoint_name = model_id


In [None]:
# Build the JumpStart model object for the selected id and version.
my_model = JumpStartModel(model_id = model_id, 
                          model_version = model_version,)
# Deploy to an endpoint named `endpoint_name`; accept_eula=True is passed explicitly.
# The returned predictor is used for the direct invocation below.
predictor = my_model.deploy(endpoint_name = endpoint_name, accept_eula=True)

In [None]:
def print_dialog(payload, response):
    """Print the first dialog of *payload* followed by the generated reply.

    Each message of ``payload["inputs"][0]`` is printed as "Role: content".
    The reply is read from ``response[0]["generation"]`` and printed with a
    leading "> ", followed by a separator line.
    """
    conversation = payload["inputs"][0]
    for turn in conversation:
        speaker = turn['role'].capitalize()
        print(f"{speaker}: {turn['content']}\n")

    reply = response[0]['generation']
    print(f"> {reply['role'].capitalize()}: {reply['content']}")
    print("\n==================================\n")

In [None]:
# Request body: one dialog containing a single user message, plus generation parameters.
payload = {
    "inputs": [[
        {"role": "user", "content": "what is the recipe of mayonnaise?"},
    ]],
    "parameters": {"max_new_tokens": 512, "top_p": 0.9, "temperature": 0.6}
}
# Invoke the endpoint directly through the predictor, sending the EULA acceptance
# as a custom attribute, then pretty-print the exchange.
response = predictor.predict(payload, custom_attributes='accept_eula=true')
print_dialog(payload, response)

In [None]:
# System message that the LLM content handler below sends ahead of every user prompt.
system_prompt = """You are a helpful assistant. Always answer to questions as helpfully as possible. If you don't know the answer to a question, say I don't know the answer"""


In [None]:
from typing import Dict
from langchain.llms import SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler
import json

class ContentHandler(LLMContentHandler):
    """Convert between LangChain prompts and the chat-style JSON of the LLM endpoint.

    Defining this class rebinds the name ``ContentHandler`` that the embedding
    section used earlier in the notebook.
    """

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: dict) -> bytes:
        """Return the UTF-8 encoded JSON request for *prompt*.

        The request holds one dialog made of the module-level ``system_prompt``
        (looked up at call time) and the user prompt; *model_kwargs* is sent
        unchanged under "parameters".
        """
        system_turn = {"role": "system", "content": system_prompt}
        user_turn = {"role": "user", "content": prompt}
        request = {
            "inputs": [[system_turn, user_turn]],
            "parameters": model_kwargs,
        }
        return json.dumps(request).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Return the generated message content of the first response entry.

        *output* is consumed through ``.read()``, i.e. it is treated as a
        readable byte stream.
        """
        decoded = output.read().decode("utf-8")
        return json.loads(decoded)[0]["generation"]["content"]
        
# Handler instance for the LLM endpoint.
content_handler = ContentHandler()

# LangChain LLM wrapper around the deployed endpoint. The generation parameters
# are forwarded with every request; the EULA acceptance is sent as a custom
# attribute on each invocation.
llm = SagemakerEndpoint(
    endpoint_name=endpoint_name,
    region_name=aws_region,
    model_kwargs={
        "max_new_tokens": 500,
        "top_p": 0.1,
        "temperature": 0.4,
        "return_full_text": False,
    },
    content_handler=content_handler,
    endpoint_kwargs={"CustomAttributes": "accept_eula=true"},
)

# Smoke test; as the last expression of the cell, the answer is shown as its output.
llm("what is amazon sagemaker?")

# 4- Use LlamaIndex to ingest pdfs and build the index

In [None]:
import pypdf
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader


# Load the contents of the local "pressrelease" directory.
docs = SimpleDirectoryReader(input_dir="pressrelease").load_data()
print(f"Loaded {len(docs)} docs")
# Uncomment to inspect the loaded documents:
#docs

# 5- Simple RAG using LlamaIndex Query Engine

In [None]:
from llama_index.core import VectorStoreIndex, Settings
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.langchain import LangChainLLM

# Register the SageMaker-backed LangChain embeddings as LlamaIndex's global embedding model.
Settings.embed_model = LangchainEmbedding(embeddings)
# Register the SageMaker-backed LangChain LLM as LlamaIndex's global LLM.
Settings.llm = LangChainLLM(llm) 
   

In [None]:
# Build a vector store index from the loaded documents.
index = VectorStoreIndex.from_documents(docs)
# Query engine with default settings.
query_engine = index.as_query_engine() # optional tweak: as_query_engine(similarity_top_k=2)
# Ask a question about the press releases and print the engine's response.
print(query_engine.query("Since migrating to AWS in May, how much in operational cost Yellow.ai has reduced?"))

# 6- Use together with an agent in Langchain

In [None]:
from langchain.agents import Tool, AgentType, initialize_agent
from llama_index.core.langchain_helpers.agents import (
    IndexToolConfig,
    LlamaIndexTool,
)


def _query_pressrelease(question: str) -> str:
    """Run *question* against the press-release index and return the answer as text."""
    return str(index.as_query_engine().query(question))


# Tools offered to the LangChain agent. `return_direct=True` is set so the
# tool's answer is handed back as the agent's result.
tools = [
    Tool(
        name="Pressrelease",
        func=_query_pressrelease,
        # The description is part of the prompt the agent's LLM reads when
        # choosing a tool, so the misspelling ("relevnat") is corrected here.
        description="useful pressreleases for answering relevant questions",
        return_direct=True,
    ),
]


In [None]:
from langchain.agents import load_tools, initialize_agent

# Build the agent from the press-release tool and the SageMaker-hosted LLM;
# verbose=True is passed so the run is logged.
agent = initialize_agent(
    tools,
    llm,
    agent="chat-zero-shot-react-description",
    verbose=True,
)

In [None]:
# Ask the agent the same question that was sent to the query engine in section 5.
agent.run("Since migrating to AWS in May, how much in operational cost Yellow.ai has reduced?")

# Clean up



In [None]:
# Clean-up: remove both endpoints so they stop incurring charges.
client = boto3.client('sagemaker', region_name=aws_region)

# Delete each endpoint BEFORE its endpoint configuration. An endpoint
# configuration must not be deleted while a live endpoint still uses it.
# Both endpoints were deployed with an explicit endpoint name; the configuration
# is looked up under that same name.

# delete embedding endpoint
client.delete_endpoint(EndpointName=embed_endpoint_name)
client.delete_endpoint_config(EndpointConfigName=embed_endpoint_name)

# delete LLM endpoint
client.delete_endpoint(EndpointName=endpoint_name)
client.delete_endpoint_config(EndpointConfigName=endpoint_name)

# NOTE(review): the SageMaker model objects created by the deploy calls are not
# removed here — delete them separately if they are no longer needed.
