# Conversational Search

In [None]:
!pip install ipywidgets==8.1.0 --quiet
!pip install --upgrade sagemaker --quiet

In [None]:
!pip install langchain --quiet

In [None]:
import sagemaker, boto3, json
from sagemaker.session import Session
from ipywidgets import Dropdown

# Region name as configured on the default boto3 session.
aws_region = boto3.Session().region_name

# Two SageMaker session handles are kept under separate names. The first one
# supplies the caller-identity ARN that is later passed as `role` when the
# models are constructed.
sagemaker_session = Session()
sess = sagemaker.Session()
aws_role = sagemaker_session.get_caller_identity_arn()

## Deploy embedding model

In [None]:
# Identifier of the text-embedding model to deploy, and the version spec that
# is handed to the image/model URI lookups ("*" is passed through unchanged;
# presumably "any/latest version" -- confirm against the JumpStart docs).
embedding_model_id = "huggingface-textembedding-gpt-j-6b-fp16"
embedding_model_version = "*"

In [None]:
from sagemaker import image_uris, model_uris, script_uris, hyperparameters
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base


embedding_instance_type = "ml.g5.2xlarge"
embedding_endpoint_name = name_from_base(f"RAG-embedding-{embedding_model_id}")

# Inference container image for the selected model. framework=None lets the
# SDK infer the framework from model_id.
embedding_deploy_image_uri = image_uris.retrieve(
    model_id=embedding_model_id,
    model_version=embedding_model_version,
    instance_type=embedding_instance_type,
    image_scope="inference",
    framework=None,
    region=None,
)

# Location of the model artifact for the inference scope.
embedding_model_uri = model_uris.retrieve(
    model_scope="inference",
    model_id=embedding_model_id,
    model_version=embedding_model_version,
)

# The generated name is used both for the model and for its endpoint.
embedding_model = Model(
    name=embedding_endpoint_name,
    role=aws_role,
    image_uri=embedding_deploy_image_uri,
    model_data=embedding_model_uri,
    predictor_cls=Predictor,
)

# Predictor is passed again at deploy time so that inference can be run
# through the SageMaker API. wait=False returns without blocking; a later
# cell polls the endpoint status until creation has finished.
embedding_model_predictor = embedding_model.deploy(
    endpoint_name=embedding_endpoint_name,
    instance_type=embedding_instance_type,
    initial_instance_count=1,
    predictor_cls=Predictor,
    wait=False,
)

## Deploy content generation model

In [None]:
# Default selection and version spec for the content generation model.
llm_model_id = "huggingface-llm-falcon-7b-instruct-bf16"
llm_model_version = "*"

# Falcon variants offered in the dropdown.
llm_model_ids = [
    "huggingface-llm-falcon-40b-bf16",
    "huggingface-llm-falcon-40b-instruct-bf16",
    "huggingface-llm-falcon-7b-bf16",
    "huggingface-llm-falcon-7b-instruct-bf16",
]

# Dropdown pre-set to the default model; its value is read back in the next cell.
model_dropdown = Dropdown(
    description="Select a model",
    options=llm_model_ids,
    value=llm_model_id,
    layout={"width": "max-content"},
    style={"description_width": "initial"},
)
display(model_dropdown)

In [None]:
# Replace the default model id with whatever is currently selected in the
# dropdown widget, and echo the choice.
llm_model_id = model_dropdown.value
print(llm_model_id)

In [None]:
# Instance type for the LLM endpoint, and the value later passed as
# container_startup_health_check_timeout when deploying.
llm_inference_instance_type = "ml.g5.4xlarge"
health_check_timeout = 1800

# Endpoint name derived from the selected model id.
llm_endpoint_name = name_from_base(f"RAG-LLM-{llm_model_id}")

In [None]:
from sagemaker.jumpstart.model import JumpStartModel

# Build the JumpStart model for the selected id, override the SM_NUM_GPUS
# container environment variable, then start the deployment without waiting
# for the endpoint to come up (a later cell polls its status).
llm_model = JumpStartModel(
    instance_type=llm_inference_instance_type,
    model_id=llm_model_id,
)
llm_model.env["SM_NUM_GPUS"] = "1"
llm_model_predictor = llm_model.deploy(
    wait=False,
    container_startup_health_check_timeout=health_check_timeout,
    endpoint_name=llm_endpoint_name,
)

## Alternative: deploy the content generation model with the lower-level `Model` API

In [None]:
# Deploys the LLM through the generic `Model` class with an explicit container
# environment instead of `JumpStartModel`.
#
# This cell rebinds `llm_endpoint_name`, `llm_model` and `llm_model_predictor`,
# so after running it the later cells target the endpoint created here rather
# than the one created by the JumpStartModel cell.
number_of_gpu = 4
max_input_length = 1024
max_total_tokens = 2048

# Container environment; numeric values are serialised to strings via json.dumps.
# NOTE(review): HF_MODEL_ID is fixed to falcon-40b-instruct while the image and
# model URIs below are resolved from `llm_model_id` (default: the 7b-instruct
# variant) -- confirm these are meant to agree.
# NOTE(review): SM_NUM_GPUS is 4 while the instance type used below is the
# `llm_inference_instance_type` defined earlier ("ml.g5.4xlarge") -- confirm
# that instance actually exposes 4 GPUs.
model_env = {
    'HF_MODEL_ID': "tiiuae/falcon-40b-instruct",
    'SM_NUM_GPUS': json.dumps(number_of_gpu),
    'MAX_INPUT_LENGTH': json.dumps(max_input_length),
    'MAX_TOTAL_TOKENS': json.dumps(max_total_tokens),
}

# A fresh endpoint name is generated; the previous value is overwritten.
llm_endpoint_name = name_from_base(f"RAG-LLM-{llm_model_id}")

# Retrieve the inference container image uri for the selected model.
llm_deploy_image_uri = image_uris.retrieve(
    region=None,
    framework=None,  # automatically inferred from model_id
    image_scope="inference",
    model_id=llm_model_id,
    model_version=llm_model_version,
    instance_type=llm_inference_instance_type,
)

# Retrieve the model uri.
llm_model_uri = model_uris.retrieve(
    model_id=llm_model_id, model_version=llm_model_version, model_scope="inference"
)


# The generated endpoint name doubles as the model name.
llm_model = Model(
    image_uri=llm_deploy_image_uri,
    model_data=llm_model_uri,
    role=aws_role,
    predictor_cls=Predictor,
    name=llm_endpoint_name,
    env=model_env
)

# deploy the Model. Note that we need to pass Predictor class when we deploy model through Model class,
# for being able to run inference through the sagemaker API.
# wait=False returns without blocking; a later cell polls the endpoint status.
llm_model_predictor = llm_model.deploy(
    initial_instance_count=1,
    instance_type=llm_inference_instance_type,
    predictor_cls=Predictor,
    endpoint_name=llm_endpoint_name,
    container_startup_health_check_timeout=health_check_timeout,
    wait=False
)

## Get deployed endpoint for embedding and content generation model

### Get endpoint for embedding

In [None]:
# Show which embedding endpoint name the following cells will poll and call.
print(embedding_endpoint_name)

In [None]:
import time

sm_client = boto3.client("sagemaker", aws_region)

# Poll every 15 seconds for as long as the endpoint reports "Creating",
# printing a dot per poll as a progress indicator.
describe_embedding_endpoint_response = sm_client.describe_endpoint(EndpointName=embedding_endpoint_name)
while describe_embedding_endpoint_response["EndpointStatus"] == 'Creating':
    time.sleep(15)
    print('.', end='')
    describe_embedding_endpoint_response = sm_client.describe_endpoint(EndpointName=embedding_endpoint_name)

# Leaving the loop only means the status is no longer "Creating". A failed
# creation must not be reported as success, so surface it explicitly.
if describe_embedding_endpoint_response["EndpointStatus"] == "Failed":
    raise RuntimeError(
        f"Embedding endpoint {embedding_endpoint_name} failed to create: "
        f"{describe_embedding_endpoint_response.get('FailureReason', 'no failure reason reported')}"
    )
print('enmbedding endpoint created')

### Get endpoint for content generation

In [None]:
# Uncomment and edit the next line to point the rest of the notebook at an
# already-deployed LLM endpoint instead of the one created above.
#llm_endpoint_name='RAG-LLM-huggingface-llm-falcon-7b-instr-2023-08-20-12-32-31-953'
print(llm_endpoint_name)

In [None]:
sm_client = boto3.client("sagemaker", aws_region)

# Poll every 15 seconds for as long as the endpoint reports "Creating",
# printing a dot per poll as a progress indicator.
describe_llm_endpoint_response = sm_client.describe_endpoint(EndpointName=llm_endpoint_name)
while describe_llm_endpoint_response["EndpointStatus"] == 'Creating':
    time.sleep(15)
    print('.', end='')
    describe_llm_endpoint_response = sm_client.describe_endpoint(EndpointName=llm_endpoint_name)

# Leaving the loop only means the status is no longer "Creating". A failed
# creation must not be reported as success, so surface it explicitly.
if describe_llm_endpoint_response["EndpointStatus"] == "Failed":
    raise RuntimeError(
        f"LLM endpoint {llm_endpoint_name} failed to create: "
        f"{describe_llm_endpoint_response.get('FailureReason', 'no failure reason reported')}"
    )
print('LLM endpoint created')

## Test embedding endpoint

In [None]:
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
from langchain.embeddings import SagemakerEndpointEmbeddings


class TestContentHandler(EmbeddingsContentHandler):
    """Content handler used to smoke-test the embedding endpoint.

    Requests are sent as JSON with the input under ``text_inputs``; the
    ``embedding`` field is read back out of the JSON response.
    """

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs={}) -> bytes:
        """Return the UTF-8 encoded JSON request body.

        ``model_kwargs`` entries are merged into the top level of the payload.
        """
        payload = {"text_inputs": prompt, **model_kwargs}
        return json.dumps(payload).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Decode the response, echo it, and return its ``embedding`` value.

        ``output`` is consumed through ``.read()``, so a file-like body is
        expected despite the ``bytes`` annotation.
        """
        raw_body = output.read()
        response_json = json.loads(raw_body.decode("utf-8"))
        print(response_json)
        vectors = response_json["embedding"]
        # A one-element result is re-wrapped in a fresh one-element list.
        return [vectors[0]] if len(vectors) == 1 else vectors


# Embeddings client wired to the deployed embedding endpoint through the
# test content handler.
test_content_handler = TestContentHandler()
test_embeddings = SagemakerEndpointEmbeddings(
    content_handler=test_content_handler,
    region_name=aws_region,
    endpoint_name=embedding_endpoint_name,
)

In [None]:
# Embed a single sample text and show the first five components of its vector.
print(test_embeddings.embed_documents(["Hello World"])[0][:5])

## Test LLM endpoint

In [None]:
def query_endpoint_with_json_payload(encoded_json, endpoint_name, content_type="application/json"):
    """Invoke a SageMaker endpoint with an already-encoded request body.

    Args:
        encoded_json: Request body, sent as-is.
        endpoint_name: Name of the endpoint to invoke.
        content_type: Value for the request's ContentType.

    Returns:
        The raw ``invoke_endpoint`` response.
    """
    # A new runtime client is created per call, using the default region.
    runtime = boto3.client("runtime.sagemaker")
    return runtime.invoke_endpoint(
        Body=encoded_json,
        ContentType=content_type,
        EndpointName=endpoint_name,
    )

# Parser for the inference model's response.
def parse_response_model(query_response):
    """Extract the generated texts from an endpoint response.

    Args:
        query_response: Mapping whose ``"Body"`` entry is a readable stream
            holding a JSON array of objects with a ``generated_text`` field.

    Returns:
        The ``generated_text`` values, in response order.
    """
    predictions = json.loads(query_response["Body"].read())
    texts = []
    for prediction in predictions:
        texts.append(prediction["generated_text"])
    return texts


In [None]:
# Smoke test for the LLM endpoint: send one question and print the first
# generated text.
question = "Which instances can I use with Managed Spot Training in Amazon SageMaker?"

# Request body: the prompt goes under "inputs", generation settings under
# "parameters".
# NOTE(review): do_sample is False while top_k/top_p/temperature are also
# set -- confirm whether the container honours the sampling settings here.
payload = {
    "inputs": question,
    "parameters":{
        "max_new_tokens": 100,
        "num_return_sequences": 1,
        "top_k": 50,
        "top_p": 0.95,
        "do_sample": False,
        "return_full_text": True,
        "temperature": 0.2
    }
}

# The payload is JSON-serialised and UTF-8 encoded before being sent.
query_response = query_endpoint_with_json_payload(
    json.dumps(payload).encode("utf-8"), endpoint_name=llm_endpoint_name
)

generated_texts = parse_response_model(query_response)

print(f"For model: {llm_endpoint_name}, \n\nThe generated output is: {generated_texts[0]}\n")

## Ingest documents

### Install dependency package

In [None]:
!pip install PyPDF2

### Define LangChain embedding with SageMaker endpoint

In [None]:
from typing import List
import json
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler

class BulkSagemakerEndpointEmbeddings(SagemakerEndpointEmbeddings):
    """SageMaker endpoint embeddings that send several texts per request."""

    def embed_documents(
        self, texts: List[str], chunk_size: int = 5
    ) -> List[List[float]]:
        """Compute doc embeddings using a SageMaker Inference Endpoint.

        Args:
            texts: The list of texts to embed.
            chunk_size: Maximum number of texts grouped into a single
                endpoint request. Must be a positive integer.

        Returns:
            The concatenated results of every request, in input order
            (one embedding per text, assuming the endpoint returns one per
            input). An empty list when ``texts`` is empty.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
        # Nothing to embed: return early instead of building a zero-step range,
        # which would raise ValueError.
        if not texts:
            return []

        results = []
        # A step larger than len(texts) simply yields one request with all texts.
        for start in range(0, len(texts), chunk_size):
            results.extend(self._embedding_func(texts[start:start + chunk_size]))
        return results
        
class ContentHandler(EmbeddingsContentHandler):
    """JSON request/response adapter for the embedding endpoint.

    Requests carry the texts under ``text_inputs``; responses are expected to
    be JSON objects with an ``embedding`` field.
    """

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: List[str], model_kwargs=None) -> bytes:
        """Serialise a batch of texts into the endpoint's request body.

        Args:
            prompt: The texts to embed (the embeddings class passes a list).
            model_kwargs: Optional extra fields merged into the top level of
                the JSON payload.

        Returns:
            The UTF-8 encoded JSON payload.
        """
        # None default instead of a shared mutable ``{}``.
        extra_fields = model_kwargs or {}
        input_str = json.dumps({"text_inputs": prompt, **extra_fields})
        return input_str.encode("utf-8")

    def transform_output(self, output) -> List[List[float]]:
        """Extract the embeddings from the endpoint response.

        Args:
            output: Readable response body (consumed via ``.read()``).

        Returns:
            The value of the response's ``embedding`` field; presumably one
            vector per input text -- confirm against the model's response schema.
        """
        response_json = json.loads(output.read().decode("utf-8"))
        return response_json["embedding"]
        
# Batched embeddings client bound to the deployed embedding endpoint.
embeddings = BulkSagemakerEndpointEmbeddings(
    content_handler=ContentHandler(),
    region_name=aws_region,
    endpoint_name=embedding_endpoint_name,
)


### Convert PDF content into vectors and store them in OpenSearch

In [None]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import OpenSearchVectorSearch

#pdf_reader = PdfReader("opensearch-service-dg.pdf")

pdf_reader = PdfReader("c5-Fivetran.pdf")

# Concatenate the extracted text of every page into a single string.
text = "".join(page.extract_text() for page in pdf_reader.pages)

# Split on newlines with chunk_size=500 and chunk_overlap=100, both measured
# with len().
text_splitter = CharacterTextSplitter(
    length_function=len,
    chunk_overlap=100,
    chunk_size=500,
    separator="\n",
)
chunks = text_splitter.split_text(text)
print(len(chunks))

import os
from getpass import getpass

# OpenSearch connection settings. The admin password is no longer stored in
# the notebook: it is read from OPENSEARCH_ADMIN_PASSWORD or, when that is
# unset or empty, prompted for interactively. User name and domain endpoint
# can be overridden through the environment and otherwise keep their
# previous values.
DOMAIN_ADMIN_UNAME = os.environ.get("OPENSEARCH_ADMIN_USERNAME", 'admin')
DOMAIN_ADMIN_PW = os.environ.get("OPENSEARCH_ADMIN_PASSWORD") or getpass("OpenSearch admin password: ")
DOMAIN_ENDPOINT = os.environ.get(
    "OPENSEARCH_DOMAIN_ENDPOINT",
    'search-opensearchservi-wl9zlhduvblq-t44uhffaksglcvhreaxgtbjufe.us-east-1.es.amazonaws.com',
)
os_domain_ep = 'https://'+DOMAIN_ENDPOINT

embedding_index_name = 'embed_test_vector_opensearch7'

# Embed every chunk through the SageMaker embedding endpoint and index the
# results into the OpenSearch index named above.
docsearch = OpenSearchVectorSearch.from_texts(
    index_name=embedding_index_name,
    texts=chunks,
    embedding=embeddings,
    opensearch_url=os_domain_ep,
    http_auth=(DOMAIN_ADMIN_UNAME, DOMAIN_ADMIN_PW),
)

print("docs inserted into opensearch")

### Test OpenSearch vector search

In [None]:
# Vector store handle over the index populated above, using the same
# embeddings client and credentials.
open_search_vector_store = OpenSearchVectorSearch(
    http_auth=(DOMAIN_ADMIN_UNAME, DOMAIN_ADMIN_PW),
    opensearch_url=os_domain_ep,
    embedding_function=embeddings,
    index_name=embedding_index_name,
)

# Run one similarity query and show the content of the first hit.
docs_ = open_search_vector_store.similarity_search("Data Warehousing costs")
print("opensearch results:" + docs_[0].page_content)

## Retrieval Augmented Generation 

In [None]:
from uuid import uuid4
from typing import Dict
from langchain.memory import ConversationBufferMemory
from langchain.memory import DynamoDBChatMessageHistory
from langchain.memory import ConversationBufferWindowMemory
from langchain import PromptTemplate, SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from langchain.chains import RetrievalQA


# Prompt layout: instruction, retrieved context, then the question followed by
# an open "Answer:" for the model to complete.
prompt_template = (
    "Use the following pieces of context to answer the question at the end.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Answer:"
)

# Template object exposing the two placeholders used in prompt_template.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)


class ContentHandler(LLMContentHandler):
    """JSON request/response adapter for the content generation endpoint."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        """Serialise the prompt into the endpoint's request body.

        Args:
            prompt: Text sent under the ``inputs`` key.
            model_kwargs: Extra fields merged into the top level of the payload.

        Returns:
            The UTF-8 encoded JSON payload.
        """
        # NOTE(review): the direct endpoint test in this notebook nests
        # generation settings under "parameters"; here they are sent at the
        # top level -- confirm the container honours them in this position.
        input_str = json.dumps({"inputs": prompt, **model_kwargs})
        return input_str.encode("utf-8")

    def transform_output(self, output) -> str:
        """Return the generated text from the endpoint response.

        The body stream can only be consumed once, so it is read a single
        time and reused for both the debug print and the parse. (Reading it
        twice made the second read return an empty string.)

        Args:
            output: Readable response body holding a JSON array of objects
                with a ``generated_text`` field, the same shape that
                ``parse_response_model`` handles.

        Returns:
            The ``generated_text`` of the first element.
        """
        raw_body = output.read().decode("utf-8")
        print(raw_body)
        return json.loads(raw_body)[0]["generated_text"]


content_handler = ContentHandler()

# LLM wrapper around the deployed content generation endpoint. The region is
# taken from `aws_region`, the same value used to poll this endpoint and to
# reach the embedding endpoint, instead of a hard-coded region name.
llm = SagemakerEndpoint(
    endpoint_name=llm_endpoint_name,
    region_name=aws_region,
    model_kwargs={"temperature": 1e-10},
    content_handler=content_handler,
)

# messages_to_dict is needed by the history-copy branch below. It was never
# imported, so that branch always failed with a NameError which the broad
# `except` then swallowed.
from langchain.schema import messages_to_dict

session_id = str(uuid4())
chat_memory = DynamoDBChatMessageHistory(
    table_name="conversation-history-store",
    session_id=session_id,
)

messages = chat_memory.messages

# Maintains immutable sessions: if the session already holds messages, create
# a new session under a freshly generated session_id and copy the messages
# into it.
if messages:
    session_id = str(uuid4())
    chat_memory = DynamoDBChatMessageHistory(
        table_name="conversation-history-store",
        session_id=session_id,
    )
    # This is a workaround at the moment. Ideally, this should
    # be added to the DynamoDBChatMessageHistory class.
    try:
        messages = messages_to_dict(messages)
        chat_memory.table.put_item(
            Item={"SessionId": session_id, "History": messages}
        )
    except Exception as e:
        # Best effort: the conversation continues with an empty history, but
        # the failure is reported with enough context to be noticed.
        print(f"Could not copy previous history into session {session_id}: {e}")

memory = ConversationBufferMemory(chat_memory=chat_memory, return_messages=True)


# Retrieval QA chain: documents come from the OpenSearch vector store, the
# answer from the SageMaker-hosted LLM, with the DynamoDB-backed memory attached.
qa = RetrievalQA.from_chain_type(
    memory=memory,
    retriever=open_search_vector_store.as_retriever(),
    chain_type="stuff",
    llm=llm,
)





In [None]:
# Run the retrieval QA chain on a sample query and keep the result.
response = qa.run("Data Warehousing costs")

In [None]:
from langchain.chains.question_answering import load_qa_chain

# QA chain driven by the custom prompt template defined earlier.
chain = load_qa_chain(
    prompt=prompt,
    llm=llm,
)

# Answer the sample question over the documents returned by the earlier
# similarity search; only the chain outputs are returned.
chain(
    {"question": "Data Warehousing costs", "input_documents": docs_},
    return_only_outputs=True,
)