# Large Language Model

Anna Hathaway

## Chunking the Text

Many use cases, such as building a chatbot, require text (text2text) generation models like BloomZ 7B1, Flan T5 XXL, and Flan T5 UL2 to respond to user questions with insightful answers. These models have picked up a lot of general knowledge in training, but we also need to ingest and use a large library of Principal-specific information.

This notebook demonstrates how to use Flan T5 XL to answer questions about a text file on Principal health care by using document embeddings and retrieval. The embeddings are generated by the GPT-J-6B embedding model.

In [None]:
'''This notebook requires python 3.8 or better, and will not run on the base kernel,
there are important differences in the versions and langchain was built on 3.8 or greater syntax'''
#Package Installs
!pip install langchain
!pip install --upgrade sagemaker --quiet
!pip install ipywidgets==7.0.0 --quiet
!pip install langchain==0.0.148 --quiet
!pip install faiss-cpu --quiet
!pip install tiktoken 
#imports
import time
import sagemaker, boto3, json
from sagemaker.session import Session
from sagemaker.model import Model
from sagemaker import image_uris, model_uris, script_uris, hyperparameters
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base
from typing import Any, Dict, List, Optional
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
from langchain.llms.sagemaker_endpoint import LLMContentHandler, SagemakerEndpoint
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
from langchain.llms.sagemaker_endpoint import LLMContentHandler, SagemakerEndpoint
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import Chroma, AtlasDB, FAISS
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders.csv_loader import CSVLoader
import tiktoken 
# Define the SageMaker session, role, and region used by the cells below.
sagemaker_session = Session()
# ARN of the identity running this notebook; passed as `role=` when the models are created.
aws_role = sagemaker_session.get_caller_identity_arn()
# Region of the default boto3 session; passed as `region_name=` to the LangChain endpoint wrappers.
aws_region = boto3.Session().region_name
# NOTE(review): second session object; no other visible cell reads `sess` — confirm it is still needed.
sess = sagemaker.Session()
# Version selector passed to image_uris.retrieve / model_uris.retrieve.
# "*" presumably selects the latest available JumpStart version — TODO confirm.
model_version = "*"

In [None]:
# merge multiple text files definition
def merge_text_files(file_paths):
    """Concatenate the contents of several text files into one string.

    Args:
        file_paths: Iterable of paths to text files, read in the given order.

    Returns:
        The file contents joined back to back, with no separator inserted
        between files. An empty iterable yields an empty string.
    """
    contents = []
    for file_path in file_paths:
        # Opened with the platform default text encoding.
        with open(file_path, 'r') as file:
            contents.append(file.read())
    # Join once at the end instead of growing a string with += in the loop.
    return "".join(contents)

In [None]:
# Merge every listed source file into a single string (only one file is listed here).
file_paths=['final_health.txt']
merged_text=merge_text_files(file_paths)
# Bare expression as the last line of the cell so the notebook displays the merged text.
merged_text

## Using a Custom Dataset with Open-sourced LangChain Library

reference website: https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_amazon_algorithms/jumpstart-foundation-models/question_answering_retrieval_augmented_generation/question_answering_langchain_jumpstart.ipynb

### Deploy large language model (LLM) and embedding model in SageMaker JumpStart

In [None]:
def query_endpoint_with_txt_payload(encoded_md, endpoint_name, content_type="text"):
    """Send an already-encoded payload to a SageMaker endpoint.

    Args:
        encoded_md: Request body, already encoded by the caller.
        endpoint_name: Name of the deployed SageMaker endpoint to invoke.
        content_type: Value sent as the request ``ContentType``.

    Returns:
        The raw ``invoke_endpoint`` response; its ``"Body"`` entry is a
        stream that the caller must read.
    """
    client = boto3.client("runtime.sagemaker")
    return client.invoke_endpoint(
        EndpointName=endpoint_name, ContentType=content_type, Body=encoded_md
    )


def query_endpoint_with_json_payload(encoded_json, endpoint_name, content_type="application/json"):
    """Send a UTF-8 encoded JSON payload to a SageMaker endpoint.

    The query cells of this notebook build their request bodies with
    ``json.dumps(...).encode("utf-8")`` and call this name, so the content
    type defaults to JSON.

    Args:
        encoded_json: JSON request body, already encoded by the caller.
        endpoint_name: Name of the deployed SageMaker endpoint to invoke.
        content_type: Value sent as the request ``ContentType``.

    Returns:
        The raw ``invoke_endpoint`` response.
    """
    return query_endpoint_with_txt_payload(
        encoded_json, endpoint_name, content_type=content_type
    )

def parse_response_model_flan_t5(query_response):
    """Extract the generated texts from a Flan-T5 endpoint response.

    Args:
        query_response: Mapping whose ``"Body"`` entry is a readable stream
            containing a JSON document.

    Returns:
        The value stored under ``"generated_texts"`` in the decoded JSON.

    Raises:
        KeyError: If the decoded JSON has no ``"generated_texts"`` entry.
    """
    # The body is JSON, so it must be decoded with the json module.
    model_predictions = json.loads(query_response["Body"].read())
    return model_predictions["generated_texts"]

In [None]:
# Configuration for the two models deployed in a row by the next cell, each
# with a slightly different setup: the Flan-T5 XL text2text model and the
# GPT-J 6B embedding model. Uncomment the extra entries below to deploy more
# LLMs and compare their performance.
_MODEL_CONFIG_ = {}

# Large language model: has a response parser and a prompt template.
_MODEL_CONFIG_["huggingface-text2text-flan-t5-xl"] = {
    "instance type": "ml.g4dn.4xlarge",
    "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "1"},
    "parse_function": parse_response_model_flan_t5,
    "prompt": """Answer based on context:\n\n{context}\n\n{question}""",
}

# Embedding model: no parser or prompt, it is only used through LangChain.
_MODEL_CONFIG_["huggingface-textembedding-gpt-j-6b"] = {
    "instance type": "ml.g4dn.12xlarge",
    "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "1"},
}

# Optional extra models (disabled).
# NOTE: parse_response_multiple_texts_bloomz is not defined in this notebook.
# _MODEL_CONFIG_["huggingface-textgeneration1-bloomz-7b1-fp16"] = {
#     "instance type": "ml.g5.12xlarge",
#     "env": {},
#     "parse_function": parse_response_multiple_texts_bloomz,
#     "prompt": """question: \"{question}"\\n\nContext: \"{context}"\\n\nAnswer:""",
# }
# _MODEL_CONFIG_["huggingface-text2text-flan-ul2-bf16"] = {
#     "instance type": "ml.g5.24xlarge",
#     "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "1"},
#     "parse_function": parse_response_model_flan_t5,
#     "prompt": """Answer based on context:\n\n{context}\n\n{question}""",
# }

In [None]:
# Newline plus ANSI escape codes used to print status messages in bold.
newline, bold, unbold = "\n", "\033[1m", "\033[0m"

# Deploy one SageMaker endpoint per entry in _MODEL_CONFIG_ and record each
# endpoint name back into the config so later cells can look it up.
for model_id in _MODEL_CONFIG_:
    # Endpoint name derived from the model id (name_from_base presumably appends a unique suffix — confirm).
    endpoint_name = name_from_base(f"anna-{model_id}")
    inference_instance_type = _MODEL_CONFIG_[model_id]["instance type"]

    # Retrieve the inference container uri for this model id, version and instance type.
    deploy_image_uri = image_uris.retrieve(
        region=None,
        framework=None,  # automatically inferred from model_id
        image_scope="inference",
        model_id=model_id,
        model_version=model_version,
        instance_type=inference_instance_type,
    )
    # Retrieve the model uri.
    model_uri = model_uris.retrieve(
        model_id=model_id, model_version=model_version, model_scope="inference"
    )
    # Create the model from the container image, the model artifact and the per-model env settings.
    model_inference = Model(
        image_uri=deploy_image_uri,
        model_data=model_uri,
        role=aws_role,
        predictor_cls=Predictor,
        name=endpoint_name,
        env=_MODEL_CONFIG_[model_id]["env"],
    )
    # Deploy the model to an endpoint with a single instance of the configured type.
    model_predictor_inference = model_inference.deploy(
        initial_instance_count=1,
        instance_type=inference_instance_type,
        predictor_cls=Predictor,
        endpoint_name=endpoint_name,
    )
    print(f"{bold}Model {model_id} has been deployed successfully.{unbold}{newline}")
    # Later cells read this entry to find the endpoint for each model id.
    _MODEL_CONFIG_[model_id]["endpoint_name"] = endpoint_name

In [None]:
# This section configures the payload; it is one of the few areas where a
# data scientist can make adjustments to the model's generation behavior.

# The question has to exist before the payload is built. It asks about Managed
# Spot Training, the topic of the context string used in the prompt-engineering
# cell that follows.
question = "Which instances can I use with Managed Spot Training in SageMaker?"

payload = {
    "text_inputs": question,
    "max_length": 100,
    "num_return_sequences": 1,
    "top_k": 50,
    "top_p": 0.95,
    "do_sample": True,
}

# Every configured model except the embedding model is queried as an LLM.
list_of_LLMs = list(_MODEL_CONFIG_.keys())
list_of_LLMs.remove("huggingface-textembedding-gpt-j-6b")  # remove the embedding model


# Ask each LLM the bare question (no context) and print its first generated text.
for model_id in list_of_LLMs:
    endpoint_name = _MODEL_CONFIG_[model_id]["endpoint_name"]
    query_response = query_endpoint_with_json_payload(
        json.dumps(payload).encode("utf-8"), endpoint_name=endpoint_name
    )
    generated_texts = _MODEL_CONFIG_[model_id]["parse_function"](query_response)
    print(f"For model: {model_id}, the generated output is: {generated_texts[0]}\n")

In [None]:
# Improve the answer to the same question using prompt engineering with insightful context.
# The context is substituted into each model's prompt template in the next cell.
context = """Managed Spot Training can be used with all instances supported in Amazon SageMaker. Managed Spot Training is supported in all AWS Regions where Amazon is supported."""

In [None]:
# Generation settings merged into every prompt-engineered request below.
parameters = dict(
    max_length=200,
    num_return_sequences=1,
    top_k=250,
    top_p=0.95,
    do_sample=False,
    temperature=1,
)

# Ask the question again, this time filling each model's prompt template with
# the context, and print the first generated text in bold.
for model_id in list_of_LLMs:
    model_config = _MODEL_CONFIG_[model_id]
    endpoint_name = model_config["endpoint_name"]
    prompt = model_config["prompt"]

    # Plain string replacement fills the context placeholder first, then the question.
    text_input = prompt.replace("{context}", context).replace("{question}", question)
    payload = {"text_inputs": text_input, **parameters}

    request_body = json.dumps(payload).encode("utf-8")
    query_response = query_endpoint_with_json_payload(
        request_body, endpoint_name=endpoint_name
    )
    generated_texts = model_config["parse_function"](query_response)
    print(
        f"{bold}For model: {model_id}, the generated output is: {generated_texts[0]}{unbold}{newline}"
    )

### Use RAG based approach with LangChain and SageMaker endpoints to build a simplified question and answering application.

Steps:
1. Generate embeddings for each document in the knowledge library with the SageMaker GPT-J-6B embedding model.

2. Identify top K most relevant documents based on user query.

    2.1 For a query of your interest, generate the embedding of the query using the same embedding model.
    
    2.2 Search the indexes of top K most relevant documents in the embedding space using in-memory Faiss search.
    
    2.3 Use the indexes to retrieve the corresponding documents.
    
3. Combine the retrieved documents with prompt and question and send them into SageMaker LLM.

In [None]:
# Wrap up our SageMaker endpoints for embedding model into langchain.embeddings.SagemakerEndpointEmbeddings
class SagemakerEndpointEmbeddingsJumpStart(SagemakerEndpointEmbeddings):
    """SageMaker endpoint embeddings that send documents to the endpoint in batches."""

    def embed_documents(self, texts: List[str], chunk_size: int = 5) -> List[List[float]]:
        """Compute doc embeddings using a SageMaker Inference Endpoint.

        Args:
            texts: The list of texts to embed.
            chunk_size: How many input texts are grouped together in one
                request. A value larger than ``len(texts)`` is capped so the
                whole list goes out in a single request.

        Returns:
            List of embeddings, one for each text. An empty ``texts`` list
            returns an empty list without calling the endpoint.
        """
        if not texts:
            # Nothing to embed; this also avoids a zero step in range() below.
            return []

        results = []
        _chunk_size = min(chunk_size, len(texts))
        for i in range(0, len(texts), _chunk_size):
            response = self._embedding_func(texts[i : i + _chunk_size])
            results.extend(response)
        return results


class ContentHandler(EmbeddingsContentHandler):
    """Convert embedding requests to JSON and pull the embeddings out of the JSON response."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: List[str], model_kwargs: Optional[Dict[str, Any]] = None) -> bytes:
        """Build the UTF-8 encoded JSON request body.

        Args:
            prompt: Input placed under ``"text_inputs"``; presumably the batch
                of texts handed over by ``embed_documents`` — confirm.
            model_kwargs: Extra top-level fields merged into the request.
                ``None`` is treated as no extra fields.

        Returns:
            The JSON document encoded as UTF-8 bytes.
        """
        input_str = json.dumps({"text_inputs": prompt, **(model_kwargs or {})})
        return input_str.encode("utf-8")

    def transform_output(self, output: Any) -> List[List[float]]:
        """Decode the endpoint response and return its ``"embedding"`` entry.

        Args:
            output: Readable stream (``.read()`` is called on it) holding a
                UTF-8 encoded JSON document.

        Returns:
            The value stored under ``"embedding"`` in the decoded JSON.
        """
        response_json = json.loads(output.read().decode("utf-8"))
        return response_json["embedding"]


# Handler instance that serializes requests for the embedding endpoint.
content_handler = ContentHandler()

# LangChain embeddings object backed by the SageMaker embedding endpoint.
# NOTE(review): the endpoint name is hard-coded, timestamp suffix included, instead of being
# read from _MODEL_CONFIG_ — confirm it matches an endpoint that is currently deployed.
embeddings = SagemakerEndpointEmbeddingsJumpStart(
    endpoint_name="anna-huggingface-textembedding-gpt-j-6b-2023-06-15-14-59-46-042",
    region_name=aws_region,
    content_handler=content_handler)

In [None]:
# Next, we wrap up our SageMaker endpoints for LLM into langchain.llms.sagemaker_endpoint.SagemakerEndpoint.

# Generation settings passed to the LLM wrapper as its model kwargs.
parameters = dict(
    max_length=100,
    num_return_sequences=1,
    top_k=250,
    top_p=0.95,
    do_sample=False,
    temperature=1,
)


# NOTE: this class reuses the name ContentHandler, so from here on the name
# refers to the LLM handler rather than the embedding handler defined earlier.
class ContentHandler(LLMContentHandler):
    """Convert LLM prompts to JSON and pull the first generated text out of the JSON response."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: Optional[Dict[str, Any]] = None) -> bytes:
        """Build the UTF-8 encoded JSON request body.

        Args:
            prompt: Prompt text placed under ``"text_inputs"``.
            model_kwargs: Extra top-level fields merged into the request.
                ``None`` is treated as no extra fields.

        Returns:
            The JSON document encoded as UTF-8 bytes.
        """
        input_str = json.dumps({"text_inputs": prompt, **(model_kwargs or {})})
        return input_str.encode("utf-8")

    def transform_output(self, output: Any) -> str:
        """Decode the endpoint response and return the first generated text.

        Args:
            output: Readable stream (``.read()`` is called on it) holding a
                UTF-8 encoded JSON document.

        Returns:
            The first element of the ``"generated_texts"`` entry.
        """
        response_json = json.loads(output.read().decode("utf-8"))
        return response_json["generated_texts"][0]


# Rebinds content_handler to an instance of the LLM handler class defined just above.
content_handler = ContentHandler()

# LangChain LLM object backed by the SageMaker Flan-T5 XL endpoint; `parameters` is sent as model kwargs.
# NOTE(review): the endpoint name is hard-coded, timestamp suffix included — confirm it matches
# an endpoint that is currently deployed.
sm_llm = SagemakerEndpoint(
    endpoint_name="anna-huggingface-text2text-flan-t5-xl-2023-06-15-14-11-52-712",
    region_name=aws_region,
    model_kwargs=parameters,
    content_handler=content_handler,
)

## Create the index that the LLM accesses for responses
### This is the section that gets into prompt engineering, the current iteration only accepts questions
#### Querying the LLM without this index will create poor responses

In [None]:
# Loader for the health care text file that serves as the knowledge library.
loader = TextLoader(file_path="final_health.txt")

In [None]:
# Token-based length function for the text splitter. The "gpt2" encoding is
# used because GPT-J, the embedding model, uses a GPT-2 style BPE vocabulary,
# so counts should approximate what the embedding endpoint sees.
# NOTE(review): confirm this is the intended encoding.
_tiktoken_encoding = tiktoken.get_encoding("gpt2")


def tiktoken_len(text):
    """Return the number of tokens in ``text`` under the encoding chosen above."""
    # disallowed_special=() makes special-token strings in the document count
    # as ordinary text instead of raising.
    return len(_tiktoken_encoding.encode(text, disallowed_special=()))


# Load the file (load_and_split applies the loader's default splitting first),
# then re-split into chunks of at most 300 tokens with a 20-token overlap,
# breaking only at newlines.
documents = loader.load_and_split()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n"]
)
texts = text_splitter.split_documents(documents)

In [None]:
# Build the index creator in a few lines: documents are chunked by a
# token-based splitter, embedded through the SageMaker embedding endpoint,
# and stored in an in-memory FAISS vector store.
index_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n"],
)
index_creator = VectorstoreIndexCreator(
    vectorstore_cls=FAISS,
    embedding=embeddings,
    text_splitter=index_text_splitter,
)

In [None]:
# Build the vector store index from the text file loader.
index = index_creator.from_loaders([loader])

In [None]:
# Question asked against the health care index in the cells below.
question = "What is an HSA?"

In [None]:
# Ask the question through the index using the SageMaker LLM; as the last
# expression of the cell, the result is displayed by the notebook.
index.query(question=question, llm=sm_llm)

### Step 5. Customize the QA application above with a different prompt.

In [None]:
# Build a FAISS vector store from `documents` using the SageMaker embedding endpoint.
# NOTE(review): this indexes `documents`, not the 300-token `texts` chunks produced
# by the splitting cell — confirm which one is intended.
docsearch = FAISS.from_documents(documents, embeddings)

In [None]:
# Retrieve the 3 stored documents most similar to the question.
docs = docsearch.similarity_search(question, k=3)

In [None]:
# Finally, we combine the retrieved documents with the prompt and the question and send them to the SageMaker LLM.
# The customized prompt below exposes two placeholders: {context} and {question}.
prompt_template = """Answer based on context:\n\n{context}\n\n{question}"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [None]:
# Question-answering chain that uses the SageMaker LLM with the customized prompt.
chain = load_qa_chain(llm=sm_llm, prompt=PROMPT)

In [None]:
# Send the top 3 most relevant documents and the question to the LLM to get an answer.
# Only the chain's "output_text" entry is kept.
result = chain({"input_documents": docs, "question": question}, return_only_outputs=True)[
    "output_text"
]

In [None]:
# Display the final answer from the LLM.
# NOTE(review): check the answer against the source document before relying on it.
result