In [None]:
%pip install langchain==0.0.245 unstructured tiktoken pypdf sagemaker --quiet
%pip install dependencies/botocore-1.31.21-py3-none-any.whl dependencies/boto3-1.28.21-py3-none-any.whl dependencies/awscli-1.29.21-py3-none-any.whl --force-reinstall --quiet
%pip install transformers==4.24.0 --quiet

<mark>Restart the kernel after the install to make use of the updated packages</mark>

The purpose of this notebook is to collect the following pieces of information from user guides for Samsung smartphones.  You can review the user manuals that will be used for this exercise in the manuals/ folder.

model_names: The model names covered by the manual.\
key_features: A summary of key features for the devices covered in the manual\
company_address: The address listed for Samsung\
document_summary: A summary of the document\
file: The file name for the manual\
stylus: Whether or not the devices use an "S Pen" stylus

Once this information is collected, it will be loaded into the database and queried in the remaining notebooks.

In [None]:
import langchain
langchain.__version__

In [None]:
# python libraries
import ast
import boto3
from datetime import datetime
import json
import os
import re
import time

# sagemaker libraries
from sagemaker.jumpstart.model import JumpStartModel

# langchain libraries
from langchain import PromptTemplate, LLMChain
from langchain.chains import SequentialChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain.llms.sagemaker_endpoint import  SagemakerEndpoint, LLMContentHandler
from langchain.text_splitter import RecursiveCharacterTextSplitter

# bedrock libraries
from bedrock_utils import bedrock, print_ww
from langchain.llms.bedrock import Bedrock

# langsmith libraries
from langsmith import Client

ls_client = Client()
sm_client = boto3.client('sagemaker')

In [None]:
# Resolve the region of the default boto3 session and export it as
# AWS_DEFAULT_REGION for the rest of the notebook.
boto_session = boto3.session.Session()
region = boto_session.region_name

# `region_name` is None when no default region is configured. Assigning None to
# os.environ raises an opaque TypeError, so stop here with an actionable message.
if region is None:
    raise RuntimeError(
        "No default AWS region is configured. Set AWS_DEFAULT_REGION (or run "
        "`aws configure`) before executing this notebook."
    )

os.environ['AWS_DEFAULT_REGION'] = region

<mark>Optional but recommended: provide your LangSmith API key</mark>

In [None]:
# Read the key from the LANGCHAIN_API_KEY environment variable rather than
# pasting a secret into the notebook, where it would be saved with the file.
# Falls back to '' (no key) when the variable is not set.
langsmith_api_key = os.environ.get('LANGCHAIN_API_KEY', '')

In [None]:
# langsmith env vars
today = datetime.now().strftime("%Y%m%d")

os.environ["LANGCHAIN_PROJECT"] = f"Manuals Extraction - {today}"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

# The API key is optional: only switch tracing on when a key is available,
# either from the cell above or already present in the environment.
_resolved_langsmith_key = langsmith_api_key or os.environ.get("LANGCHAIN_API_KEY", "")
if _resolved_langsmith_key:
    os.environ["LANGCHAIN_API_KEY"] = _resolved_langsmith_key
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
else:
    # Remove the flag so a re-run without a key also turns tracing back off.
    os.environ.pop("LANGCHAIN_TRACING_V2", None)
    print("No LangSmith API key provided - tracing is disabled.")

In [None]:
boto3_bedrock = bedrock.get_bedrock_client(region=region,url_override=f'https://prod.{region}.frontend.bedrock.aws.dev')

<h1>Deploy SageMaker Endpoint</h1>

In [None]:
# JumpStart model to deploy and the instance that hosts it
model_id = 'huggingface-llm-falcon-40b-instruct-bf16'
instance_type = 'ml.g5.48xlarge'
num_gpus = 8

# Container settings as plain Python values; they are JSON-encoded into
# strings below because they are passed to the model as environment variables.
_container_settings = {
    # Number of GPU used per replica
    'SM_NUM_GPUS': num_gpus,
    'MAX_INPUT_LENGTH': 10012,
    'MAX_BATCH_PREFILL_TOKENS': 10024,
    'MAX_BATCH_TOTAL_TOKENS': 10024,
    # Max length of the generation (including input text)
    'MAX_TOTAL_TOKENS': 10024,
    # limit OOM errors https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_amazon_algorithms/jumpstart-foundation-models/text-generation-falcon.ipynb
    'MAX_CONCURRENT_REQUESTS': 1,
}

config = {
    setting: json.dumps(value)
    for setting, value in _container_settings.items()
}

In [None]:
my_model = JumpStartModel(model_id=model_id, env=config, instance_type=instance_type) 
predictor = my_model.deploy()

llm_falcon_endpoint = predictor.endpoint_name
llm_falcon_endpoint

<h1>Define Langchain LLMs</h1>

In [None]:
# Titan LLM
llm_titan = Bedrock(model_id="amazon.titan-tg1-large", 
              model_kwargs ={
                   "maxTokenCount": 4096,
                   "stopSequences": [],
                   "temperature":0,
                   "topP":1
                },
              client=boto3_bedrock)

In [None]:
# Falcon LLM
falcon_parameters = {
        "do_sample": True,
        "top_p": 0.9,
        "temperature": 0.01, 
        "max_new_tokens": 2000,
        "stop": ["<|endoftext|>", "</s>"]
    }

class FalconContentHandler(LLMContentHandler):
    """Serialize prompts for, and parse responses from, the Falcon endpoint.

    Both the request and the response bodies are JSON, as declared by
    ``content_type`` and ``accepts``.
    """
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs=None) -> bytes:
        """Build the UTF-8 encoded JSON request body for a prompt.

        Args:
            prompt: Text sent as the ``inputs`` field of the payload.
            model_kwargs: Optional generation parameters. They are merged over
                the defaults in ``falcon_parameters``, key by key.

        Returns:
            The JSON payload encoded as UTF-8 bytes.
        """
        # Merge overrides into "parameters" instead of spreading them at the top
        # level of the payload, where they only duplicated the defaults and a
        # per-call override never reached the generation parameters.
        parameters = {**falcon_parameters, **(model_kwargs or {})}
        input_str = json.dumps({"inputs": prompt, "parameters": parameters})
        return input_str.encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Return the generated text from the endpoint response.

        ``output`` is consumed with ``.read()``, decoded as UTF-8 JSON, and the
        ``generated_text`` field of the first element is returned.
        """
        response_json = json.loads(output.read().decode("utf-8"))
        return response_json[0]["generated_text"]

llm_falcon = SagemakerEndpoint(
    endpoint_name=llm_falcon_endpoint,
    region_name=region,
    model_kwargs=falcon_parameters,
    content_handler=FalconContentHandler(),
)

In [None]:
llm_titan("What day comes after Tuesday?")

In [None]:
llm_falcon("What day comes after Tuesday?")

<h1>Define "refine" QA Chains</h1>

QA Chains of type "refine" are themselves made up of two distinct chains: an "initial" chain and a "refine" chain.  

You can think of an LLMChain as consisting of two components:
- a language model
- a prompt or prompts

Each chain has an input_key (variable name for the text going into the prompt) and an output_key (variable name for the text coming out of the language model).

The general idea is that we want to divide our underlying document into chunks.  The first chunk of the document will be passed into the "initial" chain to request an initial answer.  All remaining chunks will be passed to the "refine" chain along with the current response.  This allows the language model to refine the answer as it sees more information.

![refine QA chain](images/refine-chain.drawio.png "Refine QA Chain")

![refine QA chain](images/refine-qa-process.drawio.png "Refine QA Process")

In [None]:
# we can start with a base QA chain of type "refine" to review its components
base_qa_chain = load_qa_chain(llm=llm_titan, chain_type='refine')

In [None]:
# let's review the "initial" chain's language model
base_qa_chain.initial_llm_chain.llm

In [None]:
# and the "initial" chain's default prompt
print(base_qa_chain.initial_llm_chain.prompt.template)

In [None]:
# now the "refine" chain's language model
base_qa_chain.refine_llm_chain.llm

In [None]:
# and the "refine" chain's default prompt
print(base_qa_chain.refine_llm_chain.prompt.template)

<h2>Define initial QA chains</h2>

We will define distinct refine QA chains to collect:
- models covered in each user manual
- key features of the models discussed in each manual
- the company address listed in the manuals

After we define these QA chains, we can optionally customize the individual chains (initial and refine) that make up its components

In [None]:
# qa chains
model_chain = load_qa_chain(llm=llm_titan, chain_type='refine', verbose=False)
features_chain = load_qa_chain(llm=llm_titan, chain_type='refine', verbose=False)
address_chain = load_qa_chain(llm=llm_titan, chain_type='refine', verbose=False)

<h2>Customize Prompts</h2>
Let's customize the prompts associated with the "initial" and "refine" chains to better reflect the information we are trying to collect.  We can:

- assign the language model a persona 
- update the {question} input field so that we can pass a specific input question to each prompt
- add additional instructions to the model to limit hallucination

In [None]:
initial_model_prompt = PromptTemplate.from_template("""You are a tech professional who is an expert in smartphones.  Your task is to use the context provided to answer the question.
If you do not find any relevant information to answer the question in the provided context, return 'NA'.  Do NOT make up any information that is not in the context.
Only return information that is directly derived from the context.

Context information is below. 
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the question: {model_question}""")

In [None]:
refine_model_prompt = PromptTemplate.from_template("""You are a tech professional who is an expert in smartphones.  Your task is to use the context provided to answer the question.
If you do not find any relevant information to answer the question in the provided context, return 'NA'.  Do NOT make up any information that is not in the context.
Only return information that is directly derived from the context.

The original question is as follows: {model_question}

We have provided an existing answer: {existing_answer}

We have the opportunity to refine the existing answer(only if needed) with some more context below.
------------
{context_str}
------------

Given the new context, refine the original answer to better answer the question. If the context isn't useful, return the original answer.
""")

In [None]:
initial_features_prompt = PromptTemplate.from_template("""You are a tech professional who is an expert in smartphones.  Your task is to use the context provided to answer the question.
If you do not find any relevant information to answer the question in the provided context, return 'NA'.  Do NOT make up any information that is not in the context.
Only return information that is directly derived from the context.

Context information is below. 
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the question: {features_question}""")

In [None]:
refine_features_prompt = PromptTemplate.from_template("""You are a tech professional who is an expert in smartphones.  Your task is to use the context provided to answer the question.
If you do not find any relevant information to answer the question in the provided context, return 'NA'.  Do NOT make up any information that is not in the context.
Only return information that is directly derived from the context.

The original question is as follows: {features_question}

We have provided an existing answer: {existing_answer}

We have the opportunity to refine the existing answer(only if needed) with some more context below.
------------
{context_str}
------------

Given the new context, refine the original answer to better answer the question. If the context isn't useful, return the original answer.
""")

In [None]:
initial_address_prompt = PromptTemplate.from_template("""You are a helpful assistant who can help answer questions from the user.  Your task is to use the context provided to answer the question.
If you do not find any relevant information to answer the question in the provided context, return 'NA'.  Do NOT make up any information that is not in the context.

Return only the address and no special characters.

Context information is below. 
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the question: {address_question}""")

In [None]:
refine_address_prompt = PromptTemplate.from_template("""You are a helpful assistant who can help answer questions from the user.  Your task is to use the context provided to answer the question.
If you do not find any relevant information to answer the question in the provided context, return 'NA'.  Do NOT make up any information that is not in the context. 

Return only the address and no special characters.

The original question is as follows: {address_question}

We have provided an existing answer: {existing_answer}

We have the opportunity to refine the existing answer(only if needed) with some more context below.
------------
{context_str}
------------

Given the new context, refine the original answer to better answer the question. If the context isn't useful, return the original answer.
""")

In [None]:
# Update the chains to use the newly created prompts
model_chain.initial_llm_chain.prompt = initial_model_prompt
model_chain.refine_llm_chain.prompt = refine_model_prompt

features_chain.initial_llm_chain.prompt = initial_features_prompt
features_chain.refine_llm_chain.prompt = refine_features_prompt

address_chain.initial_llm_chain.prompt = initial_address_prompt
address_chain.refine_llm_chain.prompt = refine_address_prompt

In [None]:
# we can review some of the changes that have been made
print(model_chain.initial_llm_chain.prompt.template)

<h2>Summary Chain</h2>


The goal of the summary chain is to generate summaries of the individual documents.  Similar to the "refine" QA chain, a summary chain of type "map reduce" starts by breaking the document into chunks.  Each chunk is sent to the summarization chain individually to produce a summary of the chunk.  Next, all of the summaries are sent to the summarization chain together to generate a "summary of the summaries".

As with the QA chains, we can customize the underlying LLMChain to better meet our use case.

![summary chain](images/summary-chain.drawio.png "Summary Chain")

![summary chain process](images/summary-chain-process.drawio.png "Summary Chain Process")

In [None]:
# initiate a summary chain
summary_chain = load_summarize_chain(llm=llm_falcon, chain_type="map_reduce", verbose=False)

In [None]:
# review the default prompt template
print(summary_chain.llm_chain.prompt.template)

In [None]:
# customize the summary chain prompt template
summary_chain_prompt = PromptTemplate.from_template("""You are a tech professional who is an expert in smartphones.
The context provided is a portion from a smartphone's user manual.

If you do not find any relevant information to answer the question in the provided context, return 'NA'.  
Only return information that is directly derived from the context.

Write a very concise summary of the device described in the following section under the header "Context:".
Please focus on the screen, camera, and battery life.

Context:
"{text}"

CONCISE SUMMARY:""")

summary_chain.llm_chain.prompt = summary_chain_prompt

print(summary_chain.llm_chain.prompt.template)

<h2>Collection Chain</h2>

So far, we have the following chains to extract entities from our user manuals:

- QA chain to collect the model names
- QA chain to collect key features for the models in the manual
- QA chain to collect the company address
- Summary chain to generate a summary of the device

Each of these chains will produce their own output, but what we really want is to consolidate all of the results into a single JSON object. 

In [None]:
# First, assign the output of each underlying chain an explicit name so that we can process its output
model_chain.output_key = 'Model Names'
features_chain.output_key = 'Key Features'
address_chain.output_key = 'Company Address'
summary_chain.output_key = 'Document Summary'

In [None]:
# Next, we create a prompt template that expects the outputs of each chain as input
collect_template = """Here is information about a set of consumer electronic devices. 

Please respond by formatting this information in the specific schema described below.

The response MUST be a dictionary with the following schema:
{{
    'model_names': list  // List of the models covered
    'key_features': list  // List of key features of the models
    'company_address': string  // Address for the company
    'document_summary': string  // Summary of the document
}}

Respond ONLY with the json object and do not include any additional explanation.  Do NOT include any detail that is not privided in the context. 

---
Example #1:

Input:
Here are the names of the devices:
Model Name 1, Model Name 2, Model Name 3

Here are some key features of the devices:
- touch screen
- water proof
- fast charge

Here is the Compnay Address:
1234 Anywhere Lane, This Town, MO 64118

Here is a summary of the document:
This user manual provides instructions on how to use the HTC Model Name 1, Model Name 2, Model Name 3 phones, including sending texts, and using the apps.

Response:
{{
    'model_names': ['Model Name 1', 'Model Name 2', 'Model Name 3'],
    'key_features': ['touch screen','water proof','fast charge'],
    'company_address': '1234 Anywhere Lane, This Town, MO 64118',
    'document_summary': 'This user manual provides instructions on how to use the HTC Model Name 1, Model Name 2, Model Name 3 phones, including sending texts, and using the apps.'
}}
---

Begin!

Input:
Here are the names of the models:
{Model Names}

Here are some key features of the models:
{Key Features}

Here is the Compnay Address:
{Company Address}

Here is a summary of the document:
{Document Summary}

Response:"""

#prompt_template = PromptTemplate(input_variables=["Model Names", "Key Features", "Company Address", "Document Summary"], template=collect_template, partial_variables={"format_instructions": format_instructions})
collect_prompt_template = PromptTemplate(input_variables=["Model Names", "Key Features", "Company Address", "Document Summary"], template=collect_template)


In [None]:
# Finally, create an LLM chain that uses this prompt template
collection_chain = LLMChain(llm=llm_falcon, prompt=collect_prompt_template, output_key='collection') 

<h2>Sequential Chain</h2>

Now it is time to tie everything together.  To begin, let's take a step to limit the number of calls that are made to our language models.  This will help to make this demo a little bit faster and more lightweight.  Instead of passing every chunk from every document, we're going to cheat a little bit by creating two groups.  More broadly, we have the ability to pass specific subsets of the underlying document into specific chains.

- begin_documents: The first N chunks from the document (N = 2 in this notebook).  This will serve as a proxy for the entire document as a means to limit calls to the language model
- address_documents: When searching for the address of the company, we know that the chunk of text will include the string "Address:".  So we create this group as a subset of all chunks that include the string "Address:" - again as a means to reduce the calls to the endpoint for the purpose of this demo.

![sequential chain](images/collection-chain.drawio.png "Sequential Chain")

In [None]:
# update the input_key for each chain to expect the correct input group
model_chain.input_key = 'begin_documents'
features_chain.input_key = 'begin_documents'
address_chain.input_key = 'address_documents'
summary_chain.input_key = 'begin_documents'

In [None]:
# Now, create a sequential chain that combines all of the chains we have defined.  
sequential_chain = SequentialChain(
    chains=[model_chain, features_chain, address_chain, summary_chain, collection_chain], 
    input_variables=["begin_documents","address_documents",'model_question','features_question','address_question'],
    output_variables=['collection'],
    verbose=False)

In [None]:
# Define questions to pass into each of our QA chains
model_question = "List the model names covered by this manual."
features_question = """List the top 5 key features for this device.  Do NOT return more than 5 features."""
address_question = """What is the address in the context provided?  Please format your asnwer as:
Street, City, State Zip Code"""

In [None]:
# define a text splitter
recursive_text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2500, chunk_overlap=500, separators=[" ", ",", "\n"]
    )

In [None]:
# define a function to process each manual
def process_file(file):
    """Extract metadata from one manual and append it to manual_metadata.jsonl.

    The PDF is loaded from ``manuals/<file>`` and split into chunks. The first
    two chunks and the chunks containing ``'Address:'`` are passed through
    ``sequential_chain``. The model's answer is parsed into a dictionary, the
    file name and the stylus flag are added, and the result is appended as one
    JSON line to ``manual_metadata.jsonl``.

    Args:
        file: Name of a PDF inside the ``manuals/`` folder.

    Returns:
        The dictionary that was written to disk.

    Raises:
        ValueError: If no chunk contains ``'Address:'``, or if the model output
            cannot be parsed into a dictionary.
    """
    # load PDF
    loader = PyPDFLoader(f"manuals/{file}")

    # split into chunks
    texts = loader.load_and_split(text_splitter=recursive_text_splitter)

    # replace newline characters (the chunk objects are modified in place, so
    # every subset built from `texts` below sees the cleaned content)
    for chunk in texts:
        chunk.page_content = chunk.page_content.replace('\n', '')

    # define input
    begin_texts = texts[:2]
    address_texts = [t for t in texts if 'Address:' in t.page_content]

    # Stop before any model call is made: with no candidate chunks the address
    # chain has nothing to read.
    if not address_texts:
        raise ValueError(f"No chunk containing 'Address:' was found in {file}")

    # perform a simple string search to confirm whether there is discussion of an S Pen in the manual
    stylus = any('S Pen' in t.page_content for t in texts)

    # process file
    sequential_result = sequential_chain(
        {
            "begin_documents": begin_texts,
            "address_documents": address_texts,
            "model_question": model_question,
            "features_question": features_question,
            "address_question": address_question,
        },
        return_only_outputs=True,
    )

    # convert to dictionary
    response = sequential_result['collection'].strip()
    try:
        response_dict = ast.literal_eval(response)
    except (ValueError, SyntaxError) as err:
        # Include the raw text so a malformed model answer can be diagnosed.
        raise ValueError(
            f"Could not parse the model output for {file} as a dictionary: {response!r}"
        ) from err
    if not isinstance(response_dict, dict):
        raise ValueError(
            f"Expected a dictionary from the model for {file}, got: {response!r}"
        )

    # add filename to the dictionary
    response_dict['file'] = file

    # add stylus boolean
    response_dict['stylus'] = stylus

    # write to disk (handle named so it does not shadow the `file` argument)
    with open('manual_metadata.jsonl', 'a') as metadata_file:
        metadata_file.write(json.dumps(response_dict))
        metadata_file.write('\n')

    return response_dict

In [None]:
# collect input files: PDF manuals only, in a stable (sorted) order, so that
# hidden files or checkpoint folders in manuals/ are not sent to the PDF loader
dir_list = sorted(
    entry for entry in os.listdir('manuals/')
    if entry.lower().endswith('.pdf')
)

In [None]:
langchain.debug = False  # toggle for detailed logging

# Process the manuals one at a time; a failure is reported and the loop moves on.
for manual_name in dir_list:
    try:
        print(f'Begin processing {manual_name}')
        process_file(manual_name)
        print(f'Successfully processed {manual_name}')
    except Exception as err:
        print(f'Error processing {manual_name}')
        print(err)

    # avoid throttling: pause between manuals, but not after the final one
    is_final_manual = manual_name == dir_list[-1]
    if not is_final_manual:
        time.sleep(60*2)

<h2>Cleanup</h2>

In [None]:
# Look up what sits behind the endpoint before deleting it, so that the
# endpoint configuration and model(s) are removed as well instead of being
# left behind in the account.
endpoint_description = sm_client.describe_endpoint(EndpointName=llm_falcon_endpoint)
endpoint_config_name = endpoint_description['EndpointConfigName']
endpoint_config = sm_client.describe_endpoint_config(EndpointConfigName=endpoint_config_name)
model_names = [
    variant['ModelName']
    for variant in endpoint_config['ProductionVariants']
    if 'ModelName' in variant
]

sm_client.delete_endpoint(
    EndpointName=llm_falcon_endpoint
)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
for model_name in model_names:
    sm_client.delete_model(ModelName=model_name)