In [25]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
import os 
from dotenv import load_dotenv
from time import time
import warnings
warnings.filterwarnings('ignore')

In [26]:
#loader = DirectoryLoader('PDF_Testing', glob="./*.pdf", loader_cls=PyPDFLoader)
loader = PyPDFLoader('NYSE_AXP_2021.pdf')
documents = loader.load()

In [27]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
texts = text_splitter.split_documents(documents)

In [28]:
len(texts)

4545

In [29]:
texts[5]

Document(page_content='x Delivered net income of $8.1 billion and earnings per share of $10.02.  \nx Achieved record levels of Card Member spending, with billed business increasing 25% to $1.09 trillion, led by strength', metadata={'source': 'NYSE_AXP_2021.pdf', 'page': 2})

In [30]:
_ = load_dotenv()

HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]

llm=HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta", 
    model_kwargs={"temperature":0.2, "max_length":256},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
    )

In [31]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="BAAI/bge-base-en-v1.5",
                                                      model_kwargs={"device": "cuda"})

load INSTRUCTOR_Transformer
max_seq_length  512


In [32]:
%%time
persist_directory = 'db_HuggingFace'

embedding = instructor_embeddings

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

CPU times: total: 9.97 s
Wall time: 25.3 s


In [33]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})
docs = retriever.get_relevant_documents("What is paranoia?")

In [34]:
len(docs)

2

In [35]:
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [36]:
def process_llm_response(qa_chain, query):
    print(f"Query: {query}\n")
    time_1 = time()
    llm_response = qa_chain(query)
    time_2 = time()
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print(f"\nResult:", llm_response['result'])
    print(f"\nmetadata:", llm_response['source_documents'][0].metadata)

In [37]:
query = "Key highlights in 2021 report?"
process_llm_response(qa_chain, query)

Query: Key highlights in 2021 report?

Inference time: 7.449 sec.

Result:  According to the 2021 Annual Report, some key highlights from the past year include a 10% increase in revenue, the launch of a new product line, and the implementation of a sustainability initiative that resulted in a 25% reduction in carbon emissions. Additionally, the report notes that the company achieved record profits and returned a significant portion of those profits to shareholders through dividends and share buybacks. Overall, the report paints a positive picture of the company's performance in 2021.

metadata: {'page': 0, 'source': 'NYSE_AXP_2021.pdf'}


In [38]:
# break it down
query = "How many percentage change in card member spending?"
process_llm_response(qa_chain, query)

Query: How many percentage change in card member spending?

Inference time: 10.85 sec.

Result:  The text provides information about card member spending in 2020, 2019, and projected spending in 2021. To calculate the percentage change, you would subtract the spending amount in the previous year from the spending amount in the current year, divide by the spending amount in the previous year, and then multiply by 100. For example, to calculate the percentage change in card member spending from 2019 to 2020, you would subtract $22,021 from $26,906, divide by $22,021, and then multiply by 100. The result would be a negative number, indicating a decrease in spending.

However, the question asks for the percentage change in card member spending, but the provided context only shows the change in card member spending, not the actual spending amounts. Without this information, it is not possible to calculate the percentage change in card member spending.

In summary, the answer to the question

In [39]:
query = "How to engage card members?"
process_llm_response(qa_chain, query)

Query: How to engage card members?

Inference time: 5.571 sec.

Result:  To engage Card Members, American Express can offer special offers and services that cater to their preferences and are in line with their receptivity to such offers. This can be achieved through targeted marketing campaigns and personalized communication strategies that take into account the individual needs and preferences of each Card Member. By providing relevant and valuable benefits, American Express can strengthen the relationship with its Card Members and increase their loyalty and advocacy.

metadata: {'page': 10, 'source': 'NYSE_AXP_2021.pdf'}


In [40]:
query = "Examples of connecting partners with the integrated payments platform"
process_llm_response(qa_chain, query)

Query: Examples of connecting partners with the integrated payments platform

Inference time: 3.328 sec.

Result:  Yes, some examples of connecting partners with the integrated payments platform include issuing cards, forming partnership agreements with banks and other institutions, and exploring joint ventures and alternative payment solutions.

metadata: {'page': 12, 'source': 'NYSE_AXP_2021.pdf'}


In [41]:
query = "Key value of American Express?"
process_llm_response(qa_chain, query)

Query: Key value of American Express?

Inference time: 50.62 sec.

Result:  The key value of American Express is its reputation and network of merchants that accept its cards, which makes it a preferred payment method for many consumers and businesses. This network effect helps to differentiate American Express from other payment options and provides a competitive advantage. However, any disruption to this network, such as the loss of merchants or customer data, could negatively impact demand for and spending on American Express cards, which would affect the company's financial performance.

metadata: {'page': 80, 'source': 'NYSE_AXP_2021.pdf'}


In [42]:
query = "What Business stragies American Express focused in 2021?"
process_llm_response(qa_chain, query)

Query: What Business stragies American Express focused in 2021?

Inference time: 5.108 sec.

Result:  In 2021, American Express focused on two main business strategies. Firstly, they continued their strategy of expanding their digital cash management platforms through the acquisition of Kabbage's digital platforms in 2020. Secondly, they introduced a new product called American Express, which is a digital cash management platform. Additionally, 90% of colleagues who participated in a survey in 2021 said they would recommend working at American Express.

metadata: {'page': 14, 'source': 'NYSE_AXP_2021.pdf'}


In [44]:
query = "What is the result of annual Colleague Experience Survey in 2021?"
process_llm_response(qa_chain, query)

Query: What is the result of annual Colleague Experience Survey in 2021?

Inference time: 3.416 sec.

Result:  In 2021, 90% of colleagues who participated in the annual Colleague Experience Survey at American Express said they would recommend the company.

metadata: {'page': 14, 'source': 'NYSE_AXP_2021.pdf'}


In [43]:
query = "What is Blue box value?"
process_llm_response(qa_chain, query)

Query: What is Blue box value?

Inference time: 6.971 sec.

Result:  Blue box value refers to the amount that can be recovered from recycling blue box materials, such as paper, cardboard, and plastics. This value is determined by the market demand for these materials and the cost of collecting, sorting, and processing them. In some cases, the blue box value may be less than the cost of collecting and processing the materials, resulting in a net loss for the recycling program. In other cases, the blue box value may be higher, resulting in a net gain. The goal of many recycling programs is to maximize the blue box value while minimizing the cost of collection and processing.

metadata: {'page': 5, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [45]:
query = "What is the challenge American Express facing in 2021? "
process_llm_response(qa_chain, query)

Query: What is the challenge American Express facing in 2021? 

Inference time: 6.943 sec.

Result:  The high percentage of colleagues who participated in the survey and recommended American Express could be seen as a strength, but the context also mentions a potential challenge related to demand for and spending on American Express cards. This suggests that while American Express has a positive reputation among its employees, there could be external factors that may impact the company's business in the coming year. Without further context, it's unclear what these factors might be, but they could potentially include changes in consumer behavior, economic conditions, or competition in the credit card industry.

metadata: {'page': 14, 'source': 'NYSE_AXP_2021.pdf'}


In [22]:
query = "What are the pros and cons of the tactics that protesters use??"
process_llm_response(qa_chain, query)

Query: What are the pros and cons of the tactics that protesters use??

Inference time: 10.126 sec.

Result:  Some common tactics used by protesters include peaceful demonstrations, civil disobedience, and direct action. The pros of peaceful demonstrations include their ability to raise awareness and put pressure on authorities to address the issues being protested. Civil disobedience, such as sit-ins or blockades, can be effective in drawing attention to a cause and forcing authorities to respond. Direct action, such as occupying buildings or disrupting operations, can be more confrontational but can also lead to immediate results. However, the cons of these tactics include the potential for violence, arrests, and negative media coverage. Additionally, some tactics may not be effective in certain contexts or against certain types of authorities. Researchers can help to understand the effectiveness of different tactics by studying their outcomes and the factors that contribute to their

In [23]:
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x184fb110510>)

In [24]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:
