In [1]:
import os
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from dotenv import load_dotenv
import pickle

In [2]:
# Load variables from a .env file (python-dotenv), then read the OpenAI key
# from the process environment; the value is None when the variable is unset.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [3]:
# Completion model used by the QA chain; responses are capped at 500 tokens.
# NOTE(review): temperature=0.9 is fairly high for source-grounded question
# answering — consider lowering it if answers drift from the articles.
llm = OpenAI(
    openai_api_key=openai_api_key,
    temperature=0.9,
    max_tokens=500,
)

## Load Data

In [4]:
# News articles that make up the knowledge base for this notebook.
article_urls = [
    "https://www.reuters.com/business/finance/caixabanks-q3-net-profit-rises-70-same-period-2022-2023-10-27/",
    "https://www.reuters.com/business/finance/danske-bank-q3-profit-exceeds-expectations-2023-10-27/",
    "https://www.cnbc.com/2023/10/26/amazons-profit-margin-nears-record-high-after-ceo-jassys-cost-cuts.html",
]
loader = UnstructuredURLLoader(urls=article_urls)

# Fetch the pages; the cell output shows how many documents were loaded.
data = loader.load()
len(data)

3

## Split data into chunks

In [6]:
# Chunks of up to 1000 characters, with 200 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# `data` already holds documents, so split_documents is used instead of
# split_text; the cell output shows the resulting number of chunks.
docs = text_splitter.split_documents(data)
len(docs)

21

## Create embeddings for these chunks and save them to a FAISS index

In [7]:
# Embedding model for the chunks. The API key is passed explicitly, matching
# how `llm` is configured, instead of relying on the environment variable
# being picked up implicitly.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Embed every chunk and build the FAISS vector index from the results.
vector_indexes = FAISS.from_documents(docs, embeddings)

In [8]:
# Store the vector index on local disk so it can be reloaded later.
# NOTE(review): pickling the FAISS store may not work on every LangChain
# version; the store's own save_local/load_local API is the alternative — confirm.
file_path = "../Data/vector_index.pkl"

# Create the target directory first: open(..., "wb") raises FileNotFoundError
# when the parent directory does not exist.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, "wb") as f:
    pickle.dump(vector_indexes, f)

In [9]:
# Load the saved vector index back from the pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself, never a pickle from an untrusted source.
if not os.path.exists(file_path):
    # Fail here with a clear message rather than leaving `vectorIndex`
    # undefined, which would surface as a NameError in later cells.
    raise FileNotFoundError(
        f"Vector index not found at {file_path!r}; run the cell that saves it first."
    )
with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)

In [10]:
# Display the loaded object to confirm the index was restored.
vectorIndex

<langchain.vectorstores.faiss.FAISS at 0x175da8310>

## Retrieve similar embeddings for a given question and call the LLM to produce the final answer

In [11]:
# Expose the vector index as a retriever, then build the question-answering
# chain (with sources) from the LLM and that retriever.
retriever = vectorIndex.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)
chain



In [12]:
# Question to ask the chain.
query = "What was the key highlight of Amazon's July-September period financial report?"
# Alternative queries (the last one was stray, uncommented text in the cell,
# which made the cell a SyntaxError; presumably it was meant as another query):
# query = "what are the main features of punch iCNG?"
# query = "Give three key points of this article."

# Turn on langchain debug mode to see what is going on internally.
langchain.debug = True

chain({"question": query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "What was the key highlight of Amazon's July-September period financial report?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Suddenly, Amazon is a profit machine.\n\nIn its third-quarter earnings report on Thursday, Amazon reported an operating margin of 7.8%, the highest since it reached a record of 8.2% in the first quarter of 2021. The company's operating margin, which is the profit left after subtracting costs to operate the business, was 2% a year ago and has historically hovered in the low single digits. Bezos was perfectly comfortably running with a

{'answer': ' Amazon reported record high operating margin of 7.8% for the July-September period. \n',
 'sources': 'https://www.cnbc.com/2023/10/26/amazons-profit-margin-nears-record-high-after-ceo-jassys-cost-cuts.html'}