In [1]:
import os
from keys import OPENAI_API_KEY, ACTIVELOOP_TOKEN
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ['ACTIVELOOP_TOKEN'] = ACTIVELOOP_TOKEN

## FinanceGPT Demo

In [14]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain import OpenAI
from langchain.chat_models.openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PagedPDFSplitter

In [18]:
import requests
import tqdm
from typing import List

# Financial Reports of Amazon, but can be replaced by any URLs of pdfs
urls = ['https://s2.q4cdn.com/299287126/files/doc_financials/2021/q1/Amazon-Q1-2021-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q2/AMZN-Q2-2021-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q3/Q3-2021-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q4/business_and_financial_update.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q1/Q1-2022-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q2/Q2-2022-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q3/Q3-2022-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q4/Q4-2022-Amazon-Earnings-Release.pdf'
        ]

def load_reports(urls: List[str]) -> List[str]:
    """ Load pages from a list of urls"""
    pages = []

    for url in tqdm.tqdm(urls):
        r = requests.get(url)
        path = url.split('/')[-1]
        with open(path, 'wb') as f:
            f.write(r.content)
        loader = PagedPDFSplitter(path)
        local_pages = loader.load_and_split()
        pages.extend(local_pages)
    return pages

pages = load_reports(urls)

100%|██████████| 8/8 [00:24<00:00,  3.07s/it]


In [19]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(pages)

embeddings = OpenAIEmbeddings()

db = DeepLake(dataset_path="hub://bharatr/amazon_earnings_1", embedding=embeddings, token=os.environ['ACTIVELOOP_TOKEN'])
db.add_documents(texts)

Using embedding function is deprecated and will be removed in the future. Please use embedding instead.


Deep Lake Dataset in hub://bharatr/amazon_earnings_1 already exists, loading from the storage


/

Dataset(path='hub://bharatr/amazon_earnings_1', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype       shape      dtype  compression
  -------    -------     -------    -------  ------- 
 embedding  embedding  (194, 1536)  float32   None   
    id        text      (194, 1)      str     None   
 metadata     json      (194, 1)      str     None   
   text       text      (194, 1)      str     None   


 

['6e9ec50a-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec65e-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec6ae-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec6e0-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec712-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec73a-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec762-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec78a-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec7b2-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec7da-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec802-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec82a-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec852-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec87a-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec898-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec8c0-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec8e8-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec91a-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec938-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec960-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec988-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec9b0-3d28-11ee-a8d3-78af089a20b3',
 '6e9ec9d8-3d28-11ee-a8d3-78af089a20b3',
 '6e9eca00-3d28-11ee-a8d3-78af089a20b3',
 '6e9eca28-3d28-

In [20]:
qa = RetrievalQA.from_chain_type(llm=ChatOpenAI(model='gpt-3.5-turbo'), chain_type='stuff', retriever=db.as_retriever())

In [24]:
qa.run("Combine total revenue in 2022?")

'The total revenue for Amazon.com in 2022 was $513.98 billion.'

In [22]:
qa.run("What is the revenue in 2021 Q3?")

'The net sales for Amazon.com in the third quarter of 2021 were $110.8 billion.'

In [25]:
qa.run("What is the revenue in 2023 Q1?")
# The financial results for 2023 Q1 are not provided in the given context - Hallucinations?


'The revenue for the first quarter of 2023 is expected to be between $121.0 billion and $126.0 billion, representing a growth of 4% to 8% compared to the first quarter of 2022.'