# FinanceBench Dataset

In [None]:
# !pip install qdrant-client

In [1]:
import pandas as pd
import os
import requests
from datasets import load_dataset
from datasets import DatasetDict
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
import time

import sentence_transformers
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant
from langchain.vectorstores import Chroma
from dotenv import load_dotenv
load_dotenv()

# Load OpenAI access
import sys
sys.path.append(os.path.abspath('../../src'))
from azure_openai_conn import OpenAIembeddings

In [2]:
# Load the FinanceBench question set into a pandas DataFrame: use the local CSV
# copy when it is present, otherwise fetch the dataset with `load_dataset`.
financebench_csv = '../../data/financebench_sample_150.csv'

if os.path.isfile(financebench_csv):
    df = pd.read_csv(financebench_csv)
else:
    ds = load_dataset("PatronusAI/financebench")
    # Iterating the "train" split yields one dict per row, so the rows can be
    # collected directly instead of wrapping the whole DatasetDict in a frame
    # and unpacking its "train" column with iterrows().
    all_dicts = list(ds["train"])
    df = pd.DataFrame(all_dicts)

In [3]:

# Download every PDF referenced by the question set into `destination_folder`.
destination_folder = '../../data/financebench'

# Create the folder when missing, but do not use its existence as the signal
# that everything was downloaded: an interrupted run would otherwise never be
# completed. Files that are already on disk are skipped individually below.
os.makedirs(destination_folder, exist_ok=True)

# `doc_name` may repeat across rows (assumption: several questions can point at
# the same filing), so fetch each document only once.
unique_docs = df[['doc_name', 'doc_link']].drop_duplicates(subset='doc_name')

for doc_name, url in unique_docs.itertuples(index=False, name=None):
    doc_name_with_extension = doc_name + '.pdf'
    file_path = os.path.join(destination_folder, doc_name_with_extension)

    if os.path.isfile(file_path):
        # Already downloaded by a previous run.
        continue

    try:
        # A timeout keeps one unresponsive host from hanging the whole cell.
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        # Report the failure and carry on with the remaining documents.
        print(f"Failed to download: {doc_name_with_extension} ({url}): {exc}")
        continue

    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded: {doc_name_with_extension}")
    else:
        print(f"Failed to download: {doc_name_with_extension} ({url})")


In [5]:
# Load the pages of a single PDF from the download folder into `documents`.
pdf_folder_path = destination_folder
documents = []

# Filter to PDFs *before* slicing: otherwise a non-PDF directory entry could
# take the only slot and leave `documents` empty.
# NOTE: os.listdir() returns entries in arbitrary order, so which PDF comes
# first depends on the file system.
pdf_files = [name for name in os.listdir(pdf_folder_path) if name.endswith('.pdf')]

for file in pdf_files[:1]:
    print(file)
    pdf_path = os.path.join(pdf_folder_path, file)
    loader = PyPDFLoader(pdf_path)
    documents.extend(loader.load())

COCACOLA_2021_10K.pdf


In [6]:
# Number of Document objects the PDF loader produced for the selected file.
len(documents)

183

In [7]:
# Embedding client provided by the project's `azure_openai_conn` module.
embeddings = OpenAIembeddings()

# Splitter configuration used to cut the loaded documents into chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=100,
    add_start_index=True,
)

# Split the loaded documents into chunks ready for indexing.
chunked_documents = text_splitter.split_documents(documents)

In [7]:
# Build the Chroma vector store from the chunks; it is persisted under ./chroma
chroma = Chroma.from_documents(
    documents=chunked_documents,
    embedding=embeddings,
    persist_directory='chroma',
)

In [8]:
# Run a similarity search for a balance-sheet question.
query = "What is the Coca Cola Balance Sheet?"
docs = chroma.similarity_search(query)

# Show the text of the first returned chunk.
first_hit = docs[0]
print(first_hit.page_content)

THE COCA-COLA COMPANY AND SUBSIDIARIES
CONSOLIDATED BALANCE SHEETS
(In millions except par value)
December 31, 2021 2020
ASSETS
Current Assets   
Cash and cash equivalents $ 9,684 $ 6,795 
Short-term investments 1,242 1,771 
Total Cash, Cash Equivalents and Short-Term Investments 10,926 8,566 
Marketable securities 1,699 2,348 
Trade accounts receivable, less allowances of $516 and $526, respectively 3,512 3,144 
Inventories 3,414 3,266 
Prepaid expenses and other current assets 2,994 1,916 
Total Current Assets 22,545 19,240 
Equity method investments 17,598 19,273 
Other investments 818 812 
Other noncurrent assets 6,731 6,184 
Deferred income tax assets 2,129 2,460 
Property, plant and equipment — net 9,920 10,777 
Trademarks with indefinite lives 14,465 10,395 
Goodwill 19,363 17,506 
Other intangible assets 785 649 
Total Assets $ 94,354 $ 87,296 
LIABILITIES AND EQUITY
Current Liabilities   
Accounts payable and accrued expenses $ 14,619 $ 11,145 
Loans and notes payable 3,307 2,

In [9]:
# Print every search hit as an (index, Document) tuple.
for position, hit in enumerate(docs):
    print((position, hit))

(0, Document(page_content='THE COCA-COLA COMPANY AND SUBSIDIARIES\nCONSOLIDATED BALANCE SHEETS\n(In millions except par value)\nDecember 31, 2021 2020\nASSETS\nCurrent Assets   \nCash and cash equivalents $ 9,684 $ 6,795 \nShort-term investments 1,242 1,771 \nTotal Cash, Cash Equivalents and Short-Term Investments 10,926 8,566 \nMarketable securities 1,699 2,348 \nTrade accounts receivable, less allowances of $516 and $526, respectively 3,512 3,144 \nInventories 3,414 3,266 \nPrepaid expenses and other current assets 2,994 1,916 \nTotal Current Assets 22,545 19,240 \nEquity method investments 17,598 19,273 \nOther investments 818 812 \nOther noncurrent assets 6,731 6,184 \nDeferred income tax assets 2,129 2,460 \nProperty, plant and equipment — net 9,920 10,777 \nTrademarks with indefinite lives 14,465 10,395 \nGoodwill 19,363 17,506 \nOther intangible assets 785 649 \nTotal Assets $ 94,354 $ 87,296 \nLIABILITIES AND EQUITY\nCurrent Liabilities   \nAccounts payable and accrued expenses

# Mini Corpus of 5 Documents

In [8]:
# Folder holding the downloaded FinanceBench PDFs (same path as assigned earlier
# in the notebook; repeated here so this section does not depend on that cell).
destination_folder = '../../data/financebench'

In [9]:
# Load the pages of up to five PDFs from the download folder into `documents`.
pdf_folder_path = destination_folder
documents = []

# Filter to PDFs *before* slicing so that non-PDF directory entries do not use
# up any of the five slots.
# NOTE: os.listdir() returns entries in arbitrary order, so which PDFs are
# picked depends on the file system.
pdf_files = [name for name in os.listdir(pdf_folder_path) if name.endswith('.pdf')]

for file in pdf_files[:5]:
    print(file)
    pdf_path = os.path.join(pdf_folder_path, file)
    loader = PyPDFLoader(pdf_path)
    documents.extend(loader.load())

COCACOLA_2021_10K.pdf
PFIZER_2021_10K.pdf
VERIZON_2022_10K.pdf
PEPSICO_2021_10K.pdf
NETFLIX_2017_10K.pdf


In [10]:
import json
from typing import Iterable, List

# Set to True to write `documents` to disk and read them back as a round trip.
save_document_object = False


def save_docs_to_jsonl(array: Iterable["Document"], file_path: str) -> None:
    """Write documents to ``file_path`` as JSON Lines (one JSON object per line).

    Args:
        array: Objects exposing a ``json()`` method that returns a JSON string.
        file_path: Destination file; it is overwritten if it already exists.
    """
    # Explicit encoding so the file reads back the same way on every platform.
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')


def load_docs_from_jsonl(file_path: str) -> List["Document"]:
    """Read a JSON Lines file and rebuild one ``Document`` per non-blank line.

    Args:
        file_path: File previously written by ``save_docs_to_jsonl``.

    Returns:
        The reconstructed documents, in file order.
    """
    # Imported next to its only use in this cell.
    from langchain.schema import Document

    array = []
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline added by an editor).
                continue
            data = json.loads(line)
            array.append(Document(**data))
    return array


# The helpers above are always defined so later cells can reuse them; only the
# round trip itself is controlled by the flag.
if save_document_object:
    save_docs_to_jsonl(documents, 'data.json')
    docs2 = load_docs_from_jsonl('data.json')
    print(len(docs2))

In [11]:
# Number of Document objects loaded for the mini corpus.
len(documents)

1072

In [12]:
# Chunking parameters for the mini corpus.
chunk_size = 1500
overlap = 100

# Build the splitter from the parameters above so that editing them actually
# takes effect (previously they were assigned but the splitter created in an
# earlier cell was reused unchanged).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=overlap,
    add_start_index=True,
)
chunked_documents = text_splitter.split_documents(documents)

# Initialize chroma database
# NOTE(review): re-running this cell presumably adds the chunks again to
# whatever is already stored in 'db_chroma' — TODO confirm and clear the
# directory first if duplicates matter.
chroma = Chroma.from_documents(
    documents=chunked_documents,
    embedding=embeddings,
    persist_directory='db_chroma',
)

In [13]:
# Run a revenue question against the mini-corpus index.
query = "What is the revenue of the company?"
docs = chroma.similarity_search(query)

# Show the text of the first returned chunk.
first_hit = docs[0]
print(first_hit.page_content)

Revenue by Category 
We have two reportable segments that we operate and manage as strategic business units, Consumer and Business. Revenue is disaggregated by products and services within Consumer, and customer groups (Small and Medium Business, Global Enterprise, Public Sector and Other, and Wholesale) within Business. See Note 13 for additional information on revenue by segment. Corporate and other primarily includes insurance captive revenues as well as the historical results of divested businesses, including Verizon Media. 
We also earn revenues that are not accounted for under Topic 606 from leasing arrangements (such as those for towers and equipment), captive reinsurance arrangements primarily related to wireless device insurance and the interest on equipment financed under a device payment plan agreement when sold to the customer by an authorized agent. As allowed by the practical expedient within Topic 842, we have elected to combine the lease and non-lease components for tho

# Reload Chroma

In [14]:
# Re-open the store persisted under 'db_chroma' and repeat the previous query.
db3 = Chroma(
    persist_directory="db_chroma",
    embedding_function=embeddings,
)
docs = db3.similarity_search(query)

# Show the text of the first returned chunk.
first_hit = docs[0]
print(first_hit.page_content)

Revenue by Category 
We have two reportable segments that we operate and manage as strategic business units, Consumer and Business. Revenue is disaggregated by products and services within Consumer, and customer groups (Small and Medium Business, Global Enterprise, Public Sector and Other, and Wholesale) within Business. See Note 13 for additional information on revenue by segment. Corporate and other primarily includes insurance captive revenues as well as the historical results of divested businesses, including Verizon Media. 
We also earn revenues that are not accounted for under Topic 606 from leasing arrangements (such as those for towers and equipment), captive reinsurance arrangements primarily related to wireless device insurance and the interest on equipment financed under a device payment plan agreement when sold to the customer by an authorized agent. As allowed by the practical expedient within Topic 842, we have elected to combine the lease and non-lease components for tho