# Using LangChain Agent for analysis of Financial statements

Financial statements of World Health Organization has been used.

In [None]:
# OPENAI_API_KEY=""

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

## Extract Text from PDFs

In [2]:
from PyPDF2 import PdfReader
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

# Extract text from all PDFs
fs_2021 = extract_text_from_pdf("data/fs_2021.pdf")
fs_2022 = extract_text_from_pdf("data/fs_2022.pdf")
fs_2023 = extract_text_from_pdf("data/fs_2023.pdf")
basic_understanding = extract_text_from_pdf("data/basic-understanding-of-a-companys-financials.pdf")

## Split Text into Sentences and Words

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # try different values
    chunk_overlap=200,  # Overlap to maintain context
    length_function=len
)

# Split the text
fs_2021_chunks = text_splitter.split_text(fs_2021)
fs_2022_chunks = text_splitter.split_text(fs_2022)
fs_2023_chunks = text_splitter.split_text(fs_2023)
basic_understanding_chunks = text_splitter.split_text(basic_understanding)

## Tokenize the text

In [4]:
# Tokenization is handled internally by LangChain and OpenAI embeddings,

## Create Context-Aware and Regular Word Embeddings

In [5]:
import pickle
import os
from langchain_openai import OpenAIEmbeddings

# Define paths for saving pickled embeddings
fs_2021_pickle_path = "data/fs_2021_embeddings.pkl"
fs_2022_pickle_path = "data/fs_2022_embeddings.pkl"
fs_2023_pickle_path = "data/fs_2023_embeddings.pkl"
basic_understanding_pickle_path = "basic_understanding_embeddings.pkl"

# Function to save embeddings to a pickle file
def save_embeddings_to_pickle(embeddings, file_path):
    with open(file_path, "wb") as f:
        pickle.dump(embeddings, f)

# Function to load embeddings from a pickle file
def load_embeddings_from_pickle(file_path):
    with open(file_path, "rb") as f:
        return pickle.load(f)

# Check if pickled embeddings already exist
if os.path.exists(fs_2021_pickle_path) and os.path.exists(fs_2022_pickle_path) and os.path.exists(fs_2023_pickle_path) and os.path.exists(basic_understanding_pickle_path):
    print("Loading embeddings from pickle files...")
    fs_2021_embeddings = load_embeddings_from_pickle(fs_2021_pickle_path)
    fs_2022_embeddings = load_embeddings_from_pickle(fs_2022_pickle_path)
    fs_2023_embeddings = load_embeddings_from_pickle(fs_2023_pickle_path)
    basic_understanding_embeddings = load_embeddings_from_pickle(basic_understanding_pickle_path)
else:
    print("Generating embeddings and saving to pickle files...")
    # Initialize OpenAI embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

    # Generate embeddings for each chunk
    fs_2021_embeddings = embeddings.embed_documents(fs_2021_chunks)
    fs_2022_embeddings = embeddings.embed_documents(fs_2022_chunks)
    fs_2023_embeddings = embeddings.embed_documents(fs_2023_chunks)
    basic_understanding_embeddings = embeddings.embed_documents(basic_understanding_chunks)

    # Save embeddings to pickle files
    save_embeddings_to_pickle(fs_2021_embeddings, fs_2021_pickle_path)
    save_embeddings_to_pickle(fs_2022_embeddings, fs_2022_pickle_path)
    save_embeddings_to_pickle(fs_2023_embeddings, fs_2023_pickle_path)
    save_embeddings_to_pickle(basic_understanding_embeddings, basic_understanding_pickle_path)

Loading embeddings from pickle files...


## Update ChromaDB Client Initialization

In [6]:
import chromadb
from chromadb.config import Settings
print("ChromaDB and Settings imported successfully!")

# Initialize ChromaDB client in embedded mode
# client = chromadb.Client(Settings(
#     chroma_db_impl="duckdb+parquet",  # Use DuckDB with Parquet for storage
#     persist_directory="./chroma_db"   # Directory to store the database
# ))

client = chromadb.PersistentClient(
    path="./chroma_db"  # Directory to store the database
)

# Create or load a collection
collection = client.get_or_create_collection(name="financial_statements")

print("Collection created successfully!")

ChromaDB and Settings imported successfully!
Collection created successfully!


##  Store Embeddings in ChromaDB

In [8]:
# Add embeddings to the collection
for i, (chunk, embedding) in enumerate(zip(fs_2021_chunks, fs_2021_embeddings)):
    collection.add(
        documents=[chunk],
        embeddings=[embedding],
        metadatas=[{"year": 2021, "source": "fs_2021.pdf"}],
        ids=[f"fs_2021_{i}"]
    )

for i, (chunk, embedding) in enumerate(zip(fs_2022_chunks, fs_2022_embeddings)):
    collection.add(
        documents=[chunk],
        embeddings=[embedding],
        metadatas=[{"year": 2022, "source": "fs_2022.pdf"}],
        ids=[f"fs_2022_{i}"]
    )

for i, (chunk, embedding) in enumerate(zip(fs_2023_chunks, fs_2023_embeddings)):
    collection.add(
        documents=[chunk],
        embeddings=[embedding],
        metadatas=[{"year": 2023, "source": "fs_2023.pdf"}],
        ids=[f"fs_2023_{i}"]
    )

for i, (chunk, embedding) in enumerate(zip(basic_understanding_chunks, basic_understanding_embeddings)):
    collection.add(
        documents=[chunk],
        embeddings=[embedding],
        metadatas=[{"source": "basic-understanding-of-a-companys-financials.pdf"}],
        ids=[f"basic_understanding_{i}"]
    )

## Verify ChromaDB Population

In [12]:
# Query the Collection
# Retrieve all items from the collection
items = collection.get()

In [13]:
# Inspect the keys in the returned dictionary
print("Keys in the collection:", items.keys())

Keys in the collection: dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'included'])


In [11]:
# Print the first few documents and their metadata
for doc, metadata in zip(items["documents"][:5], items["metadatas"][:5]):
    print("Document:", doc)
    print("Metadata:", metadata)
    print("-" * 50)

Keys in the collection: dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'included'])
Document: AUDITED FINANCIAL STATEMENTS  
FOR THE YEAR ENDED 31 DECEMBER 2021A75/33  
SEVENTY-FIFTH WORLD HEALTH ASSEMBLY  
Provisional agenda item 22.1  
13 May 2022Table of contents
Financial statements at a glance        2
Director-General’s summary          4
2021 Statement of Internal Control        18
Certification of financial statements for the year ended 31 December 2021      22
Opinion of the External Auditor        23
Independent Auditor’s report         25
Financial statements          28
 Statement I. Statement of Financial Position      28
 Statement II. Statement of Financial Performance      29
 Statement III. Statement of Changes in Net Assets/Equity     30
 Statement IV. Statement of Cash Flow        31
 Statement V. Statement of Comparison of Budget and Actual Amounts  32
1. Notes to the financial statements         33
2. Significant accounting policies      

In [14]:
# Check the Number of Items
print("Number of items in the collection:", len(items["ids"]))

Number of items in the collection: 900


## Persist the Database

In [25]:
# client.persist()
# AttributeError: 'Client' object has no attribute 'persist'

## Create LangChain Agents and Prompts

In [16]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Initialize the vector store
vector_store = Chroma(
    client=client,
    collection_name="financial_statements",
    embedding_function=embeddings
)

# Initialize the LLM
llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4", temperature=0)

# Create a retrieval-based QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever()
)

# Define a prompt template
prompt_template = """
You are a financial analyst. Use the provided financial statements and the document "basic understanding of a company's financials" to answer the following question:

Question: {question}
"""

# Function to query the agent
def query_agent(question):
    prompt = prompt_template.format(question=question)
    response = qa_chain.invoke(prompt)
    return response

# Example query
question = "What are the key trends in the company's revenue from 2021 to 2023?"
answer = query_agent(question)
print(answer)


{'query': '\nYou are a financial analyst. Use the provided financial statements and the document "basic understanding of a company\'s financials" to answer the following question:\n\nQuestion: What are the key trends in the company\'s revenue from 2021 to 2023?\n', 'result': "The key trends in the company's revenue from 2021 to 2023 are as follows:\n\n- In 2021, the company's revenue was US$ 4066 million.\n- In 2022, the company's revenue increased to US$ 4354 million, which was a rise of US$ 288 million compared to 2021.\n- However, in 2023, the company's revenue declined to US$ 3341 million, a decrease of US$ 1013 million compared to 2022.\n\nThe main drivers of the decrease in revenue in 2023 were the reduction in voluntary contributions by US$ 911 million and the reduction in in-kind contributions by US$ 91 million."}


## Test the Agent

In [19]:
question2 = "Explain the trends in the organization's cash flow statement from 2021 to 2022 based on the basic understanding document?"
answer2 = query_agent(question2)
print(answer2)

{'query': '\nYou are a financial analyst. Use the provided financial statements and the document "basic understanding of a company\'s financials" to answer the following question:\n\nQuestion: Explain the trends in the organization\'s cash flow statement from 2021 to 2022 based on the basic understanding document?\n', 'result': 'Based on the provided cash flow statement and the "basic understanding of a company\'s financials" document, we can observe several trends from 2021 to 2022:\n\n1. The total deficit/surplus for the year increased significantly from 350,734 in 2021 to 600,100 in 2022. This indicates that the company\'s operations have become more profitable or efficient, leading to a higher surplus of cash from operating activities.\n\n2. Depreciation and amortization slightly increased from 12,896 in 2021 to 14,093 in 2022. This could be due to the company acquiring more depreciable assets or the existing assets getting older.\n\n3. The company experienced a significant decreas

In [22]:
question3 = "How has the cash and assets changed over the years?"
answer3 = query_agent(question3)
print(answer3)

{'query': '\nYou are a financial analyst. Use the provided financial statements and the document "basic understanding of a company\'s financials" to answer the following question:\n\nQuestion: How has the cash and assets changed over the years?\n', 'result': "I'm sorry, but I can't provide the information you're looking for because the documents provided do not include historical data or comparisons over years. They only provide a basic understanding of what financial statements are and how to read them."}


In [23]:
question4 = "How has the contribution to the revenue changed over the years?"
answer4 = query_agent(question4)
print(answer4)

{'query': '\nYou are a financial analyst. Use the provided financial statements and the document "basic understanding of a company\'s financials" to answer the following question:\n\nQuestion: How has the contribution to the revenue changed over the years?\n', 'result': 'The assessed contributions to the revenue have seen some fluctuations over the years. In 2020, the assessed contributions were US$ 465.9 million. This increased to US$ 549.3 million in 2021, showing a significant rise. However, in 2023, the assessed contributions decreased to US$ 494.1 million. \n\nVoluntary contributions also showed a similar trend. In 2020, the voluntary contributions were US$ 3,704,226 thousands. This decreased to US$ 3,365,228 thousands in 2021. The total voluntary contributions for the financial period 2022-2023 were US$ 6,333 million, showing a significant increase from the previous years.\n\nOverall, the total revenue (all sources) increased from US$ 6,017 million in 2018-2019 to US$ 8,365 milli