In [1]:
import uuid

import warnings
warnings.filterwarnings("ignore")

from langchain.embeddings import OpenAIEmbeddings

from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI

from langchain.retrievers import ParentDocumentRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.prompts import (ChatPromptTemplate,SystemMessagePromptTemplate,HumanMessagePromptTemplate , PromptTemplate)

from langchain.storage import InMemoryStore

from langchain.chains import RetrievalQA



In [2]:
# Path to the annual-report PDF, relative to the notebook's working directory.
ANNUAL_REPORT_PDF = '../Docs/annualreport.pdf'

# PDF loader for the report; `loader.load()` is called in the next cell.
loader = PyPDFLoader(ANNUAL_REPORT_PDF)

In [3]:
# Read the PDF through the loader. The result is later passed to the retriever
# as its `documents` argument.
docs = loader.load()

In [4]:
# Storage layer that keeps the parent documents.
store = InMemoryStore()

# Vector index for the child chunks, embedded with OpenAI embeddings.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma(embedding_function=embeddings, collection_name="split_parents")

# Two splitters: the child splitter (400) must produce smaller documents than
# the parent splitter (2000).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)

In [5]:
# Combine both splitters and both storage layers into a single retriever.
retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)

In [6]:
# Hand the loaded documents to the retriever; no explicit ids are supplied.
# NOTE(review): re-running this cell presumably indexes the same documents again — confirm.
retriever.add_documents(docs, ids=None)

# Count the keys now present in the parent-document store.
sum(1 for _ in store.yield_keys())

662

In [7]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator
from typing import List
from operator import itemgetter

from langchain.schema.runnable import RunnablePassthrough

In [8]:

# Define your desired data structure.
# FinanceInfo is one record of figures for a single year. Every field is an int;
# the Field descriptions are the only per-field metadata given here.
# Units/currency are not specified anywhere in this notebook.
class FinanceInfo(BaseModel):
    year: int = Field(description="year")
    gross_income: int = Field(description="gross income")
    operating_expenses: int = Field(description="operating expenses")
    tax_paid: int = Field(description="amount paid as taxes")
    net_income: int = Field(description="net income")


# Wrapper model with a single field, also named `FinanceInfos`, that holds the
# list of per-year records. Later cells read it as `res.FinanceInfos`.
class FinanceInfos(BaseModel):
    FinanceInfos: List[FinanceInfo]

# Parser bound to the wrapper model. It provides the format instructions that go
# into the prompt and is the last step of the structured QA chain.
parser = PydanticOutputParser(pydantic_object=FinanceInfos)

In [9]:
from langchain.prompts import ChatPromptTemplate

# Prompt skeleton with three placeholders: {context}, {format_instructions}
# and {question}.
template = """Use the following pieces of context to answer the question at the end. 

{context}

{format_instructions}

Question: {question}
"""

# The format instructions are not appended to the template text; they are filled
# into the {format_instructions} placeholder when the chain is invoked.


prompt = ChatPromptTemplate.from_template(template)



In [10]:
# Chat model shared by every chain in this notebook; temperature is fixed at 0.
# NOTE(review): this is a dated model snapshot name — confirm it is still served.
CHAT_MODEL_NAME = "gpt-3.5-turbo-16k-0613"
chat = ChatOpenAI(model_name=CHAT_MODEL_NAME, temperature=0.0)

In [11]:
# Inputs for the prompt: the question is routed to the retriever for context,
# the parser contributes its format instructions, and the question is passed on.
qa_inputs = {
    "context": itemgetter("question") | retriever,
    "format_instructions": lambda _: parser.get_format_instructions(),
    "question": itemgetter("question"),
}

# inputs -> prompt -> chat model -> pydantic parser
chain_qa = qa_inputs | prompt | chat | parser


In [12]:
# Ask the structured chain for five years of headline figures.
five_year_question = "What is the Gross Income, Operating Expenses, Tax Paid and Net Income for the past 5 years?"
res = chain_qa.invoke({"question": five_year_question})

In [13]:
# Show the parsed list of per-year records.
res.FinanceInfos

[FinanceInfo(year=2022, gross_income=17324, operating_expenses=10785, tax_paid=1586, net_income=4953),
 FinanceInfo(year=2021, gross_income=12774, operating_expenses=8867, tax_paid=899, net_income=3008),
 FinanceInfo(year=2020, gross_income=12325, operating_expenses=8871, tax_paid=728, net_income=2726),
 FinanceInfo(year=2019, gross_income=12754, operating_expenses=8887, tax_paid=879, net_income=2988),
 FinanceInfo(year=2018, gross_income=10920, operating_expenses=7456, tax_paid=883, net_income=2581)]

In [21]:
import pandas as pd

# One row per parsed record, plus `check` = gross income - operating expenses - tax paid.
finance_rows = [dict(info) for info in res.FinanceInfos]
df = pd.DataFrame(finance_rows).assign(
    check=lambda frame: frame["gross_income"] - frame["operating_expenses"] - frame["tax_paid"]
)

# Displayed only (not stored): whether the reported net income matches `check`.
df.assign(check2=df["net_income"].eq(df["check"]))

Unnamed: 0,year,gross_income,operating_expenses,tax_paid,net_income,check,check2
0,2022,17324,10785,1586,4953,4953,True
1,2021,12774,8867,899,3008,3008,True
2,2020,12325,8871,728,2726,2726,True
3,2019,12754,8887,879,2988,2988,True
4,2018,10920,7456,883,2581,2581,True


### ==============  END ==========================

In [None]:
# NOTE(review): `pages` is not referenced by any other cell visible in this notebook.
pages = loader.load_and_split()



In [None]:
# Query the vector store directly and print the text of the first hit.
probe_text = "Net Profit, Expenses, Profit before tax and Net income"
sub_docs = vectorstore.similarity_search(probe_text)
first_hit = sub_docs[0]
print(first_hit.page_content)

In [None]:
# Same probe text, this time through the parent-document retriever.
probe_text = "Net Profit, Expenses, Profit before tax and Net income"
retrieved_docs = retriever.get_relevant_documents(probe_text)

# How many documents came back.
print(len(retrieved_docs))

# Character length of the first returned document.
print(len(retrieved_docs[0].page_content))

# Text of the third returned document (index 2 is hard-coded, so at least
# three results are required).
print(retrieved_docs[2].page_content)

In [None]:
# System prompt for the manual (non-chain) QA experiment.
# Adjacent string literals are used instead of backslash continuations, which
# had been embedding the source indentation (runs of spaces) in the prompt.
# The wording itself is unchanged.
template = (
    "You are an experienced financial analyst who use the data provided --- {Data}. "
    "To answer the questions from the user. If you cannot answer the question from the data provided, "
    "just say so and do not attempt to answer."
)

# The system message carries the retrieved data; the human message carries the question.
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
human_template="{Question}"
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

# Two-message chat prompt with the variables {Data} and {Question}.
prompt_template = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

In [None]:
# Concatenate the text of every retrieved document into one context string.
# A blank line separates the documents so that the last token of one chunk is
# not fused with the first token of the next (important for numeric tables).
qdocs = "\n\n".join(doc.page_content for doc in retrieved_docs)

In [None]:
# Fill the two-message prompt: retrieved text as {Data}, the user question as {Question}.
three_year_question = "What is the Gross Revenue, Operating Expenses, Profit before tax and Net income for the past 3 years?"
chat_input = prompt_template.format_messages(Data=qdocs, Question=three_year_question)

In [None]:
# Send the formatted messages to the chat model; the reply's `.content` is printed next.
response = chat.invoke(chat_input)

In [None]:
# Print the text of the model's reply.
print(response.content)

In [None]:
# Prompt text for the RetrievalQA experiment. Note that `prompt_template` is
# rebound here from a chat prompt object to a plain string.
# The line after the backslash continuation starts at column 0 so that no
# source indentation is embedded in the prompt.
prompt_template = """You are an expert financial analyst. Use the following pieces of context to answer the question at the end. \
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)


# Passed to RetrievalQA.from_chain_type as `chain_type_kwargs`.
chain_type_kwargs = {"prompt": PROMPT}

In [None]:
# RetrievalQA chain of type "stuff" over the parent-document retriever, using
# the custom prompt supplied through chain_type_kwargs; verbose output is on.
qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    verbose=True,
)

In [None]:
# Run the RetrievalQA chain on a three-year question and print its answer.
query = "What is the Gross Income, Operating Expenses, Tax Paid and Net Income for the past 3 years?"

# Note: this rebinds `res`, which earlier held the parsed FinanceInfos object.
res = qa_chain.run(query)

print(res)

In [None]:
from langchain.prompts import ChatPromptTemplate

# Free-text QA prompt with the placeholders {context} and {question}.
# This rebinds both `template` and `prompt`; unlike the structured prompt
# defined earlier, it has no {format_instructions} slot.
# The line after the backslash continuation starts at column 0 so that no
# source indentation is embedded in the prompt.
template = """You are an expert financial analyst. Use the following pieces of context to answer the question at the end. \
If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)



In [None]:

from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser

In [None]:
# Free-text variant of the QA chain (rebinds `chain_qa`): the question feeds the
# retriever for context and is also passed through to the prompt.
text_inputs = {
    "context": itemgetter("question") | retriever,
    "question": itemgetter("question"),
}

# inputs -> prompt -> chat model -> plain string
chain_qa = text_inputs | prompt | chat | StrOutputParser()

In [None]:
# Ask the free-text chain for three years of headline figures.
three_year_question = "What is the Gross Income, Operating Expenses, Tax Paid and Net Income for the past 3 years?"
res = chain_qa.invoke({"question": three_year_question})

In [None]:
from IPython.display import display,Markdown
# Render the answer as Markdown; `res` is passed straight to Markdown(), so it
# is expected to be a string at this point.
display(Markdown(res) )

In [None]:
# Structured-output variant in which the CALLER passes the parser's format
# instructions in the input dict, instead of the chain computing them itself.
#
# A dedicated template is built here because `prompt` was last rebound to a
# template without a {format_instructions} placeholder; with that prompt the
# instructions would never reach the model.
structured_prompt = ChatPromptTemplate.from_template(
    """Use the following pieces of context to answer the question at the end.

{context}

{format_instructions}

Question: {question}
"""
)

chain_qa_old = (
    {
        "context": itemgetter("question") | retriever,
        "format_instructions": itemgetter("format_instructions"),
        "question": itemgetter("question"),
    }
    | structured_prompt | chat | parser
)

# Invoke the chain defined in this cell. The previous code called `chain_qa`,
# which returns a plain string, so the later `res.FinanceInfos` access failed.
res = chain_qa_old.invoke(
    {
        "question": "What is the Gross Income, Operating Expenses, Tax Paid and Net Income for the past 3 years?",
        "format_instructions": parser.get_format_instructions(),
    }
)

In [None]:
# First parsed record. This requires `res` to be the parsed FinanceInfos
# object, not a string.
t = res.FinanceInfos[0]

In [None]:
# Gross income minus operating expenses minus tax paid for that record
# (the same formula as the `check` column of the DataFrame).
t.gross_income - t.operating_expenses - t.tax_paid

In [None]:
# Plot net_income against year from the DataFrame built earlier in the notebook.
df.plot(x='year' , y='net_income')

In [None]:
# Net income as reported in the same record, for comparison with the computed figure.
t.net_income

In [None]:
# Streaming demo: a one-variable prompt piped straight into the chat model.
# Note that this rebinds `prompt` once more.
prompt = ChatPromptTemplate.from_template("tell me a joke about {topic}")
chain = prompt | chat

# Print each streamed chunk as it arrives, without newlines between chunks.
joke_stream = chain.stream({"topic": "bears"})
for chunk in joke_stream:
    print(chunk.content, end="", flush=True)