In [None]:
!wget https://s202.q4cdn.com/436759741/files/doc_financials/2023/q4/Exhibit99_Q4_2023-c.pdf

In [1]:
###################################
########### INGESTION #############
###################################

In [2]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from unstructured.partition.utils.constants import PartitionStrategy


loader = UnstructuredPDFLoader(
    file_path="./Exhibit99_Q4_2023-c.pdf",
    strategy=PartitionStrategy.HI_RES,
    chunking_strategy="by_title",
    new_after_n_chars=4000,  # Soft-max
    max_characters=4000,  # Hard-max
    combine_text_under_n_chars=2000,  # Combine chunks of < 200 chars
    mode='elements',  # Split the documents into elements such as Title and NarrativeText.
)
data = loader.load()

In [3]:
len(data)

32

In [6]:
data

[Document(page_content='<epam>\n\nEPAM Reports Results for Fourth Quarter and Full Year 2023\n\nFourth Quarter 2023\n\n• Revenues of $1.157 billion, down 6.0% year-over-year\n\n• GAAP Income from Operations was 10.6% of revenues and Non-GAAP Income from Operations was 17.3% of revenues\n\n• GAAP Diluted EPS of $1.66, a decrease of 36.4%, and Non-GAAP Diluted EPS of $2.75, a decrease of 6.1% on a year-over-year basis\n\nFull Year 2023\n\n• Revenues of $4.691 billion, down 2.8% year-over-year\n\n• GAAP Income from Operations was 10.7% of revenues and Non-GAAP Income from Operations was 16.3% of revenues\n\n• GAAP Diluted EPS of $7.06, a decrease of 0.4%, and Non-GAAP Diluted EPS of $10.59, a decrease of 2.8% on a year-over-year basis\n\nNewtown, PA, USA, February 15, 2024 — EPAM Systems, Inc. (NYSE: EPAM), a leading digital transformation services and product engineering company, today announced results for its fourth quarter and full year ended December 31, 2023.\n\n“EPAM\'s performance

In [4]:
data[0].__dict__

{'page_content': '<epam>\n\nEPAM Reports Results for Fourth Quarter and Full Year 2023\n\nFourth Quarter 2023\n\n• Revenues of $1.157 billion, down 6.0% year-over-year\n\n• GAAP Income from Operations was 10.6% of revenues and Non-GAAP Income from Operations was 17.3% of revenues\n\n• GAAP Diluted EPS of $1.66, a decrease of 36.4%, and Non-GAAP Diluted EPS of $2.75, a decrease of 6.1% on a year-over-year basis\n\nFull Year 2023\n\n• Revenues of $4.691 billion, down 2.8% year-over-year\n\n• GAAP Income from Operations was 10.7% of revenues and Non-GAAP Income from Operations was 16.3% of revenues\n\n• GAAP Diluted EPS of $7.06, a decrease of 0.4%, and Non-GAAP Diluted EPS of $10.59, a decrease of 2.8% on a year-over-year basis\n\nNewtown, PA, USA, February 15, 2024 — EPAM Systems, Inc. (NYSE: EPAM), a leading digital transformation services and product engineering company, today announced results for its fourth quarter and full year ended December 31, 2023.\n\n“EPAM\'s performance in 20

In [7]:
[doc.metadata['category'] for doc in data]

['CompositeElement',
 'Table',
 'Table',
 'CompositeElement',
 'Table',
 'CompositeElement',
 'CompositeElement',
 'CompositeElement',
 'CompositeElement',
 'CompositeElement',
 'Table',
 'CompositeElement',
 'Table',
 'CompositeElement',
 'Table',
 'CompositeElement',
 'Table',
 'CompositeElement',
 'Table',
 'CompositeElement',
 'Table',
 'CompositeElement',
 'Table',
 'CompositeElement',
 'CompositeElement',
 'CompositeElement',
 'Table',
 'CompositeElement',
 'Table',
 'CompositeElement',
 'Table',
 'CompositeElement']

In [8]:
data[2].page_content  # Table

'Americas EMEA APAC GEE* S676M $454M S26M S1M -7.6% | -0.3%1 -10.9% 1 -91.6% J YoY YoY YoY YoY'

In [None]:
###################################
########### RETRIEVER #############
###################################

In [9]:
from dotenv import load_dotenv, find_dotenv
import os

env_path = find_dotenv()

if env_path:
    print(f".env file found at: {env_path}")
else:
    print("No .env file found.")

load_dotenv(dotenv_path=env_path)

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')


.env file found at: /Users/Sanchit_Balchandani/workspace/python-ws/multimodal-rag/.env


In [None]:
# from getpass import getpass

# OPENAI_API_KEY = getpass('Enter OpenAI Key: ')

In [10]:
from langchain_openai import OpenAIEmbeddings


OPENAI_EMBEDDING_MODEL = OpenAIEmbeddings(model='text-embedding-ada-002', api_key=OPENAI_API_KEY)

In [11]:
from langchain_chroma import Chroma

vectorstore = Chroma(
    collection_name='OSM-21-Oct-2024-tradition-v3',
    embedding_function=OPENAI_EMBEDDING_MODEL,
    collection_metadata={"hnsw:space": "cosine"},
)

In [12]:
retriever = vectorstore.as_retriever()

In [13]:
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x307b3bf80>)

In [14]:
from langchain.vectorstores.utils import filter_complex_metadata

# Filter complex metadata
data = filter_complex_metadata(data)
retriever.vectorstore.add_documents(data)

['cb302d55-8caa-4ac6-b76f-f99f7396eb8c',
 'c8f95960-9839-4ef9-b5bb-ec7d47e4ca2c',
 '20ee2080-6162-4999-ae7e-6e35538f156d',
 'ad6b7ca9-c98c-42c3-9fae-90226e6594af',
 'a17eb8c5-c512-4074-bcb5-f9243be6243c',
 'd25f2452-ecc1-41e7-b42d-f6f4aa5ab48a',
 '582b97d3-9c7c-4f54-9b94-06e1e00d9e61',
 '1bc34dba-1fd4-4779-97bb-cd0c09f89ce7',
 'db8244a2-38ad-46fc-9d2f-9bc9abaa56d6',
 '94a7ce02-7327-4b0c-88d3-4a2a733dcf24',
 '596c9072-5460-44cb-9647-ef9d26a1de5a',
 'd69313e9-1c4f-4189-80ff-e2ac62650c50',
 '9865270e-746c-41a3-9b85-01d40b5edcd9',
 'f29c0113-64e6-4611-8237-a6285541b4ed',
 '8ecaff55-c65c-4651-9406-864c76e27c70',
 '56e7b243-1c01-4db5-955e-529c6fd14acf',
 'fd135402-f4c9-44ab-b230-370a968675c3',
 'a363fa4c-5b38-4fbe-ace2-fff482b20765',
 '41084011-f64f-41b5-ac03-5d7053bda2f0',
 'b5f6e21e-00bb-4aa8-bd2e-527a479908e9',
 '5d667247-62df-4223-9724-a49173644615',
 '9fa10c64-fc60-46a1-a208-c717f5a42df5',
 '142e7e1e-d4c8-4c94-8775-575bc4a2a7e5',
 '4dc9ba87-58a0-49b4-bbd4-a8a1591c92ae',
 '3b8797be-03fc-

In [15]:
vectorstore.get(include=["metadatas", "documents", "embeddings"])

{'ids': ['cb302d55-8caa-4ac6-b76f-f99f7396eb8c',
  'c8f95960-9839-4ef9-b5bb-ec7d47e4ca2c',
  '20ee2080-6162-4999-ae7e-6e35538f156d',
  'ad6b7ca9-c98c-42c3-9fae-90226e6594af',
  'a17eb8c5-c512-4074-bcb5-f9243be6243c',
  'd25f2452-ecc1-41e7-b42d-f6f4aa5ab48a',
  '582b97d3-9c7c-4f54-9b94-06e1e00d9e61',
  '1bc34dba-1fd4-4779-97bb-cd0c09f89ce7',
  'db8244a2-38ad-46fc-9d2f-9bc9abaa56d6',
  '94a7ce02-7327-4b0c-88d3-4a2a733dcf24',
  '596c9072-5460-44cb-9647-ef9d26a1de5a',
  'd69313e9-1c4f-4189-80ff-e2ac62650c50',
  '9865270e-746c-41a3-9b85-01d40b5edcd9',
  'f29c0113-64e6-4611-8237-a6285541b4ed',
  '8ecaff55-c65c-4651-9406-864c76e27c70',
  '56e7b243-1c01-4db5-955e-529c6fd14acf',
  'fd135402-f4c9-44ab-b230-370a968675c3',
  'a363fa4c-5b38-4fbe-ace2-fff482b20765',
  '41084011-f64f-41b5-ac03-5d7053bda2f0',
  'b5f6e21e-00bb-4aa8-bd2e-527a479908e9',
  '5d667247-62df-4223-9724-a49173644615',
  '9fa10c64-fc60-46a1-a208-c717f5a42df5',
  '142e7e1e-d4c8-4c94-8775-575bc4a2a7e5',
  '4dc9ba87-58a0-49b4-bbd4-

In [None]:
vectorstore.get()

In [16]:
vectorstore._collection.count()

32

In [17]:
query = "Give me the revenue for financial services for year 2024?"
docs = retriever.invoke(query, limit=5)

In [18]:
[doc.page_content for doc in data]

['<epam>\n\nEPAM Reports Results for Fourth Quarter and Full Year 2023\n\nFourth Quarter 2023\n\n• Revenues of $1.157 billion, down 6.0% year-over-year\n\n• GAAP Income from Operations was 10.6% of revenues and Non-GAAP Income from Operations was 17.3% of revenues\n\n• GAAP Diluted EPS of $1.66, a decrease of 36.4%, and Non-GAAP Diluted EPS of $2.75, a decrease of 6.1% on a year-over-year basis\n\nFull Year 2023\n\n• Revenues of $4.691 billion, down 2.8% year-over-year\n\n• GAAP Income from Operations was 10.7% of revenues and Non-GAAP Income from Operations was 16.3% of revenues\n\n• GAAP Diluted EPS of $7.06, a decrease of 0.4%, and Non-GAAP Diluted EPS of $10.59, a decrease of 2.8% on a year-over-year basis\n\nNewtown, PA, USA, February 15, 2024 — EPAM Systems, Inc. (NYSE: EPAM), a leading digital transformation services and product engineering company, today announced results for its fourth quarter and full year ended December 31, 2023.\n\n“EPAM\'s performance in 2023 reflects our 

In [None]:
###################################
########### Synthesis #############
###################################

In [19]:
from langchain_openai import ChatOpenAI

CHAT_MODEL = ChatOpenAI(model_name='gpt-4o-mini', api_key=OPENAI_API_KEY, temperature=0)

In [20]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


RAG_TEMPLATE = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context 
to answer the question. If you don't know the answer, just say that you don't know.

<context>
{context}
</context>

Answer the following question:

{question}"""

rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | CHAT_MODEL
    | StrOutputParser()
)

In [23]:
query = "Give me the revenue for financial services for year 2023?"
response = rag_chain.invoke(query)

In [24]:
response

'The revenue for Financial Services for the year 2023 is $242 million.'

In [25]:

query = query = "What are the assets EPAM hold for 2022, 2023"
response = rag_chain.invoke(query)

In [26]:
response

"The retrieved context does not provide specific information about the total assets held by EPAM for the years 2022 and 2023. Therefore, I don't know the answer."