In [None]:
!pip install langchain

In [None]:
!pip install langchain-openai
!pip install langchain_community

In [None]:
!pip install sentence_transformers

In [None]:
!pip install chromadb

In [None]:
!pip install langchain-google-genai

In [None]:
# import libraries
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from google.colab import userdata

In [None]:
!pip install pypdf



In [None]:
# Directory where the Chroma vector store is persisted.
CHROMA_PATH = "Chroma"
# ----- Data Indexing Process -----
# Path of the PDF document to index.
DOC_PATH = "/content/Test.pdf"
# Load the PDF into LangChain Document objects.
loader = PyPDFLoader(DOC_PATH)
pages = loader.load()
# Split the document into smaller, slightly overlapping chunks
# (chunk_size=500, chunk_overlap=50).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(pages)
# Fail early with a clear message instead of building an empty index
# (e.g. when the PDF has no extractable text).
if not chunks:
    raise ValueError(f"No text chunks could be extracted from {DOC_PATH!r}")

# Embedding model: a local HuggingFace sentence-transformers model (not OpenAI).
#embeddings = HuggingFaceEmbeddings(model_name="nomic-ai/nomic-embed-text-v1", model_kwargs={'device': 'cpu'})
embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Deterministic ids make this cell idempotent: because the store is persisted
# in CHROMA_PATH, re-running without ids would insert every chunk again and
# similarity search would start returning duplicates.
chunk_ids = [f"{DOC_PATH}#chunk-{i}" for i in range(len(chunks))]
# Embed the chunks as vectors and load them into the database.
db_chroma = Chroma.from_documents(
    chunks,
    embeddings,
    ids=chunk_ids,
    persist_directory=CHROMA_PATH,
)
# ----- Retrieval and Generation Process -----

# Example user question (query); swap in the commented alternative to try another.
#query = 'what are the top risks mentioned in the document?'
query='Summarize Nike Fiscal 2025 results'

In [None]:
# Fetch the 5 chunks closest to the query vector as (Document, score) pairs.
# NOTE: the score is a distance (smaller = closer). LangChain's Chroma wrapper
# uses L2 distance unless the collection is created with a different
# "hnsw:space" setting, so it is not cosine by default.
docs_chroma = db_chroma.similarity_search_with_score(query, k=5)

# Collect the text of every retrieved chunk, then join the pieces with blank
# lines to form the context handed to the LLM.
retrieved_passages = [hit[0].page_content for hit in docs_chroma]
context_text = "\n\n".join(retrieved_passages)

# Prompt template; {context} and {question} are filled in before the LLM call.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
Answer the question based on the above context: {question}.
Provide a detailed answer.
Don’t justify your answers.
Don’t give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.
"""


In [None]:
# load retrieved context and user query in the prompt template
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query)

# call LLM model to generate the answer based on the given context and query
#using google genai instead of open AI here

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash" ,api_key=userdata.get('GoogleAPIKey'))

response_text = llm.predict(prompt)
print(response_text)

NIKE, Inc. reported financial results for its fiscal 2025 fourth quarter and full year ended May 31, 2025.

For the full fiscal year 2025:
*   Full year revenues were $46.3 billion, which was down 10 percent on a reported basis.
*   Net income was $0.2 billion, a decrease of 86 percent.
*   Diluted earnings per share was $0.14, also an 86 percent decrease.
*   Revenues for NIKE, Inc. specifically were $46.3 billion, down 10 percent on a reported basis and down 9 percent on a currency-neutral basis.
*   Revenues for the NIKE Brand were $44.7 billion, down 9 percent on a reported and currency-neutral basis, driven by declines across all geographies.
