In [5]:
import os
from docling.document_converter import DocumentConverter
from pathlib import Path
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
import warnings
import logging
import tensorflow as tf

In [6]:
# Document conversion
def load_and_convert_document(file_path):
    """Convert a document to Markdown text with Docling.

    Args:
        file_path: Location of the document to convert (the notebook passes
            the path of a PDF file).

    Returns:
        The Markdown export of the converted document.
    """
    conversion = DocumentConverter().convert(file_path)
    markdown_text = conversion.document.export_to_markdown()
    return markdown_text

# Relative path: the PDF is looked up from the notebook's working directory.
source = "amazon-10-q-q3-2024.pdf"
# Convert once and keep the Markdown in a notebook-level variable for the splitting cell.
markdown_content = load_and_convert_document(source)

2025-11-09 21:01:28,788 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-09 21:01:28,982 - INFO - Going to convert document batch...
2025-11-09 21:01:28,988 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f9730ffaa6e7f8d4fb0c98c8df3f18cb
2025-11-09 21:01:29,253 - INFO - Loading plugin 'docling_defaults'
2025-11-09 21:01:29,274 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-09 21:01:29,341 - INFO - Loading plugin 'docling_defaults'
2025-11-09 21:01:29,362 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-09 21:01:30,282 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-11-09 21:01:30,320 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-09 21:01:30,339 [RapidOCR] download_file.py:60: File exists and is valid: C:\tenserflow project\tfvenv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-09 21:01:30,352 [RapidO

In [34]:
# Splitting markdown content into chunks
def get_markdown_splits(markdown_content):
    """Split Markdown text into sections at level 1-3 headers.

    Args:
        markdown_content: The Markdown text to split.

    Returns:
        Whatever ``MarkdownHeaderTextSplitter.split_text`` returns for the
        input; the header lines are kept in the section text
        (``strip_headers=False``).
    """
    header_levels = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    splitter = MarkdownHeaderTextSplitter(header_levels, strip_headers=False)
    sections = splitter.split_text(markdown_content)
    return sections
# Split the converted Markdown into header-delimited chunks for indexing.
chunks = get_markdown_splits(markdown_content)

In [35]:
# Embedding and vector store setup
def setup_vector_store(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed document chunks and index them in an in-memory FAISS store.

    Args:
        chunks: Iterable of documents to index (the notebook passes the output
            of ``get_markdown_splits``).
        model_name: Hugging Face sentence-embedding model used to embed the
            chunks. Defaults to the model the notebook has always used.

    Returns:
        A ``FAISS`` vector store containing every chunk.

    Raises:
        ValueError: If ``chunks`` is ``None`` or empty. Failing here gives a
            clear message and avoids loading the embedding model for nothing.
    """
    documents = list(chunks) if chunks is not None else []
    if not documents:
        raise ValueError(
            "No document chunks to index: the document produced no text to embed."
        )

    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # The flat L2 index needs the vector dimensionality up front; probe it by
    # embedding a throwaway string.
    embedding_dim = len(embeddings.embed_query("this is some text data"))
    index = faiss.IndexFlatL2(embedding_dim)
    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={}
    )
    vector_store.add_documents(documents=documents)
    return vector_store

# Build the FAISS index over the chunks produced by the splitting cell.
vector_store = setup_vector_store(chunks)

# Set up the retriever: "mmr" search type, returning 3 documents per query ('k': 3).
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={'k': 3})

2025-11-09 21:17:58,624 - INFO - Use pytorch device_name: cpu
2025-11-09 21:17:58,624 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [29]:
# Sanity check: number of vectors in the FAISS index vs. number of chunks (they should match).
print(vector_store.index.ntotal, len(chunks))

188 188


In [30]:
# Spot-check retrieval for a sample question; as the cell's last expression the result is displayed.
retriever.invoke(" what was Amazon's total revenue in Q3 2024")

[Document(metadata={'Header 2': 'Sales and Marketing'}, page_content='## Sales and Marketing  \nSales and marketing costs include advertising and payroll and related expenses for personnel engaged in marketing and selling activities, including sales commissions related to AWS. We direct customers to our stores primarily through a number of marketing channels, such as our sponsored search, third-party customer referrals, social and online advertising, television advertising, and other initiatives. Our marketing costs are largely variable, based on growth in sales and changes in rates. To the extent there is increased or decreased competition for these traffic sources, or to the extent our mix of these channels shifts, we would expect to see a corresponding change in our marketing costs.  \nSales and marketing costs in Q3 2024 did not significantly change compared to the comparable prior year period. The decrease in sales and marketing costs for the nine months ended September 30, 2024, 

In [44]:
# Formatting document for RAG
def formate_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` of every document, in order, separated by a blank
        line. An empty iterable yields an empty string.
    """
    page_texts = (document.page_content for document in docs)
    return "\n\n".join(page_texts)

In [47]:
# Setting up the RAG chain
def create_rag_chain(retriever, model_name="llama-3.1-8b-instant", api_key=None):
    """Build the retrieval-augmented question-answering chain.

    The chain retrieves context for the incoming question, formats it with
    ``formate_docs``, fills the prompt template, calls a Groq chat model and
    parses the reply to a plain string.

    Args:
        retriever: Retriever runnable that maps a question to documents.
        model_name: Groq model identifier. Defaults to the model the notebook
            has always used.
        api_key: Groq API key. When omitted, the ``GROQ_API_KEY`` environment
            variable is used.

    Returns:
        A runnable chain; the notebook calls ``.stream(question)`` on it and
        concatenates the yielded text pieces.

    Raises:
        RuntimeError: If no API key is supplied and ``GROQ_API_KEY`` is unset.
    """
    # SECURITY: credentials must never be embedded in the notebook. The key
    # that used to be hard-coded here has been exposed and should be revoked.
    groq_api_key = api_key or os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise RuntimeError(
            "Groq API key not found: set the GROQ_API_KEY environment variable "
            "or pass api_key= to create_rag_chain()."
        )

    prompt = """
        You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
        If you don't know the answer, just say that you don't know.
        Answer in bullet points. Make sure your answer is relevant to the question and it is answered from the context only.
        Question: {question}
        Context: {context}
        Answer:
    """
    model = ChatGroq(model_name=model_name, api_key=groq_api_key)
    prompt_template = ChatPromptTemplate.from_template(prompt)
    return (
        # The question is passed through unchanged and also drives retrieval.
        {"context": retriever | formate_docs, "question": RunnablePassthrough()}
        | prompt_template
        | model
        | StrOutputParser()
    )

In [48]:

# Create RAG chain
rag_chain = create_rag_chain(retriever)

# Sample questions about the filing. Plain ASCII apostrophes are used so the
# text sent to the retriever and the model contains no mis-encoded characters.
questions = [
    "What was Amazon's total revenue in Q3 2024?",
    "How does the revenue in Q3 2024 compare to Q3 2023?",
    "What was the net income for Q3 2024, and how does it compare year over year?",
    "What were the earnings per share (basic and diluted) in Q3 2024?",
    "How much did Amazon earn from product vs. service sales?",
    "What were the main operating expense categories and their values in Q3 2024?",
    "What was Amazon's operating income in Q3 2024?",
]

# Stream each answer piece by piece as the model produces it.
for question in questions:
    print(f"Question: {question}")
    for chunk in rag_chain.stream(question):
        print(chunk, end="", flush=True)
    print("\n" + "-" * 50 + "\n")

Question: What was Amazon‚Äôs total revenue in Q3 2024?


2025-11-09 21:32:21,225 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


* I don't know Amazon's total revenue in Q3 2024 based on the provided context.
--------------------------------------------------

Question: How does the revenue in Q3 2024 compare to Q3 2023?


2025-11-09 21:32:21,550 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Unfortunately, the provided context doesn't contain information about Q3 2024 revenue or Q3 2023 revenue. However, it does contain information about the net sales guidance for Q4 2024, which is expected to grow between 7% and 11% compared to Q4 2023. 

Also, the context includes information about free cash flow for the twelve months ended September 30, 2023, and 2024, but it does not provide any information about revenue for Q3 specifically.

However, we can infer some information from the free cash flow section. Since Q4 is the last quarter of the year, and the free cash flow is provided for the twelve months ended September 30, it may be possible to make a rough estimate of the revenue for Q3 by looking at the trend of free cash flow and other provided information, but we can't get an exact comparison between Q3 2024 and Q3 2023 revenue from the given context.

However the context has provided information about the  Net sales for the quarter (Q4 2024) expected to be between $181.5 bi

2025-11-09 21:32:22,326 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Unfortunately, we cannot determine the net income for Q3 2024 from the provided context.

However, based on the information given:

* For Q3 2023 and Q3 2024, the "Other operating expense (income), net" is $244 million and $262 million respectively.
* This implies that there is a year-over-year increase in "Other operating expense (income), net" of $18 million.
--------------------------------------------------

Question: What were the earnings per share (basic and diluted) in Q3 2024?


2025-11-09 21:32:22,826 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


* Unfortunately, the provided context does not include the earnings per share (basic and diluted) for Q3 2024.
* The context only includes the calculation of basic and diluted earnings per share for Q3 and 9 months 2023 and Q3 and 9 months 2024.
--------------------------------------------------

Question: How much did Amazon earn from product vs. service sales?


2025-11-09 21:32:23,474 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Unfortunately, the provided context does not specifically state Amazon's earnings from product vs. service sales. However, we can make an inference based on the given information.

- Product sales represent revenue from the sale of products and related shipping fees and digital media content.
- Service sales primarily represent third-party seller fees, AWS sales, advertising services, Amazon Prime membership fees, and certain digital media content subscriptions.

We can infer that product sales include revenue from the sale of products, while service sales include revenue from other non-product related services.

However, we cannot determine the exact amount or percentage of product vs. service sales earnings from the provided context.
--------------------------------------------------

Question: What were the main operating expense categories and their values in Q3 2024?


2025-11-09 21:32:23,990 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


* Other operating expense (income), net: 
  - Q3 2023: $262 million
  - Q3 2024: $244 million
--------------------------------------------------

Question: What was Amazon‚Äôs operating income in Q3 2024?


2025-11-09 21:32:24,292 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


* Other operating expense (income), net was $244 million for Q3 2023 and 
* $262 million for Q3 2024.
--------------------------------------------------



In [49]:
# Main execution logic
if __name__ == "__main__":
    # End-to-end run of the pipeline. Note that this converts the PDF again,
    # independently of any conversion done by earlier cells.

    # Load document
    source = "amazon-10-q-q3-2024.pdf"
    markdown_content = load_and_convert_document(source)
    chunks = get_markdown_splits(markdown_content)

    # Create vector store
    vector_store = setup_vector_store(chunks)

    # Setup retriever
    retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={'k': 3})

    # Create RAG chain
    rag_chain = create_rag_chain(retriever)

    # Questions for retrieval (Amazon-specific). Plain ASCII apostrophes keep
    # mis-encoded characters out of the text sent to the retriever and model.
    questions = [
        # Financial Performance
        "What was Amazon's total revenue in Q3 2024?",
        "How does the revenue in Q3 2024 compare to Q3 2023?",
        "What was the net income for Q3 2024, and how does it compare year over year?",
        "What were the earnings per share (basic and diluted) in Q3 2024?",
        "How much did Amazon earn from product vs. service sales?",
        "What were the main operating expense categories and their values in Q3 2024?",
        "What was Amazon's operating income in Q3 2024?",
    ]

    # Answer questions, streaming each reply as it is generated
    for question in questions:
        print(f"Question: {question}")
        for chunk in rag_chain.stream(question):
            print(chunk, end="", flush=True)
        print("\n" + "-" * 50 + "\n")

2025-11-09 21:32:37,933 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-09 21:32:37,967 - INFO - Going to convert document batch...
2025-11-09 21:32:37,967 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f9730ffaa6e7f8d4fb0c98c8df3f18cb
2025-11-09 21:32:37,967 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-11-09 21:32:38,003 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-09 21:32:38,013 [RapidOCR] download_file.py:60: File exists and is valid: C:\tenserflow project\tfvenv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-09 21:32:38,013 [RapidOCR] main.py:53: Using C:\tenserflow project\tfvenv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-09 21:32:38,357 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-09 21:32:38,372 [RapidOCR] download_file.py:60: File exists and is valid: C:\tenserflow project\tfvenv\Lib\sit

Question: What was Amazon‚Äôs total revenue in Q3 2024?


2025-11-09 21:44:27,125 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


* I don't know the total revenue for Amazon in Q3 2024 from the provided context.
--------------------------------------------------

Question: How does the revenue in Q3 2024 compare to Q3 2023?


2025-11-09 21:44:27,410 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Unfortunately, there is no information provided about Q3 2024 revenue in the given context. However, there is information provided about Q3 2024 general and administrative costs, and free cash flow for the twelve months ended September 30, 2023 and 2024.

* There is no information about Q3 2024 revenue in the given context.
* General and administrative costs in Q3 2024 did not significantly change compared to the comparable prior year period.
* Free cash flow for the twelve months ended September 30, 2024 is $47,747 million, compared to $21,434 million for the twelve months ended September 30, 2023.
--------------------------------------------------

Question: What was the net income for Q3 2024, and how does it compare year over year?


2025-11-09 21:44:27,837 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


* I do not have enough information in the provided context to calculate the net income for Q3 2024.
* The Other Operating Expense (Income), Net for Q3 2023 and Q3 2024 were $262 million and $244 million, respectively.
* To compare year over year, the Other Operating Expense (Income), Net decreased by $18 million ($262 million - $244 million) from Q3 2023 to Q3 2024.
--------------------------------------------------

Question: What were the earnings per share (basic and diluted) in Q3 2024?


2025-11-09 21:44:28,217 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Unfortunately, the provided context does not contain the information about the earnings per share (basic and diluted) in Q3 2024.
--------------------------------------------------

Question: How much did Amazon earn from product vs. service sales?


2025-11-09 21:44:28,514 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Unfortunately, the provided context does not specify the exact amounts of earnings from product versus service sales. However, it does provide the following relevant information:

- Product sales represent revenue from the sale of products and related shipping fees and digital media content where we record revenue gross.
- Service sales primarily represent third-party seller fees, which includes commissions and any related fulfillment and shipping fees, AWS sales, advertising services, Amazon Prime membership fees, and certain digital media content subscriptions.

Unfortunately, earnings are not mentioned in the context.
--------------------------------------------------

Question: What were the main operating expense categories and their values in Q3 2024?


2025-11-09 21:44:28,892 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Based on the retrieved context, here are the main operating expense categories and their values in Q3 2024:

* Other operating expense (income), net: $262 million
--------------------------------------------------

Question: What was Amazon‚Äôs operating income in Q3 2024?


2025-11-09 21:44:29,095 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


* I don't know the answer to what Amazon's operating income in Q3 2024 was. The given context only provides information about other operating expense (income), net and sales and marketing costs in Q3 2024, but it does not provide the operating income.
--------------------------------------------------



In [55]:
import gradio as gr

# Module-level state shared by the Gradio callbacks: upload_and_process() fills these in
# and ask_question() reads rag_chain. Note that these assignments rebind the notebook's
# earlier `vector_store` and `rag_chain` variables to None until a file is uploaded.
vector_store = None
rag_chain = None

def upload_and_process(file):
    """Gradio callback: index an uploaded document and build the RAG chain.

    Args:
        file: Value delivered by the ``gr.File`` input. ``None`` when nothing
            has been uploaded; otherwise an object exposing the temporary
            file's path as ``.name`` (a plain path string is accepted too).

    Returns:
        A status message for the "Processing Status" textbox.

    Side effects:
        Rebinds the module-level ``vector_store`` and ``rag_chain``, but only
        after the whole pipeline has succeeded, so a failure part-way through
        never leaves the two out of sync.
    """
    global vector_store, rag_chain

    # Clicking the button without selecting a file delivers None.
    if file is None:
        return "❌ Please upload a PDF file first."

    # Fall back to the value itself when it has no .name (i.e. it is a path).
    file_path = getattr(file, "name", file)

    markdown_content = load_and_convert_document(file_path)
    chunks = get_markdown_splits(markdown_content)
    new_vector_store = setup_vector_store(chunks)
    retriever = new_vector_store.as_retriever(search_type="mmr", search_kwargs={'k': 3})
    new_rag_chain = create_rag_chain(retriever)

    # Publish both objects together once everything has been built.
    vector_store, rag_chain = new_vector_store, new_rag_chain
    return "✅ Document processed and RAG chain created. You can now ask questions."

def ask_question(question):
    """Gradio callback: answer a question with the current RAG chain.

    Args:
        question: Text typed into the question box.

    Returns:
        The complete answer (all streamed pieces concatenated), or a message
        starting with "❌" when no document has been processed yet or the
        question is blank.
    """
    if rag_chain is None:
        return "❌ Please upload and process a document first."
    # Do not spend a retrieval + LLM call on an empty question.
    if question is None or not question.strip():
        return "❌ Please enter a question."
    return "".join(rag_chain.stream(question))

# Build the Gradio interface: upload/processing controls on the left,
# question/answer controls on the right.
with gr.Blocks() as demo:
    gr.Markdown("## 📊 Amazon 10-Q RAG-Based QA System using Groq + FAISS")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="📁 Upload Amazon 10-Q PDF", file_types=[".pdf"])
            upload_btn = gr.Button("🔄 Upload & Process")
            upload_output = gr.Textbox(label="Processing Status", lines=2)

        with gr.Column():
            question_input = gr.Textbox(label="❓ Ask a Question", placeholder="e.g. What was Amazon's total revenue in Q3 2024?")
            ask_btn = gr.Button("🔍 Get Answer")
            answer_output = gr.Textbox(label="📢 Answer", lines=10)

    # Wire each button to its callback.
    upload_btn.click(fn=upload_and_process, inputs=[file_input], outputs=[upload_output])
    ask_btn.click(fn=ask_question, inputs=[question_input], outputs=[answer_output])

# Launch the app
demo.launch()

2025-11-09 21:52:35,900 - INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
2025-11-09 21:52:35,997 - INFO - HTTP Request: GET http://127.0.0.1:7860/gradio_api/startup-events "HTTP/1.1 200 OK"
2025-11-09 21:52:36,064 - INFO - HTTP Request: HEAD http://127.0.0.1:7860/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




2025-11-09 21:56:46,685 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-09 21:56:46,806 - INFO - Going to convert document batch...
2025-11-09 21:56:46,810 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f9730ffaa6e7f8d4fb0c98c8df3f18cb
2025-11-09 21:56:46,842 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-11-09 21:56:46,976 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-09 21:56:47,025 [RapidOCR] download_file.py:60: File exists and is valid: C:\tenserflow project\tfvenv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-09 21:56:47,036 [RapidOCR] main.py:53: Using C:\tenserflow project\tfvenv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-09 21:56:47,888 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-09 21:56:47,901 [RapidOCR] download_file.py:60: File exists and is valid: C:\tenserflow project\tfvenv\Lib\sit

In [54]:
pip install gradio

Note: you may need to restart the kernel to use updated packages.


