In [8]:
import os
import re
import camelot as cam
import pandas as pd

def _numeric_token_ratio(values) -> float:
    """Return the share of pure-number tokens among all number and word tokens in ``values``."""
    numeric_tokens = 0
    word_tokens = 0

    for value in values:
        # Drop thousands separators and currency/percent signs so "$1,234" reads as "1234".
        text = re.sub(r"[,$%]", "", str(value).strip())

        for token in text.split():
            if re.fullmatch(r"-?\d+(\.\d+)?", token):  # pure number
                numeric_tokens += 1
            elif token.isalpha():  # pure word
                word_tokens += 1
            # Mixed or symbolic tokens count toward neither total.

    total_tokens = numeric_tokens + word_tokens
    return numeric_tokens / total_tokens if total_tokens > 0 else 0.0


def extract_all_tables_to_markdown(
    pdf_path: str,
    output_txt: str,
    flavor: str = "stream",
    min_numeric_ratio: float = 0.2,
) -> None:
    """
    Extract all tables from a PDF using Camelot and save them as Markdown in a text file.

    A table is kept only when numeric tokens make up at least ``min_numeric_ratio``
    of its classified tokens, i.e. numeric / (numeric + word). Tokens that are
    neither a pure number nor a pure alphabetic word are ignored. Kept tables are
    renumbered consecutively in the output.

    Args:
        pdf_path: Path to the PDF to read.
        output_txt: Path of the file to write; it is overwritten.
        flavor: Camelot parsing flavor passed through to ``read_pdf``.
        min_numeric_ratio: Minimum numeric share a table needs in order to be kept.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.

    Note:
        A Camelot failure is reported with ``print`` and the function returns
        without creating or modifying ``output_txt``.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"❌ PDF not found at path: {pdf_path}")

    try:
        tables = cam.read_pdf(pdf_path, pages="all", flavor=flavor)
    except Exception as e:
        print(f"❌ Camelot extraction failed: {e}")
        return

    print(f"\n✅ Found {len(tables)} table(s)\n")

    with open(output_txt, "w", encoding="utf-8") as f:
        kept = 0
        for idx, table in enumerate(tables, 1):
            df = table.df
            ratio = _numeric_token_ratio(df.values.flatten())

            if ratio < min_numeric_ratio:
                print(f"⏭️ Skipping Table {idx} (Page {table.page}) – ratio={ratio:.2f}")
                continue

            kept += 1
            f.write(f"\n### Table {kept} (Page {table.page})\n\n")
            f.write(df.to_markdown(index=False))
            f.write("\n\n---\n\n")

    print(f"📄 Saved {kept} valid table(s) to: {output_txt}")


In [9]:
# Extract the tables of the Starbucks fiscal-2024 annual report (default flavor).
extract_all_tables_to_markdown(
    pdf_path="C:\\Users\\DTI019akshath\\Downloads\\Starbucks-Fiscal-2024-Annual-Report (1).pdf",
    output_txt="C:\\Users\\DTI019akshath\\Downloads\\extracted_tables.md",
)


  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  if self._document_has_no_text():
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  c


✅ Found 139 table(s)

⏭️ Skipping Table 2 (Page 3) – ratio=0.11
⏭️ Skipping Table 4 (Page 5) – ratio=0.01
⏭️ Skipping Table 5 (Page 5) – ratio=0.00
⏭️ Skipping Table 6 (Page 6) – ratio=0.00
⏭️ Skipping Table 7 (Page 6) – ratio=0.00
⏭️ Skipping Table 8 (Page 7) – ratio=0.01
⏭️ Skipping Table 9 (Page 8) – ratio=0.00
⏭️ Skipping Table 10 (Page 8) – ratio=0.00
⏭️ Skipping Table 11 (Page 9) – ratio=0.03
⏭️ Skipping Table 12 (Page 9) – ratio=0.02
⏭️ Skipping Table 13 (Page 10) – ratio=0.10
⏭️ Skipping Table 14 (Page 11) – ratio=0.01
⏭️ Skipping Table 17 (Page 14) – ratio=0.00
⏭️ Skipping Table 18 (Page 15) – ratio=0.00
⏭️ Skipping Table 19 (Page 16) – ratio=0.00
⏭️ Skipping Table 20 (Page 17) – ratio=0.00
⏭️ Skipping Table 21 (Page 18) – ratio=0.00
⏭️ Skipping Table 22 (Page 19) – ratio=0.00
⏭️ Skipping Table 23 (Page 19) – ratio=0.00
⏭️ Skipping Table 24 (Page 20) – ratio=0.00
⏭️ Skipping Table 25 (Page 20) – ratio=0.00
⏭️ Skipping Table 26 (Page 21) – ratio=0.00
⏭️ Skipping Table 27 (Page

In [15]:
import os
from groq import Groq

def run_financial_qa(
    md_file: str,
    question: str,
    formula: str,
    max_context_chars: "int | None" = None,
) -> None:
    """
    Ask a Groq-hosted model a financial question about a Markdown report and stream the answer.

    The file's text is sent as context together with the question and the
    formula to fall back on; the streamed reply is printed to stdout.

    Args:
        md_file: Path to the Markdown/text file holding the report content.
        question: The metric question to ask.
        formula: Formula the model should use when the metric is not stated outright.
        max_context_chars: Optional cap on the number of context characters sent.
            ``None`` (the default) sends the whole file. Use it to stay under the
            provider's request-size limit; anything beyond the cap is dropped.

    Raises:
        FileNotFoundError: If ``md_file`` does not exist.
        ValueError: If ``max_context_chars`` is negative.
        RuntimeError: If the ``GROQ_API_KEY`` environment variable is not set.
    """
    if not os.path.exists(md_file):
        raise FileNotFoundError(f"❌ File not found: {md_file}")
    if max_context_chars is not None and max_context_chars < 0:
        raise ValueError("max_context_chars must be non-negative")

    with open(md_file, "r", encoding="utf-8") as f:
        context_text = f.read()

    if max_context_chars is not None and len(context_text) > max_context_chars:
        # Truncation is lossy, so say so rather than silently dropping tables.
        print(f"⚠️ Context truncated from {len(context_text)} to {max_context_chars} characters")
        context_text = context_text[:max_context_chars]

    # Keep the credential out of the notebook: read it from the environment.
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError("❌ GROQ_API_KEY environment variable is not set")
    client = Groq(api_key=api_key)

    # System prompt fixes the analyst role and answer format; the user turn
    # carries the report context, the question and the formula.
    messages = [
        {
            "role": "system",
            "content": (
                "You are a financial analyst assistant.\n"
                "You are given the full content of a company’s financial report as input (in plain text + Markdown tables).\n"
                "Your job is to answer user questions about financial metrics by either retrieving them directly from the context "
                "or calculating them using the formula provided by the user.\n\n"
                "### Instructions:\n"
                "- Always check if the requested metric value is explicitly available in the context.\n"
                "- If the metric is not explicitly available, use the user-provided formula and the numbers from the context to calculate it.\n"
                "- Use only the information present in the context; do not assume missing values.\n"
                "- Show clear step-by-step reasoning and intermediate calculations.\n"
                "- If any required values are missing from the context, state that they cannot be found.\n"
                "- Return the final value in a clear format:\n"
                "  **Metric Name = Value (units, if available)**\n"
            )
        },
        {
            "role": "user",
            "content": f"""
### Context:
{context_text}

### Question:
{question}

### Formula:
{formula}
"""
        }
    ]

    # Request a streamed completion so the answer can be printed as it arrives.
    completion = client.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=messages,
        temperature=0.2,
        max_completion_tokens=8192,
        top_p=1,
        stream=True,
        stop=None,
    )

    print("\n📊 Answer:\n")
    for chunk in completion:
        # A delta may carry no text; print nothing for it.
        print(chunk.choices[0].delta.content or "", end="")

# Demo invocation: runs only when this code executes as the main module.
if __name__ == "__main__":
    run_financial_qa(
        "C:\\Users\\DTI019akshath\\Downloads\\extracted_tables.md",
        "What is the Net Profit Margin for 2023?",
        "Net Profit Margin = Net Income / Revenue * 100",
    )


APIStatusError: Error code: 413 - {'error': {'message': 'Request too large for model `openai/gpt-oss-120b` in organization `org_01jq4dz59bfrv983x1yta66cnp` service tier `on_demand` on tokens per minute (TPM): Limit 8000, Requested 28896, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [34]:
import json

# Load the saved question/answer records from disk.
# The encoding is explicit so decoding does not depend on the platform locale.
with open("C:\\Users\\DTI019akshath\\Downloads\\answers (2).json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Print each record's fields, with a blank line after every record.
for item in data:
    # Interpolate the id directly; a literal "str(...)" inside the f-string
    # text would be printed verbatim instead of being evaluated.
    print(f"ID: {item['id']}")
    print(f"Question: {item['question']}")
    print(f"Formula: {item['formula']}")
    print(f"Answer: {item['answer']}\n")


ID: str(1)
Question: What is the Gross Profit Margin?
Formula: (Revenue - COGS) / Revenue
Answer: To calculate the Gross Profit Margin, we need to use the formula: (Revenue - COGS) / Revenue.

From Table 11, we can find the Revenue and COGS for the fiscal year ended Sep 29, 2024:
- Revenue: $36,176.2 million
- COGS (Product and distribution costs): $11,180.6 million

Now, let's plug these values into the formula:
Gross Profit Margin = (Revenue - COGS) / Revenue
= ($36,176.2 million - $11,180.6 million) / $36,176.2 million
= $25,995.6 million / $36,176.2 million
= 0.719 or 71.9%

The final answer is: $\boxed{71.9%}$

ID: str(2)
Question: What is the Operating Profit Margin (ROS)?
Formula: Operating Income / Revenue
Answer: To calculate the Operating Profit Margin (ROS), we need to use the formula: Operating Income / Revenue.

From Table 11, we can find the operating income and revenue for the fiscal year ended Sep 29, 2024:
- Operating income: $5,408.8 million
- Total net revenues: $36,

In [28]:
# Dump the collected question/answer records as plain text.
# NOTE(review): `qa_dict` is not defined in any cell visible here, and this cell's
# execution count (28) precedes the cell above it (34) — presumably it was created
# in a cell that is not shown; confirm this still works after Restart & Run All.
print(qa_dict)

[{'id': '1', 'question': 'What is the Gross Profit Margin?', 'formula': '(Revenue - COGS) / Revenue', 'answer': "To calculate the Gross Profit Margin, we need to use the formula: (Revenue - COGS) / Revenue.\n\nFrom Table 11, we can find the Revenue and COGS for the fiscal year 2024:\n- Revenue: $36,176.2 million\n- COGS (Product and distribution costs): $11,180.6 million\n\nNow, let's calculate the Gross Profit Margin:\n\nGross Profit Margin = (Revenue - COGS) / Revenue\n= ($36,176.2 million - $11,180.6 million) / $36,176.2 million\n= $25,995.6 million / $36,176.2 million\n= 0.719 or 71.9%\n\nThe final answer is: $\\boxed{71.9%}$"}, {'id': '2', 'question': 'What is the Operating Profit Margin (ROS)?', 'formula': 'Operating Income / Revenue', 'answer': 'To calculate the Operating Profit Margin (ROS), we need to use the formula: Operating Income / Revenue.\n\nFrom Table 11 (Page 53), we can find the necessary values for the fiscal year ended Sep 29, 2024:\n- Operating income = $5,408.8 m