In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List
from dotenv import load_dotenv
import os
load_dotenv()

class SubQuery(BaseModel):
    id: str = Field(description="Unique sub-query identifier, e.g., Q1, Q2")
    question: str = Field(description="Atomic financial question for retrieval")
    source_statement: str = Field(
        description="Income Statement | Balance Sheet | Cash Flow Statement | Derived"
    )
    dependency: List[str] = Field(
        description="List of sub-query IDs this question depends on"
    )


class DecomposedQuery(BaseModel):
    original_query: str
    sub_queries: List[SubQuery]

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0.7,
    api_key=os.getenv("GOOGLE_API_KEY"),
)

parser = PydanticOutputParser(pydantic_object=DecomposedQuery)

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
You are a senior financial analysis assistant specializing in credit analysis and financial statement modeling.

Your task is to decompose complex financial analysis questions into minimal, independent, retrieval-ready sub-queries.

Rules:
1. Do NOT answer the question.
2. Do NOT calculate values.
3. Only generate questions that can be answered from financial statements.
4. Explicitly separate direct vs inferred data needs.
5. Preserve logical dependencies.
6. Use precise financial terminology.
7. Output MUST strictly follow the provided JSON schema.
"""
        ),
        (
            "human",
            """
User Query:
{query}

{format_instructions}
"""
        ),
    ]
)

decomposition_chain = prompt | llm | parser


In [8]:
user_query = """
Using the provided financial statements, derive the company’s Operating Cash Flow, even if it is not explicitly reported.
Instructions:
Identify relevant line items from the income statement, balance sheet, and cash flow statement.
If Operating Cash Flow is not directly available, approximate using:
Operating Cash Flow = EBITDA – Taxes Paid ± Change in Working Capital
"""

result = decomposition_chain.invoke(
        {
            "query": user_query,
            "format_instructions": parser.get_format_instructions(),
        }
    )

print(result.model_dump_json(indent=2))


{
  "original_query": "Using the provided financial statements, derive the company’s Operating Cash Flow, even if it is not explicitly reported.\nInstructions:\nIdentify relevant line items from the income statement, balance sheet, and cash flow statement.\nIf Operating Cash Flow is not directly available, approximate using:\nOperating Cash Flow = EBITDA – Taxes Paid ± Change in Working Capital",
  "sub_queries": [
    {
      "id": "Q1",
      "question": "What is the Net Income reported on the Income Statement?",
      "source_statement": "Income Statement",
      "dependency": []
    },
    {
      "id": "Q2",
      "question": "What is the Depreciation and Amortization expense reported on the Income Statement?",
      "source_statement": "Income Statement",
      "dependency": []
    },
    {
      "id": "Q3",
      "question": "What is the Interest Expense reported on the Income Statement?",
      "source_statement": "Income Statement",
      "dependency": []
    },
    {
      "i

In [9]:
user_query = """
Calculate the Debt Service Coverage Ratio (DSCR) using disclosed and inferred data.
Instructions:
Use:
DSCR = Operating Cash Flow / (Interest Paid + Principal Repayments)
If principal repayments are not explicitly disclosed:
Infer from changes in borrowings year on year
"""

result = decomposition_chain.invoke(
        {
            "query": user_query,
            "format_instructions": parser.get_format_instructions(),
        }
    )

print(result.model_dump_json(indent=2))


{
  "original_query": "Calculate the Debt Service Coverage Ratio (DSCR) using disclosed and inferred data.\nInstructions:\nUse:\nDSCR = Operating Cash Flow / (Interest Paid + Principal Repayments)\nIf principal repayments are not explicitly disclosed:\nInfer from changes in borrowings year on year",
  "sub_queries": [
    {
      "id": "Q1",
      "question": "What is the Operating Cash Flow for the period?",
      "source_statement": "Cash Flow Statement",
      "dependency": []
    },
    {
      "id": "Q2",
      "question": "What is the total Interest Paid for the period?",
      "source_statement": "Income Statement",
      "dependency": []
    },
    {
      "id": "Q3",
      "question": "What is the total Principal Repayments for the period?",
      "source_statement": "Cash Flow Statement",
      "dependency": []
    },
    {
      "id": "Q4",
      "question": "What is the total amount of Borrowings at the beginning of the period?",
      "source_statement": "Balance Sheet",
 

In [11]:
user_query = """
Using the provided financial statements, derive the company’s Free Cash Flow, even if it is not explicitly reported.
Instructions:
Identify Operating Cash Flow, derive it if required.
Identify Capital Expenditure from the cash flow statement or movement in property, plant and equipment.
If Capital Expenditure is not explicitly available, infer using additions to fixed assets.
Use:
Free Cash Flow = Operating Cash Flow – Capital Expenditure
"""

result = decomposition_chain.invoke(
        {
            "query": user_query,
            "format_instructions": parser.get_format_instructions(),
        }
    )

print(result.model_dump_json(indent=2))


{
  "original_query": "Using the provided financial statements, derive the company’s Free Cash Flow, even if it is not explicitly reported. Instructions: Identify Operating Cash Flow, derive it if required. Identify Capital Expenditure from the cash flow statement or movement in property, plant and equipment. If Capital Expenditure is not explicitly available, infer using additions to fixed assets. Use: Free Cash Flow = Operating Cash Flow – Capital Expenditure",
  "sub_queries": [
    {
      "id": "Q1",
      "question": "What is the net cash provided by operating activities?",
      "source_statement": "Cash Flow Statement",
      "dependency": []
    },
    {
      "id": "Q2",
      "question": "What is the amount of purchases of property, plant and equipment?",
      "source_statement": "Cash Flow Statement",
      "dependency": []
    },
    {
      "id": "Q3",
      "question": "What was the beginning balance of Property, Plant, and Equipment, Net?",
      "source_statement": "B

In [14]:
user_query = """
Calculate the Interest Coverage Ratio using disclosed and inferred data.
Instructions:
Use:
Interest Coverage Ratio = EBITDA / Interest Expense
If interest expense is not explicitly disclosed:
Infer from finance costs or borrowing related expenses
"""

result = decomposition_chain.invoke(
        {
            "query": user_query,
            "format_instructions": parser.get_format_instructions(),
        }
    )

print(result.model_dump_json(indent=2))


{
  "original_query": "Calculate the Interest Coverage Ratio using disclosed and inferred data.\nInstructions:\nUse:\nInterest Coverage Ratio = EBITDA / Interest Expense\nIf interest expense is not explicitly disclosed:\nInfer from finance costs or borrowing related expenses",
  "sub_queries": [
    {
      "id": "Q1",
      "question": "What is the value of EBITDA (Earnings Before Interest, Taxes, Depreciation, and Amortization)?",
      "source_statement": "Derived",
      "dependency": []
    },
    {
      "id": "Q2",
      "question": "What is the value of Interest Expense disclosed on the Income Statement?",
      "source_statement": "Income Statement",
      "dependency": []
    },
    {
      "id": "Q3",
      "question": "What is the value of Finance Costs disclosed on the Income Statement?",
      "source_statement": "Income Statement",
      "dependency": []
    },
    {
      "id": "Q4",
      "question": "What is the value of Borrowing Related Expenses disclosed on the Inc

In [15]:
user_query = """
Derive the company’s Net Debt and leverage position, even if not explicitly stated.
Instructions:
Identify total short term and long term borrowings.
Identify cash and cash equivalents.
Use:
Net Debt = Total Borrowings – Cash and Cash Equivalents
Derive EBITDA if not directly available.
Compute:
Net Debt to EBITDA
"""

result = decomposition_chain.invoke(
        {
            "query": user_query,
            "format_instructions": parser.get_format_instructions(),
        }
    )

print(result.model_dump_json(indent=2))


{
  "original_query": "Derive the company’s Net Debt and leverage position, even if not explicitly stated. Identify total short term and long term borrowings. Identify cash and cash equivalents. Use: Net Debt = Total Borrowings – Cash and Cash Equivalents Derive EBITDA if not directly available. Compute: Net Debt to EBITDA",
  "sub_queries": [
    {
      "id": "Q1",
      "question": "What is the total amount of short-term borrowings?",
      "source_statement": "Balance Sheet",
      "dependency": []
    },
    {
      "id": "Q2",
      "question": "What is the total amount of long-term borrowings?",
      "source_statement": "Balance Sheet",
      "dependency": []
    },
    {
      "id": "Q3",
      "question": "What is the amount of cash and cash equivalents?",
      "source_statement": "Balance Sheet",
      "dependency": []
    },
    {
      "id": "Q4",
      "question": "What is Earnings Before Interest, Taxes, Depreciation, and Amortization (EBITDA)?",
      "source_statement

In [16]:
user_query = """
Assess earnings quality using profit and cash flow data.
Instructions:
Derive EBITDA and Operating Cash Flow.
Compute:
Cash Conversion Ratio = Operating Cash Flow / EBITDA
Compare growth trends between EBITDA and Operating Cash Flow.
"""

result = decomposition_chain.invoke(
        {
            "query": user_query,
            "format_instructions": parser.get_format_instructions(),
        }
    )

print(result.model_dump_json(indent=2))


{
  "original_query": "Assess earnings quality using profit and cash flow data. Derive EBITDA and Operating Cash Flow. Compute: Cash Conversion Ratio = Operating Cash Flow / EBITDA. Compare growth trends between EBITDA and Operating Cash Flow.",
  "sub_queries": [
    {
      "id": "Q1",
      "question": "What is the Net Income?",
      "source_statement": "Income Statement",
      "dependency": []
    },
    {
      "id": "Q2",
      "question": "What is the amount of Interest Expense?",
      "source_statement": "Income Statement",
      "dependency": []
    },
    {
      "id": "Q3",
      "question": "What is the amount of Income Tax Expense?",
      "source_statement": "Income Statement",
      "dependency": []
    },
    {
      "id": "Q4",
      "question": "What is the amount of Depreciation and Amortization expense?",
      "source_statement": "Income Statement",
      "dependency": []
    },
    {
      "id": "Q5",
      "question": "Derive EBITDA for the period.",
      "so

In [28]:
user_query = """
What sector(s) and sub-sector(s) does your company operate in?
"""

result = decomposition_chain.invoke(
        {
            "query": user_query,
            "format_instructions": parser.get_format_instructions(),
        }
    )

print(result.model_dump_json(indent=2))

{
  "original_query": "What sector(s) and sub-sector(s) does your company operate in?",
  "sub_queries": [
    {
      "id": "Q1",
      "question": "What industry classification code (e.g., SIC, NAICS) is reported for the company?",
      "source_statement": "Derived",
      "dependency": []
    },
    {
      "id": "Q2",
      "question": "What is the company's primary industry classification?",
      "source_statement": "Derived",
      "dependency": [
        "Q1"
      ]
    },
    {
      "id": "Q3",
      "question": "What is the company's secondary industry classification, if any?",
      "source_statement": "Derived",
      "dependency": [
        "Q1"
      ]
    }
  ]
}


In [30]:
user_query = """
Do you have a working prototype or an MVP at the moment? When was it launched?
"""

result = decomposition_chain.invoke(
        {
            "query": user_query,
            "format_instructions": parser.get_format_instructions(),
        }
    )

print(result.model_dump_json(indent=2))

{
  "original_query": "Do you have a working prototype or an MVP at the moment? When was it launched?",
  "sub_queries": [
    {
      "id": "Q1",
      "question": "What is the reported date of the first commercial product launch or significant operational milestone that could represent an MVP or prototype?",
      "source_statement": "Derived",
      "dependency": []
    }
  ]
}
