In [11]:
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
import fitz
import re
import json
import matplotlib.pyplot as plt
from langchain.agents import initialize_agent, tool
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
# Load variables from a .env file into the process environment before the
# model client is built (presumably this is where the Google API key comes
# from -- confirm against the deployment setup).
load_dotenv()

# Single Gemini chat model instance used by the functions defined below.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string holding each page's ``get_text()`` output in page order,
        with nothing inserted between pages.
    """
    # The context manager closes the document even when a page fails to
    # extract, so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)

PART A

In [5]:

@tool
def extract_revenue_growth(text: str) -> dict:
    """
    Uses Gemini to extract revenue and growth-related information from a financial document.
    """
    # NOTE: the docstring above doubles as the tool description exposed through
    # the `tool` decorator, so its wording is intentionally left unchanged.
    #
    # Returns the parsed JSON object produced by the model. When the reply
    # contains no parseable JSON object, the raw reply text is returned under
    # the single key "raw_response" so the declared `dict` return type holds.

    prompt = f"""
                You are a financial analysis assistant. Given the following financial document, extract:

                1. Total revenue (yearly and quarterly)
                2. Revenue growth (YoY and QoQ)
                3. Revenue by business segment (if available)

                Return a structured JSON dictionary like this:
                {{
                "total_revenue_yearly": [...],
                "total_revenue_quarterly": [...],
                "revenue_growth_yoy": [...],
                "revenue_growth_qoq": [...],
                "revenue_by_segment": {{
                    "segment_name1": "amount",
                    ...
                }}
                }}

                Financial document:
                \"\"\"
                {text}
                \"\"\"
            """

    response = llm.invoke(prompt)
    # `invoke` hands back a message object; the model's text lives in `.content`.
    raw = getattr(response, "content", response)
    if not isinstance(raw, str):
        raw = str(raw)

    # Replies are often wrapped in prose or Markdown fences, so only the span
    # from the first "{" to the last "}" is handed to the JSON parser.
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    if match is not None:
        try:
            parsed = json.loads(match.group(0))
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
    return {"raw_response": raw}


In [6]:
@tool
def extract_profitability_metrics(text: str) -> dict:
    """
    Extracts key profitability metrics from a financial document:
    - Gross Profit and Gross Margin
    - Operating Profit (EBIT) and Margin
    - Net Profit (PAT) and Margin
    - EBITDA and Adjusted EBITDA
    """
    # NOTE: the docstring above doubles as the tool description exposed through
    # the `tool` decorator, so its wording is intentionally left unchanged.
    #
    # Returns the parsed JSON object produced by the model. When the reply
    # contains no parseable JSON object, the raw reply text is returned under
    # the single key "raw_response" so the declared `dict` return type holds.
    prompt = f"""
                You are a financial analyst. Extract the following profitability metrics from the text below:
                1. Gross Profit
                2. Gross Margin
                3. Operating Profit (EBIT)
                4. Operating Margin
                5. Net Profit (PAT)
                6. Net Margin
                7. EBITDA
                8. Adjusted EBITDA

                Return the result as a JSON dictionary with the metrics as keys.

                Text:
                {text}
            """
    response = llm.invoke(prompt)
    # `invoke` hands back a message object; the model's text lives in `.content`.
    raw = getattr(response, "content", response)
    if not isinstance(raw, str):
        raw = str(raw)

    # Replies are often wrapped in prose or Markdown fences, so only the span
    # from the first "{" to the last "}" is handed to the JSON parser.
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    if match is not None:
        try:
            parsed = json.loads(match.group(0))
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
    return {"raw_response": raw}


In [7]:

@tool
def expense_breakdown_tool(text: str) -> str:
    """
    Extracts and summarizes the Expense Breakdown from the financial document text,
    specifically focusing on:
    - Cost of Goods Sold (COGS)
    - Operating Expenses (SG&A, R&D, marketing)
    - Interest expense, depreciation & amortization
    
    Args:
        text: The extracted text from the financial document.
    
    Returns:
        A concise, structured summary of the expense breakdown.
    """
    # NOTE: the docstring above doubles as the tool description exposed through
    # the `tool` decorator, so its wording is intentionally left unchanged.
    prompt = f"""
You are a financial analyst assistant. Extract and summarize the expense breakdown from the following text.
Focus only on these expense categories:
- Cost of Goods Sold (COGS)
- Operating expenses (including SG&A, R&D, marketing)
- Interest expense, depreciation & amortization

Provide the results in a clear bullet point format with any values or qualitative info you find.

Text:
\"\"\"{text}\"\"\"
"""
    # `predict` is deprecated; `invoke` is the supported entry point and the
    # reply text is read from the returned message's `.content`.
    response = llm.invoke(prompt)
    return response.content.strip()


In [8]:

@tool
def extract_cash_flow_info(text: str) -> str:
    """
    Extracts and summarizes key cash flow information from the given financial document text:
    - Operating cash flow
    - Investing and financing cash flows
    - Free cash flow
    - Cash burn rate (if applicable)
    - Runway (in months)

    Args:
        text (str): Extracted text from financial document.

    Returns:
        str: A concise summary of the cash flow info as above.
    """
    # NOTE: the docstring above doubles as the tool description exposed through
    # the `tool` decorator, so its wording is intentionally left unchanged.
    prompt = f"""
                You are a financial analyst assistant. Extract and summarize the following cash flow information from the text below:

                1. Operating cash flow
                2. Investing and financing cash flows
                3. Free cash flow
                4. Cash burn rate (if mentioned)
                5. Runway in months (if mentioned or can be calculated)

                Provide the result as a clear, structured summary.

                Text:
                \"\"\"
                {text}
                \"\"\"
            """

    # `predict` is deprecated; `invoke` is the supported entry point and the
    # reply text is read from the returned message's `.content`.
    response = llm.invoke(prompt)
    return response.content


In [9]:

@tool
def extract_balance_sheet_highlights(text: str) -> str:
    """
    Extract key balance sheet highlights from the provided financial document text,
    including total assets and liabilities, cash and equivalents, debt (short-term and long-term),
    working capital, and net worth/shareholder equity.
    """
    # NOTE: the docstring above doubles as the tool description exposed through
    # the `tool` decorator, so its wording is intentionally left unchanged.

    # Instruction text sent to the model, with the document embedded at the end.
    instructions = f"""
                You are a financial analyst. Extract and summarize the following balance sheet highlights from the text below:

                - Total assets and liabilities
                - Cash and equivalents
                - Debt (short-term and long-term)
                - Working capital
                - Net worth / shareholder equity

                Provide the extracted values and short explanations if available.

                Text:
                \"\"\"{text}\"\"\"
            """

    # Only the text payload of the model's reply is handed back to the caller.
    return llm.invoke(instructions).content


In [None]:

@tool
def extract_funding_and_valuation(text: str) -> str:
    """
    Extracts key funding and valuation details from a financial document text:
    - Most recent valuation (if disclosed)
    - Total capital raised
    - Basic cap table information

    Args:
        text: The full extracted text of the financial document.

    Returns:
        A concise summary string containing the requested funding and valuation info.
    """
    # NOTE: the docstring above doubles as the tool description exposed through
    # the `tool` decorator, so its wording is intentionally left unchanged.
    prompt = f"""
                You are a financial analyst. Extract from the following text the most recent valuation (if disclosed), total capital raised, and basic cap table information.
                If any info is missing, say 'Not disclosed'.

                Text:
                {text}

                Provide the information clearly with labels.
            """
    response = llm.invoke(prompt)
    # `invoke` returns a message object; return its text so the result matches
    # the declared `str` return type instead of leaking the wrapper object.
    return response.content


In [None]:

def llm_extract_value(financial_text: str, target_name: str):
    """Ask the LLM for a single numeric value from a financial document.

    Args:
        financial_text: Full text of the financial document.
        target_name: Human-readable name of the item to look up, for example
            ``"total revenue"``.

    Returns:
        The value as a ``float``, or ``None`` when the model answers "NONE"
        or its answer cannot be parsed as a number.
    """
    prompt = f"""
        You are a helpful assistant extracting financial values from a financial document.

        Extract the numeric value (float) for the item named "{target_name}" from the following financial text.
        If the value is not explicitly found, respond with "NONE".

        Financial Text:
        \"\"\"{financial_text}\"\"\"
            
        Answer with only the number or NONE.
    """
    # The f-string above has already rendered the complete prompt, so it is
    # sent to the model as-is. Feeding it through a prompt template would
    # re-parse the document text as a template and break on any literal
    # "{" or "}" it contains.
    response = llm.invoke(prompt)
    answer = str(getattr(response, "content", response)).strip()

    if answer.upper() == "NONE":
        return None
    try:
        # Thousands separators are dropped before conversion ("1,234.5").
        return float(answer.replace(",", ""))
    except ValueError:
        # Anything that is not a bare number is treated as "value not found".
        return None


def _safe_ratio(numerator, denominator, scale=1):
    """Return ``numerator / denominator * scale``, or None when undefined.

    The result is None when the numerator is missing, or when the denominator
    is missing or zero.
    """
    if numerator is None or not denominator:
        return None
    return (numerator / denominator) * scale


@tool
def calculate_financial_ratios(
    financial_text: str,
    total_revenue: float = None,
    gross_profit: float = None,
    operating_profit: float = None,
    net_profit: float = None,
    cash_burn_rate: float = None,
    total_assets: float = None,
    total_liabilities: float = None,
    short_term_debt: float = None,
    long_term_debt: float = None,
    working_capital: float = None,
    shareholder_equity: float = None,
    free_cash_flow: float = None,
) -> dict:
    """
    Calculate key financial ratios based on extracted values from a financial document.
    If an important input value is missing, attempt to extract it from financial_text using LLM.
    If still missing, assign None.

    Returns:
    {
        "current_ratio": float or None,
        "quick_ratio": float or None,
        "debt_to_equity_ratio": float or None,
        "gross_margin": float or None,
        "operating_margin": float or None,
        "net_margin": float or None,
        "roe": float or None,
        "roa": float or None,
        "burn_multiple": float or None
    }
    """
    # NOTE: the docstring above doubles as the tool description exposed through
    # the `tool` decorator, so its wording is intentionally left unchanged.

    important_fields = {
        "total_revenue": total_revenue,
        "gross_profit": gross_profit,
        "operating_profit": operating_profit,
        "net_profit": net_profit,
        "cash_burn_rate": cash_burn_rate,
        "total_assets": total_assets,
        "total_liabilities": total_liabilities,
        "short_term_debt": short_term_debt,
        "long_term_debt": long_term_debt,
        "working_capital": working_capital,
        "shareholder_equity": shareholder_equity,
        "free_cash_flow": free_cash_flow,
    }

    # Ask the LLM only for the inputs the caller did not supply. The list of
    # missing names is taken up front so the dict is not written to while its
    # items are being iterated.
    missing = [name for name, value in important_fields.items() if value is None]
    for name in missing:
        important_fields[name] = llm_extract_value(
            financial_text, name.replace("_", " ")
        )

    total_revenue = important_fields["total_revenue"]
    gross_profit = important_fields["gross_profit"]
    operating_profit = important_fields["operating_profit"]
    net_profit = important_fields["net_profit"]
    cash_burn_rate = important_fields["cash_burn_rate"]
    total_assets = important_fields["total_assets"]
    total_liabilities = important_fields["total_liabilities"]
    short_term_debt = important_fields["short_term_debt"]
    long_term_debt = important_fields["long_term_debt"]
    working_capital = important_fields["working_capital"]
    shareholder_equity = important_fields["shareholder_equity"]
    free_cash_flow = important_fields["free_cash_flow"]

    # NOTE(review): current liabilities are approximated as total liabilities
    # minus long-term debt, which ignores other non-current liabilities.
    current_liabilities = None
    if total_liabilities is not None and long_term_debt is not None:
        current_liabilities = total_liabilities - long_term_debt

    # Working capital = current assets - current liabilities, rearranged.
    current_assets = None
    if working_capital is not None and current_liabilities is not None:
        current_assets = working_capital + current_liabilities

    current_ratio = _safe_ratio(current_assets, current_liabilities)

    # Never computed: none of the inputs isolates inventory or liquid assets.
    quick_ratio = None

    # Use whichever debt figures are known. When neither is known the ratio is
    # left undefined rather than being reported as a misleading 0.0.
    known_debts = [
        debt for debt in (short_term_debt, long_term_debt) if debt is not None
    ]
    total_debt = sum(known_debts) if known_debts else None
    debt_to_equity_ratio = _safe_ratio(total_debt, shareholder_equity)

    # Margins and returns are expressed as percentages.
    gross_margin = _safe_ratio(gross_profit, total_revenue, 100)
    operating_margin = _safe_ratio(operating_profit, total_revenue, 100)
    net_margin = _safe_ratio(net_profit, total_revenue, 100)
    roe = _safe_ratio(net_profit, shareholder_equity, 100)
    roa = _safe_ratio(net_profit, total_assets, 100)

    # NOTE(review): this divides burn by free cash flow; the usual definition
    # of burn multiple is net burn over net new ARR -- confirm the intent.
    burn_multiple = None
    if cash_burn_rate is not None and free_cash_flow:
        burn_multiple = abs(cash_burn_rate) / abs(free_cash_flow)

    return {
        "current_ratio": current_ratio,
        "quick_ratio": quick_ratio,
        "debt_to_equity_ratio": debt_to_equity_ratio,
        "gross_margin": gross_margin,
        "operating_margin": operating_margin,
        "net_margin": net_margin,
        "roe": roe,
        "roa": roa,
        "burn_multiple": burn_multiple,
    }


In [None]:
# Nested template listing the financial metrics handled by the functions in
# this file, with every value left as None. It is a bare expression that is
# not assigned to any name, so it has no effect beyond being displayed.
# NOTE(review): presumably the target schema for a consolidated report --
# confirm against whatever code is meant to fill it in.
{
  # Revenue and growth
  "total_revenue": {
    "yearly": None,
    "quarterly": None
  },
  "revenue_growth": {
    "YoY": None,
    "QoQ": None
  },
  "revenue_by_business_segment": {
    "segment_1": None,
    "segment_2": None,
    "segment_3": None
  },
  # Profitability
  "gross_profit": {
    "value": None,
    "margin_percent": None
  },
  "operating_profit": {
    "EBIT": None,
    "margin_percent": None
  },
  "net_profit": {
    "PAT": None,
    "margin_percent": None
  },
  "EBITDA": {
    "value": None,
    "adjusted_value": None
  },
  # Expenses
  "cost_of_goods_sold": None,
  "operating_expenses": {
    "SG&A": None,
    "R&D": None,
    "marketing": None,
    "total": None
  },
  "interest_expense": None,
  "depreciation_and_amortization": None,
  # Cash flow
  "operating_cash_flow": None,
  "investing_cash_flow": None,
  "financing_cash_flow": None,
  "free_cash_flow": None,
  "cash_burn_rate": None,
  "runway_months": None,
  # Balance sheet
  "total_assets": None,
  "total_liabilities": None,
  "cash_and_equivalents": None,
  "debt": {
    "short_term": None,
    "long_term": None,
    "total": None
  },
  "working_capital": None,
  "net_worth_shareholder_equity": None,
  # Funding and valuation
  "most_recent_valuation": None,
  "total_capital_raised": None,
  "cap_table": {
    "basic_level": None
  },
  # Ratios
  "liquidity_ratios": {
    "current_ratio": None,
    "quick_ratio": None
  },
  "debt_to_equity_ratio": None,
  "margins": {
    "gross_margin_percent": None,
    "operating_margin_percent": None,
    "net_margin_percent": None
  },
  "returns": {
    "return_on_equity_ROE_percent": None,
    "return_on_assets_ROA_percent": None
  },
  "burn_multiple": None
}
