<a href="https://colab.research.google.com/github/anamikam-772/Data-Analysis-Using-Big-Data-Tools/blob/main/Finance_chatbot_aml_v2_(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🏦 Finance Chatbot v3 - With Financial Statement Analysis

## New Features:
- **Financial Statement Analysis**: Upload financial statements (PDF, Excel, CSV) and get detailed analysis
- Automatic extraction of key financial metrics
- Risk identification and caution flags
- Critical points explanation for investors

## Existing Features:
- Sentiment Analysis
- Market Data Analysis
- Risk Analysis
- Technical Analysis
- 5-Agent Investment Recommendations

In [1]:
# ===================================================================
# CELL 1: INSTALL DEPENDENCIES
# ===================================================================
!pip install -q groq yfinance faiss-cpu sentence-transformers gradio pandas openpyxl PyPDF2 pdfplumber tabula-py

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.3/138.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# ===================================================================
# CELL 2: IMPORTS
# ===================================================================
import pandas as pd
import numpy as np
import faiss
import re
import os
import io
from groq import Groq
from sentence_transformers import SentenceTransformer, CrossEncoder
import yfinance as yf
import gradio as gr

# For file parsing
import PyPDF2
import pdfplumber
import openpyxl

print("✅ All imports successful!")

✅ All imports successful!


In [3]:
# ===================================================================
# CELL 3: API SETUP (Groq)
# ===================================================================
from google.colab import userdata

# Fetch the secret by name
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

client = Groq(api_key=GROQ_API_KEY)

def generate_response(prompt, max_tokens=400):
    try:
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content
    except Exception as e:
        if "429" in str(e):
            print("⚠️ Rate limited, switching to faster model...")
            response = client.chat.completions.create(
                model="llama-3.1-8b-instant",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=max_tokens,
            )
            return response.choices[0].message.content
        raise e

print("✓ Groq ready with auto-fallback!")

✓ Groq ready with auto-fallback!


In [4]:
# ===================================================================
# CELL 4: HUGGINGFACE LOGIN (for FinanceBench dataset)
# ===================================================================
from huggingface_hub import login

hf_token = userdata.get("HF_TOKEN")
login(token=hf_token)

print("✅ HuggingFace login successful!")

✅ HuggingFace login successful!


In [5]:
# ===================================================================
# CELL 5: LOAD RAG DATA (FinanceBench)
# ===================================================================
from datasets import load_dataset

# Load FinanceBench
ds = load_dataset("PatronusAI/financebench")
fb_data = ds["train"].to_pandas()

def extract_evidence(evidence_list):
    try:
        if evidence_list and len(evidence_list) > 0:
            return evidence_list[0].get('evidence_text', '')
        return ""
    except:
        return ""

df_financebench = pd.DataFrame({
    "COMPANY": fb_data["company"],
    "QUERY": fb_data["question"],
    "ANSWER": fb_data["answer"],
    "CONTEXT": fb_data["evidence"].apply(extract_evidence),
    "TYPE": "financebench"
})

df_financebench = df_financebench[df_financebench["CONTEXT"] != ""]
print(f"✅ FinanceBench: {len(df_financebench)} docs")

README.md: 0.00B [00:00, ?B/s]

financebench_merged.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/150 [00:00<?, ? examples/s]

✅ FinanceBench: 115 docs


In [6]:
# ===================================================================
# CELL 6: FETCH LIVE STOCK DATA
# ===================================================================
live_tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA', 'NVDA', 'META', 'JPM', 'V', 'WMT']

live_docs = []
for ticker in live_tickers:
    print(f"Fetching {ticker}...", end=" ")
    try:
        stock = yf.Ticker(ticker)
        info = stock.info

        live_docs.append({
            "COMPANY": ticker,
            "QUERY": f"What is {ticker}'s current PE ratio?",
            "ANSWER": f"{ticker}'s PE is {info.get('trailingPE')}",
            "CONTEXT": f"{ticker} LIVE VALUATION: PE {info.get('trailingPE')}, Forward PE {info.get('forwardPE')}, Market Cap ${info.get('marketCap', 0):,}, Price ${info.get('currentPrice')}, Sector {info.get('sector')}",
            "TYPE": "live_yahoo"
        })
        print("✅")
    except:
        print("❌")

df_live = pd.DataFrame(live_docs)
print(f"✅ Live data: {len(df_live)} docs")

Fetching AAPL... ✅
Fetching MSFT... ✅
Fetching GOOGL... ✅
Fetching AMZN... ✅
Fetching TSLA... ✅
Fetching NVDA... ✅
Fetching META... ✅
Fetching JPM... ✅
Fetching V... ✅
Fetching WMT... ✅
✅ Live data: 10 docs


In [7]:
# ===================================================================
# CELL 7: ADD FINANCIAL CONCEPTS
# ===================================================================
concepts = [
    {"COMPANY": "CONCEPT", "QUERY": "What is PE ratio?", "ANSWER": "Price/Earnings",
     "CONTEXT": "PE RATIO (Price-to-Earnings): Current price / EPS. Shows how much investors pay per $1 earnings. High PE >30 = growth stock. Low PE <15 = value stock. Compare within sector.", "TYPE": "concept"},
    {"COMPANY": "CONCEPT", "QUERY": "What is ROE?", "ANSWER": "Return on Equity",
     "CONTEXT": "ROE (Return on Equity): Net Income / Shareholder Equity. Measures profitability. >20% = excellent, 15-20% = good, <10% = poor. AAPL ROE ~150% (exceptional), average tech ~15-25%.", "TYPE": "concept"},
    {"COMPANY": "CONCEPT", "QUERY": "What is debt to equity?", "ANSWER": "D/E Ratio",
     "CONTEXT": "DEBT-TO-EQUITY: Total Debt / Equity. <1 = conservative, 1-2 = moderate, >2 = aggressive. Tech typically <1, Banks 8-12 (normal). Must compare within industry.", "TYPE": "concept"},
]

df_concepts = pd.DataFrame(concepts)

# Combine all data
df_rag = pd.concat([df_financebench, df_live, df_concepts], ignore_index=True)

# Create embeddings
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embed_model.encode(df_rag["CONTEXT"].tolist(), show_progress_bar=True)
embeddings = np.array(embeddings, dtype="float32")
faiss.normalize_L2(embeddings)

# Build FAISS index
index_rag = faiss.IndexFlatIP(embeddings.shape[1])
index_rag.add(embeddings)

# Save
df_rag.to_pickle("financebench_enhanced.pkl")
faiss.write_index(index_rag, "financebench_enhanced.faiss")

print(f"✅ Saved: {len(df_rag)} docs")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Saved: 128 docs


In [8]:
# ===================================================================
# CELL 8: LOAD RAG + RERANKER
# ===================================================================
df_rag = pd.read_pickle("financebench_enhanced.pkl")
index_rag = faiss.read_index("financebench_enhanced.faiss")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def retrieve_context(query, k=3):
    q_emb = embed_model.encode([query])
    q_emb = np.array(q_emb, dtype="float32")
    faiss.normalize_L2(q_emb)
    scores, idxs = index_rag.search(q_emb, k)
    snippets = [df_rag.iloc[i]["CONTEXT"] for i in idxs[0]]
    return "\n\n".join(snippets)

def retrieve_with_rerank(query, k=10, top_n=3):
    """Enhanced retrieval with reranking"""
    q_emb = embed_model.encode([query])
    q_emb = np.array(q_emb, dtype="float32")
    faiss.normalize_L2(q_emb)
    scores, idxs = index_rag.search(q_emb, k)
    candidates = [df_rag.iloc[i]["CONTEXT"] for i in idxs[0]]
    pairs = [[query, doc] for doc in candidates]
    rerank_scores = reranker.predict(pairs)
    top_indices = np.argsort(rerank_scores)[::-1][:top_n]
    best_docs = [candidates[i] for i in top_indices]
    return "\n\n".join(best_docs)

print(f"✅ RAG loaded: {len(df_rag)} docs, {index_rag.ntotal} vectors")
print("✅ Reranker loaded!")

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

✅ RAG loaded: 128 docs, 128 vectors
✅ Reranker loaded!


In [9]:
# ===================================================================
# CELL 9: HELPER FUNCTIONS
# ===================================================================
def get_ticker_data(ticker):
    try:
        t = yf.Ticker(ticker)
        info = t.info
        return {
            "symbol": ticker,
            "price": info.get("currentPrice") or info.get("regularMarketPrice"),
            "pe": info.get("trailingPE"),
            "forward_pe": info.get("forwardPE"),
            "sector": info.get("sector"),
            "market_cap": info.get("marketCap"),
            "52w_high": info.get("fiftyTwoWeekHigh"),
            "52w_low": info.get("fiftyTwoWeekLow"),
        }
    except:
        return None

def extract_ticker(text):
    candidates = re.findall(r"\b[A-Z]{2,5}\b", text)
    blacklist = {"WHAT", "IS", "ARE", "THE", "AND", "ETF", "STOCK", "HOW", "WHY", "PE", "ROE", "TELL", "PDF", "CSV", "XLSX"}
    tickers = [c for c in candidates if c not in blacklist]
    return tickers[0] if tickers else None

print("✅ Helper functions ready")

✅ Helper functions ready


In [10]:
# ===================================================================
# CELL 10: 🆕 FINANCIAL STATEMENT PARSER
# ===================================================================

class FinancialStatementParser:
    """Parses financial statements from various file formats"""

    def __init__(self):
        self.supported_formats = ['.pdf', '.xlsx', '.xls', '.csv']

    def parse_file(self, file_path):
        """Main entry point - detects file type and routes to appropriate parser"""
        if file_path is None:
            return None, "No file provided"

        # Get file extension
        ext = os.path.splitext(file_path)[1].lower()

        if ext == '.pdf':
            return self._parse_pdf(file_path)
        elif ext in ['.xlsx', '.xls']:
            return self._parse_excel(file_path)
        elif ext == '.csv':
            return self._parse_csv(file_path)
        else:
            return None, f"Unsupported file format: {ext}. Supported: {self.supported_formats}"

    def _parse_pdf(self, file_path):
        """Extract text and tables from PDF financial statements"""
        try:
            extracted_data = {
                'text': '',
                'tables': [],
                'file_type': 'PDF'
            }

            # Try pdfplumber first (better for tables)
            with pdfplumber.open(file_path) as pdf:
                full_text = []
                for page in pdf.pages:
                    # Extract text
                    text = page.extract_text()
                    if text:
                        full_text.append(text)

                    # Extract tables
                    tables = page.extract_tables()
                    for table in tables:
                        if table:
                            df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                            extracted_data['tables'].append(df)

                extracted_data['text'] = '\n'.join(full_text)

            return extracted_data, "PDF parsed successfully"

        except Exception as e:
            # Fallback to PyPDF2 for simple text extraction
            try:
                with open(file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    text = ''
                    for page in pdf_reader.pages:
                        text += page.extract_text() + '\n'

                return {'text': text, 'tables': [], 'file_type': 'PDF'}, "PDF parsed with basic extraction"
            except Exception as e2:
                return None, f"PDF parsing error: {str(e2)}"

    def _parse_excel(self, file_path):
        """Parse Excel financial statements"""
        try:
            extracted_data = {
                'text': '',
                'tables': [],
                'sheets': {},
                'file_type': 'Excel'
            }

            # Read all sheets
            xlsx = pd.ExcelFile(file_path)

            for sheet_name in xlsx.sheet_names:
                df = pd.read_excel(xlsx, sheet_name=sheet_name)
                extracted_data['sheets'][sheet_name] = df
                extracted_data['tables'].append(df)

                # Convert to text summary
                extracted_data['text'] += f"\n=== Sheet: {sheet_name} ===\n"
                extracted_data['text'] += df.to_string() + "\n"

            return extracted_data, f"Excel parsed successfully ({len(xlsx.sheet_names)} sheets)"

        except Exception as e:
            return None, f"Excel parsing error: {str(e)}"

    def _parse_csv(self, file_path):
        """Parse CSV financial data"""
        try:
            df = pd.read_csv(file_path)

            extracted_data = {
                'text': df.to_string(),
                'tables': [df],
                'file_type': 'CSV'
            }

            return extracted_data, f"CSV parsed successfully ({len(df)} rows, {len(df.columns)} columns)"

        except Exception as e:
            return None, f"CSV parsing error: {str(e)}"

    def extract_financial_metrics(self, extracted_data):
        """Extract key financial metrics from parsed data"""
        if not extracted_data:
            return {}

        text = extracted_data.get('text', '').lower()
        metrics = {}

        # Common financial terms to look for
        patterns = {
            'revenue': r'(?:total\s+)?revenue[:\s]+[\$]?([\d,]+(?:\.\d+)?)',
            'net_income': r'net\s+income[:\s]+[\$]?([\d,]+(?:\.\d+)?)',
            'total_assets': r'total\s+assets[:\s]+[\$]?([\d,]+(?:\.\d+)?)',
            'total_liabilities': r'total\s+liabilities[:\s]+[\$]?([\d,]+(?:\.\d+)?)',
            'shareholders_equity': r'(?:shareholders|stockholders)\s+equity[:\s]+[\$]?([\d,]+(?:\.\d+)?)',
            'operating_income': r'operating\s+income[:\s]+[\$]?([\d,]+(?:\.\d+)?)',
            'gross_profit': r'gross\s+profit[:\s]+[\$]?([\d,]+(?:\.\d+)?)',
            'cash': r'cash\s+and\s+cash\s+equivalents[:\s]+[\$]?([\d,]+(?:\.\d+)?)',
            'total_debt': r'total\s+debt[:\s]+[\$]?([\d,]+(?:\.\d+)?)',
            'eps': r'(?:earnings|eps)\s+per\s+share[:\s]+[\$]?([\d,]+(?:\.\d+)?)',
        }

        for metric_name, pattern in patterns.items():
            match = re.search(pattern, text)
            if match:
                try:
                    value = match.group(1).replace(',', '')
                    metrics[metric_name] = float(value)
                except:
                    pass

        # Also check tables for metrics
        for table in extracted_data.get('tables', []):
            if isinstance(table, pd.DataFrame):
                metrics.update(self._extract_metrics_from_table(table))

        return metrics

    def _extract_metrics_from_table(self, df):
        """Extract metrics from a DataFrame"""
        metrics = {}

        # Normalize column names and index
        df.columns = df.columns.astype(str).str.lower().str.strip()

        # Common row labels to look for
        key_rows = {
            'revenue': ['revenue', 'total revenue', 'net sales', 'sales'],
            'net_income': ['net income', 'net profit', 'profit after tax'],
            'total_assets': ['total assets'],
            'total_liabilities': ['total liabilities'],
            'operating_income': ['operating income', 'income from operations'],
        }

        try:
            # Check first column for labels
            for idx, row in df.iterrows():
                first_col = str(row.iloc[0]).lower().strip() if len(row) > 0 else ''
                for metric_name, labels in key_rows.items():
                    if any(label in first_col for label in labels):
                        # Try to get the most recent value (usually last column with data)
                        for col_idx in range(len(row) - 1, 0, -1):
                            try:
                                val = str(row.iloc[col_idx]).replace(',', '').replace('$', '').strip()
                                if val and val != 'nan':
                                    metrics[metric_name] = float(val)
                                    break
                            except:
                                continue
        except:
            pass

        return metrics


# Initialize parser
statement_parser = FinancialStatementParser()
print("✅ Financial Statement Parser initialized!")

✅ Financial Statement Parser initialized!


In [11]:
# ===================================================================
# CELL 11: 🆕 FINANCIAL STATEMENT ANALYST AGENT
# ===================================================================

class FinancialStatementAnalyst:
    """Agent that analyzes uploaded financial statements"""

    def __init__(self):
        self.name = "Financial Statement Analyst"
        self.role = (
            "You are an expert financial statement analyst. You analyze balance sheets, "
            "income statements, and cash flow statements. You identify key metrics, "
            "red flags, strengths, and provide actionable insights for investors."
        )
        self.parser = FinancialStatementParser()

    def analyze(self, file_path, user_question=""):
        """Main analysis function"""

        # Parse the file
        extracted_data, parse_status = self.parser.parse_file(file_path)

        if extracted_data is None:
            return f"❌ Error: {parse_status}"

        # Extract metrics
        metrics = self.parser.extract_financial_metrics(extracted_data)

        # Get text content (truncated for LLM)
        text_content = extracted_data.get('text', '')[:8000]  # Limit to ~8k chars

        # Build analysis prompt
        prompt = self._build_analysis_prompt(text_content, metrics, user_question)

        # Generate analysis
        analysis = generate_response(prompt, max_tokens=1000)

        return analysis

    def _build_analysis_prompt(self, text_content, metrics, user_question):
        """Construct the prompt for analysis"""

        metrics_summary = ""
        if metrics:
            metrics_summary = "\n📊 EXTRACTED METRICS:\n"
            for k, v in metrics.items():
                metrics_summary += f"  - {k.replace('_', ' ').title()}: ${v:,.2f}\n"

        prompt = f"""{self.role}

📄 FINANCIAL STATEMENT CONTENT:
{text_content}

{metrics_summary}

{f"USER QUESTION: {user_question}" if user_question else ""}

Please provide a comprehensive analysis with the following sections:

**1. 📋 STATEMENT OVERVIEW**
- What type of financial statement is this? (Balance Sheet, Income Statement, Cash Flow, or combination)
- What company/entity does it belong to?
- What time period does it cover?

**2. 💰 KEY FINANCIAL METRICS**
- List the most important financial figures found
- Calculate key ratios if data is available (ROE, Debt/Equity, Profit Margin, Current Ratio, etc.)
- Compare to industry benchmarks where possible

**3. 💪 STRENGTHS & POSITIVE INDICATORS**
- What looks good in these financials?
- Signs of financial health
- Positive trends

**4. ⚠️ CAUTIONS & RED FLAGS**
- What concerns should investors be aware of?
- Warning signs or potential risks
- Areas that need monitoring

**5. 🎯 CRUCIAL POINTS FOR INVESTORS**
- The 3-5 most important takeaways
- What this means for investment decisions
- Questions investors should ask management

**6. 📈 RECOMMENDATIONS**
- Overall financial health assessment (Strong/Moderate/Weak)
- What to watch in future reports
- Suggested next steps for due diligence

Be specific with numbers and provide actionable insights. If certain data is missing or unclear, state that explicitly.
"""

        return prompt

    def get_quick_summary(self, file_path):
        """Get a brief summary of the financial statement"""
        extracted_data, parse_status = self.parser.parse_file(file_path)

        if extracted_data is None:
            return f"Error: {parse_status}"

        metrics = self.parser.extract_financial_metrics(extracted_data)
        text_preview = extracted_data.get('text', '')[:2000]

        prompt = f"""Provide a 3-sentence summary of this financial statement:

{text_preview}

Metrics found: {metrics}

Focus on: Company name, time period, and overall financial health."""

        return generate_response(prompt, max_tokens=200)


# Initialize the agent
statement_analyst = FinancialStatementAnalyst()
print("✅ Financial Statement Analyst Agent initialized!")

✅ Financial Statement Analyst Agent initialized!


In [12]:
# ===================================================================
# CELL 12: EXISTING AGENTS (Fundamental, Market, Risk, Technical, Chief)
# ===================================================================

class FundamentalAnalyst:
    def __init__(self):
        self.name = "Fundamental Analyst"
        self.role = (
            "You are a fundamental analyst who evaluates company financials. "
            "You analyze: P/E ratios, profit margins, ROE, debt levels, revenue growth. "
            "You determine if a stock is overvalued or undervalued based on fundamentals."
        )

    def analyze(self, question, ticker):
        if not ticker:
            return "No ticker symbol detected for fundamental analysis."

        try:
            rag_query = f"{ticker} fundamental analysis financial metrics PE ratio ROE profitability"
            rag_context = retrieve_with_rerank(rag_query, k=10, top_n=3)

            stock = yf.Ticker(ticker)
            info = stock.info

            pe_ratio = info.get('trailingPE', 'N/A')
            forward_pe = info.get('forwardPE', 'N/A')
            profit_margin = info.get('profitMargins', 'N/A')
            profit_margin_pct = f"{profit_margin * 100:.1f}%" if profit_margin != 'N/A' else 'N/A'

            roe = info.get('returnOnEquity', 'N/A')
            debt_to_equity = info.get('debtToEquity', 'N/A')
            revenue_growth = info.get('revenueGrowth', 'N/A')
            market_cap = info.get('marketCap', 'N/A')
            current_price = info.get('currentPrice', 'N/A')

            roe_pct = f"{roe * 100:.1f}%" if roe != 'N/A' else 'N/A'
            revenue_growth_pct = f"{revenue_growth * 100:.1f}%" if revenue_growth != 'N/A' else 'N/A'
            market_cap_fmt = f"${market_cap / 1e9:.1f}B" if market_cap != 'N/A' and market_cap < 1e12 else f"${market_cap / 1e12:.2f}T" if market_cap != 'N/A' else 'N/A'

            fundamental_summary = f"""
Symbol: {ticker}
Current Price: ${current_price}
Market Cap: {market_cap_fmt}

PRIMARY LIVE DATA:
- P/E Ratio (Trailing): {pe_ratio}
- P/E Ratio (Forward): {forward_pe}
- Profit Margin: {profit_margin_pct}

COMPREHENSIVE FINANCIAL PROFILE:
- ROE (Return on Equity): {roe_pct}
- Debt-to-Equity Ratio: {debt_to_equity}
- Revenue Growth: {revenue_growth_pct}
"""

            prompt = f"""{self.role}

QUESTION: {question}

📚 HISTORICAL BENCHMARKS (from knowledge base):
{rag_context}

📊 CURRENT LIVE DATA (Yahoo Finance):
{fundamental_summary}

Provide your fundamental analysis covering:
1. Valuation assessment (vs historical benchmarks)
2. Financial health (ROE, debt analysis)
3. Growth trajectory
4. Overall stance: Overvalued/Undervalued/Fair Value

Keep it under 150 words. Use specific numbers."""

            return generate_response(prompt)

        except Exception as e:
            return f"Fundamental analysis error for {ticker}: {str(e)}"


class MarketDataAnalyst:
    def __init__(self):
        self.name = "Market Data Analyst"
        self.role = (
            "You are a market data analyst who tracks real-time stock prices, volumes, and trends. "
            "You compare current prices to 52-week highs/lows and sector averages. "
            "You identify if a stock is near support/resistance levels."
        )

    def analyze(self, question, ticker):
        if not ticker:
            return "No ticker detected for market analysis."

        try:
            rag_query = f"{ticker} market data stock price valuation sector trends historical performance"
            rag_context = retrieve_with_rerank(rag_query, k=10, top_n=3)

            stock = yf.Ticker(ticker)
            info = stock.info

            current_price = info.get('currentPrice') or info.get('regularMarketPrice', 'N/A')
            high_52w = info.get('fiftyTwoWeekHigh', 'N/A')
            low_52w = info.get('fiftyTwoWeekLow', 'N/A')
            pe = info.get('trailingPE', 'N/A')
            forward_pe = info.get('forwardPE', 'N/A')
            sector = info.get('sector', 'Unknown')

            if current_price != 'N/A' and high_52w != 'N/A' and low_52w != 'N/A':
                range_position = ((current_price - low_52w) / (high_52w - low_52w)) * 100
            else:
                range_position = 'N/A'

            market_summary = f"""
Symbol: {ticker}
Sector: {sector}
Current Price: ${current_price}
52-Week High: ${high_52w}
52-Week Low: ${low_52w}
Position in 52W Range: {range_position:.1f}% (0%=low, 100%=high)
PE Ratio: {pe}
Forward PE: {forward_pe}
"""

            prompt = f"""{self.role}

QUESTION: {question}

📚 HISTORICAL MARKET CONTEXT (from knowledge base):
{rag_context}

📊 CURRENT MARKET DATA (Live):
{market_summary}

Provide your market analysis covering:
1. Price positioning (relative to 52W range and historical patterns)
2. Valuation assessment (PE vs sector averages from knowledge base)
3. Market momentum and sector trends

Keep it under 150 words."""

            return generate_response(prompt)

        except Exception as e:
            return f"Market analysis error for {ticker}: {str(e)}"


class RiskAnalyst:
    def __init__(self):
        self.name = "Risk Analyst"
        self.role = (
            "You are a risk analyst whose job is to identify what could go wrong. "
            "You focus on: market volatility, regulatory risks, competition, macroeconomic threats. "
            "You play devil's advocate."
        )

    def analyze(self, question, ticker=None):
        ticker_context = f"for {ticker}" if ticker else ""

        if ticker:
            rag_query = f"{ticker} risks challenges regulatory competition threats downturn vulnerabilities"
            rag_context = retrieve_with_rerank(rag_query, k=10, top_n=3)
        else:
            rag_query = f"investment risks market volatility economic threats"
            rag_context = retrieve_with_rerank(rag_query, k=10, top_n=3)

        prompt = f"""{self.role}

QUESTION: {question}

📚 HISTORICAL RISK PATTERNS (from knowledge base):
{rag_context}

Provide a risk assessment {ticker_context} covering:
1. Top 3 specific risks - Cite historical precedents if available
2. Bear case scenario - Based on historical downturns
3. Risk rating: Low/Medium/High - Justify with data

Keep it under 150 words. Be skeptical but DATA-DRIVEN."""

        return generate_response(prompt)


class TechnicalAnalyst:
    def __init__(self):
        self.name = "Technical Analyst"
        self.role = (
            "You are a technical analyst who studies price charts and momentum indicators. "
            "You analyze: moving averages (50-day, 200-day), RSI, price trends, support/resistance. "
            "You identify bullish or bearish technical patterns."
        )

    def analyze(self, question, ticker):
        if not ticker:
            return "No ticker symbol detected for technical analysis."

        try:
            rag_query = f"{ticker} technical analysis chart patterns support resistance RSI moving average"
            rag_context = retrieve_with_rerank(rag_query, k=10, top_n=3)

            stock = yf.Ticker(ticker)
            hist = stock.history(period="1y")

            if hist.empty:
                return f"Could not retrieve historical data for {ticker}."

            current_price = hist['Close'].iloc[-1]
            ma_50 = hist['Close'].rolling(window=50).mean().iloc[-1] if len(hist) >= 50 else None
            ma_200 = hist['Close'].rolling(window=200).mean().iloc[-1] if len(hist) >= 200 else None

            delta = hist['Close'].diff()
            gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
            loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
            rs = gain / loss
            rsi = 100 - (100 / (1 + rs))
            current_rsi = rsi.iloc[-1]

            price_1mo_ago = hist['Close'].iloc[-22] if len(hist) >= 22 else hist['Close'].iloc[0]
            momentum_1mo = ((current_price - price_1mo_ago) / price_1mo_ago) * 100

            high_52w = hist['High'].max()
            low_52w = hist['Low'].min()
            position_in_range = ((current_price - low_52w) / (high_52w - low_52w)) * 100

            technical_summary = f"""
Symbol: {ticker}
Current Price: ${current_price:.2f}

MOVING AVERAGES:
- 50-Day MA: ${ma_50:.2f if ma_50 else 'N/A'}
- 200-Day MA: ${ma_200:.2f if ma_200 else 'N/A'}

MOMENTUM INDICATORS:
- RSI (14-day): {current_rsi:.1f} {'(Overbought)' if current_rsi > 70 else '(Oversold)' if current_rsi < 30 else '(Neutral)'}
- 1-Month Return: {momentum_1mo:+.1f}%
- Position in 52W Range: {position_in_range:.0f}%

TREND:
- Price vs 50-MA: {'Above (Bullish)' if ma_50 and current_price > ma_50 else 'Below (Bearish)' if ma_50 else 'N/A'}
- Price vs 200-MA: {'Above (Bullish)' if ma_200 and current_price > ma_200 else 'Below (Bearish)' if ma_200 else 'N/A'}
- Golden Cross: {'Yes' if ma_50 and ma_200 and ma_50 > ma_200 else 'No'}
"""

            prompt = f"""{self.role}

QUESTION: {question}

📚 HISTORICAL TECHNICAL PATTERNS (from knowledge base):
{rag_context}

📊 CURRENT TECHNICAL DATA:
{technical_summary}

Provide your technical analysis:
1. Overall trend (bullish/bearish/neutral)
2. Key support/resistance levels
3. Momentum interpretation
4. Entry/exit recommendation

Keep it under 150 words."""

            return generate_response(prompt)

        except Exception as e:
            return f"Technical analysis error for {ticker}: {str(e)}"


class ChiefAnalyst:
    def __init__(self):
        self.name = "Chief Investment Officer"
        self.role = (
            "You are the Chief Investment Officer who synthesizes all analyst inputs. "
            "You make the FINAL investment decision: BUY, SELL, or HOLD. "
            "You provide a conviction score (1-10) and a clear, actionable recommendation."
        )

    def synthesize(self, question, ticker, fundamental, market, risk, technical):
        if ticker:
            rag_query = f"{ticker} investment recommendation analyst consensus price targets"
            rag_context = retrieve_with_rerank(rag_query, k=10, top_n=3)
        else:
            rag_context = "No specific historical data available."

        prompt = f"""{self.role}

QUESTION: {question}

📚 HISTORICAL CONTEXT:
{rag_context}

ANALYST INPUTS:

FUNDAMENTAL ANALYST:
{fundamental}

MARKET DATA ANALYST:
{market}

RISK ANALYST:
{risk}

TECHNICAL ANALYST:
{technical}

SCORING: Score each analyst as Bullish (+1), Neutral (0), or Bearish (-1), then sum.
- Total +2 or higher → BUY (conviction 7-9/10)
- Total +1 → BUY with conditions (conviction 6-7/10)
- Total 0 → HOLD (conviction 5-6/10)
- Total -1 → SELL with conditions (conviction 6-7/10)
- Total -2 or lower → SELL (conviction 7-9/10)

**FINAL RECOMMENDATION:**
1. **Rating:** [BUY/SELL/HOLD]
2. **Conviction:** [X/10]
3. **Analyst Consensus:** [Show scoring]
4. **Key reasoning:** [2 sentences]
5. **Action plan:** [Specific entry/exit prices, stop-loss, timeline]

Keep under 130 words. Be decisive and specific."""

        return generate_response(prompt)


# Initialize all agents
agent1 = FundamentalAnalyst()
agent2 = MarketDataAnalyst()
agent3 = RiskAnalyst()
agent4_technical = TechnicalAnalyst()
moderator = ChiefAnalyst()

print("✅ All 5 investment agents initialized!")

✅ All 5 investment agents initialized!


In [13]:
# ===================================================================
# CELL 13: VALIDATION & ORCHESTRATION FUNCTIONS
# ===================================================================

def validate_analysis(fundamental_output):
    """Validates fundamental analysis for common interpretation errors."""
    warnings = []
    if "roe" in fundamental_output.lower():
        if any(phrase in fundamental_output.lower() for phrase in
               ["low roe", "poor roe", "weak roe", "concerning roe"]):
            roe_match = re.search(r'roe[:\s]+(\d+\.?\d*)%', fundamental_output.lower())
            if roe_match:
                roe_val = float(roe_match.group(1))
                if roe_val > 20:
                    warnings.append(f"⚠️ WARNING: ROE of {roe_val}% is EXCELLENT (>20%), but analysis calls it concerning.")
    return warnings


def validate_all_outputs(result):
    """Enhanced validation for all agents"""
    warnings = []
    fund_warnings = validate_analysis(result['fundamental'])
    warnings.extend(fund_warnings)

    if 'risk' in result:
        risk_text = result['risk'].lower()
        if not any(year in risk_text for year in ['2008', '2009', '2018', '2019', '2020', '2021', '2022', '2023', '2024', '2025']):
            warnings.append("⚠️ Risk analysis lacks specific historical dates")

    if 'final' in result:
        final_text = result['final']
        if 'timeline' not in final_text.lower() and 'month' not in final_text.lower() and 'week' not in final_text.lower():
            warnings.append("⚠️ Final recommendation missing timeline")

    return warnings


def run_analyst_team(question):
    """Orchestrates all 5 agents to analyze an investment question."""
    print("=" * 60)
    print(f"INVESTMENT QUESTION: {question}")
    print("=" * 60)

    ticker = extract_ticker(question)

    if not ticker:
        print("⚠️  No ticker detected in question\n")
    else:
        print(f"📊 Detected ticker: {ticker}\n")

    print("💼 Fundamental Analyst is analyzing...\n")
    fundamental = agent1.analyze(question, ticker)
    print(fundamental)
    print("\n" + "-" * 60)

    print("📈 Market Data Analyst is analyzing...\n")
    market = agent2.analyze(question, ticker)
    print(market)
    print("\n" + "-" * 60)

    print("⚠️  Risk Analyst is analyzing...\n")
    risk = agent3.analyze(question, ticker)
    print(risk)
    print("\n" + "-" * 60)

    print("📊 Technical Analyst is analyzing...\n")
    technical = agent4_technical.analyze(question, ticker)
    print(technical)
    print("\n" + "-" * 60)

    print("🎯 Chief Investment Officer is synthesizing...\n")
    final = moderator.synthesize(question, ticker, fundamental, market, risk, technical)
    print(final)
    print("\n" + "=" * 60)

    result = {
        'ticker': ticker,
        'fundamental': fundamental,
        'market': market,
        'risk': risk,
        'technical': technical,
        'final': final
    }

    print("\n🔍 RUNNING VALIDATION CHECKS")
    print("=" * 60)

    warnings = validate_all_outputs(result)
    if warnings:
        print("\n⚠️  VALIDATION WARNINGS FOUND:")
        for warning in warnings:
            print(f"   {warning}")
    else:
        print("✅ All validation checks passed!")

    print("=" * 60 + "\n")

    return result


print("✅ Validation and orchestration functions ready")

✅ Validation and orchestration functions ready


In [14]:
# ===================================================================
# CELL 14: 🆕 ENHANCED GRADIO UI WITH FILE UPLOAD
# ===================================================================

custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap');

.gradio-container {
    font-family: 'Inter', sans-serif !important;
    max-width: 1400px !important;
    margin: auto !important;
}

.main-header {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 2rem;
    border-radius: 15px;
    margin-bottom: 2rem;
    box-shadow: 0 8px 32px rgba(102, 126, 234, 0.3);
}

.main-title {
    font-size: 2.5rem;
    font-weight: 700;
    color: white;
    margin: 0;
    text-align: center;
}

.subtitle {
    font-size: 1.1rem;
    color: rgba(255,255,255,0.9);
    text-align: center;
    margin-top: 0.5rem;
}

.agent-card {
    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
    border-radius: 12px;
    padding: 1.5rem;
    margin: 1rem 0;
    box-shadow: 0 4px 15px rgba(0,0,0,0.1);
    border-left: 5px solid;
}

.fundamental-card { border-left-color: #3498db; }
.market-card { border-left-color: #2ecc71; }
.risk-card { border-left-color: #e74c3c; }
.technical-card { border-left-color: #9b59b6; }
.statement-card { border-left-color: #1abc9c; }
.final-card {
    border-left-color: #f39c12;
    background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
}

.analyze-btn {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: 600 !important;
    font-size: 1.1rem !important;
    padding: 0.8rem 2rem !important;
    border-radius: 10px !important;
    box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important;
}

.tab-selected {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
}
"""

def format_stock_output(result):
    """Format the multi-agent output for stock analysis"""
    def markdown_to_html(text):
        text = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', text)
        text = text.replace('\n', '<br>')
        return text

    fundamental_html = markdown_to_html(result['fundamental'])
    market_html = markdown_to_html(result['market'])
    risk_html = markdown_to_html(result['risk'])
    technical_html = markdown_to_html(result['technical'])
    final_html = markdown_to_html(result['final'])

    output = f"""
<div class="main-header">
    <h1 class="main-title">📊 Investment Analysis Report</h1>
    <p class="subtitle">Powered by 5 AI Specialist Agents</p>
</div>

<div class="agent-card fundamental-card">
    <h2 style="color: #3498db; margin-top: 0;">💼 Fundamental Analysis</h2>
    <div class="agent-output">{fundamental_html}</div>
</div>

<div class="agent-card market-card">
    <h2 style="color: #2ecc71; margin-top: 0;">📈 Market Data Analysis</h2>
    <div class="agent-output">{market_html}</div>
</div>

<div class="agent-card risk-card">
    <h2 style="color: #e74c3c; margin-top: 0;">⚠️ Risk Analysis</h2>
    <div class="agent-output">{risk_html}</div>
</div>

<div class="agent-card technical-card">
    <h2 style="color: #9b59b6; margin-top: 0;">📊 Technical Analysis</h2>
    <div class="agent-output">{technical_html}</div>
</div>

<div class="agent-card final-card">
    <h2 style="color: #e67e22; margin-top: 0;">🎯 Final Recommendation</h2>
    <div class="agent-output">{final_html}</div>
</div>
"""
    return output


def format_statement_output(analysis):
    """Format the financial statement analysis output"""
    analysis_html = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', analysis)
    analysis_html = analysis_html.replace('\n', '<br>')

    output = f"""
<div class="main-header">
    <h1 class="main-title">📑 Financial Statement Analysis</h1>
    <p class="subtitle">Powered by AI Financial Statement Analyst</p>
</div>

<div class="agent-card statement-card">
    <h2 style="color: #1abc9c; margin-top: 0;">📋 Analysis Report</h2>
    <div class="agent-output">{analysis_html}</div>
</div>
"""
    return output


def analyze_stock_ui(question):
    """UI function for stock analysis"""
    if not question.strip():
        return "<p style='color: red;'>Please enter an investment question.</p>"
    try:
        result = run_analyst_team(question)
        return format_stock_output(result)
    except Exception as e:
        return f"<p style='color: red;'>Error: {str(e)}</p>"


def analyze_statement_ui(file, question):
    """UI function for financial statement analysis"""
    if file is None:
        return "<p style='color: red;'>Please upload a financial statement file (PDF, Excel, or CSV).</p>"
    try:
        analysis = statement_analyst.analyze(file.name, question)
        return format_statement_output(analysis)
    except Exception as e:
        return f"<p style='color: red;'>Error analyzing file: {str(e)}</p>"


# Create Gradio Interface
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(
    primary_hue="purple",
    secondary_hue="blue"
)) as demo:

    # Header
    gr.HTML("""
    <div class="main-header">
        <h1 class="main-title">🏦 AI Finance Chatbot v3</h1>
        <p class="subtitle">Stock Analysis • Market Data • Risk Assessment • Financial Statement Analysis</p>
    </div>
    """)

    # Tabs for different functionalities
    with gr.Tabs():

        # Tab 1: Stock Analysis
        with gr.TabItem("📈 Stock Analysis", id="stock_tab"):
            gr.Markdown("### Ask about any stock for a comprehensive 5-agent analysis")

            with gr.Row():
                with gr.Column(scale=4):
                    stock_question = gr.Textbox(
                        placeholder="e.g., Should I invest in AAPL? Is TSLA overvalued? What's your analysis on NVDA?",
                        label="Your Investment Question",
                        lines=2
                    )
                with gr.Column(scale=1):
                    stock_btn = gr.Button("🔍 Analyze Stock", variant="primary", elem_classes="analyze-btn")

            gr.HTML("<h4>💡 Quick Examples:</h4>")
            gr.Examples(
                examples=[
                    ["Should I invest in AAPL?"],
                    ["Is TSLA a good buy right now?"],
                    ["What's your analysis on NVDA?"],
                    ["Should I buy MSFT stock?"],
                    ["Is GOOGL overvalued?"],
                ],
                inputs=stock_question,
                label=None
            )

            stock_output = gr.HTML(label="Analysis Report")

            stock_btn.click(
                fn=analyze_stock_ui,
                inputs=stock_question,
                outputs=stock_output
            )

        # Tab 2: Financial Statement Analysis (NEW)
        with gr.TabItem("📑 Financial Statement Analysis", id="statement_tab"):
            gr.Markdown("""
            ### Upload a financial statement for detailed analysis

            **Supported formats:** PDF, Excel (.xlsx, .xls), CSV

            **What you'll get:**
            - 📋 Statement Overview (type, company, time period)
            - 💰 Key Financial Metrics & Ratios
            - 💪 Strengths & Positive Indicators
            - ⚠️ Cautions & Red Flags
            - 🎯 Crucial Points for Investors
            - 📈 Recommendations
            """)

            with gr.Row():
                with gr.Column(scale=2):
                    file_upload = gr.File(
                        label="Upload Financial Statement",
                        file_types=[".pdf", ".xlsx", ".xls", ".csv"],
                        type="filepath"
                    )
                with gr.Column(scale=2):
                    statement_question = gr.Textbox(
                        placeholder="(Optional) Any specific question about the statement? e.g., 'What is the debt situation?' or 'Is the company profitable?'",
                        label="Specific Question (Optional)",
                        lines=2
                    )

            statement_btn = gr.Button("📊 Analyze Statement", variant="primary", elem_classes="analyze-btn")

            gr.Markdown("""
            **Example files you can upload:**
            - Annual Reports (10-K)
            - Quarterly Reports (10-Q)
            - Balance Sheets
            - Income Statements
            - Cash Flow Statements
            - Company Financial Summaries
            """)

            statement_output = gr.HTML(label="Statement Analysis")

            statement_btn.click(
                fn=analyze_statement_ui,
                inputs=[file_upload, statement_question],
                outputs=statement_output
            )

        # Tab 3: About
        with gr.TabItem("ℹ️ About", id="about_tab"):
            gr.Markdown("""
            ## 🏦 AI Finance Chatbot v3

            ### Features:

            **📈 Stock Analysis (5-Agent System)**
            - 💼 **Fundamental Analyst**: P/E ratios, profit margins, ROE, debt levels
            - 📈 **Market Data Analyst**: Price trends, 52-week range, sector comparison
            - ⚠️ **Risk Analyst**: Market risks, regulatory risks, bear case scenarios
            - 📊 **Technical Analyst**: Moving averages, RSI, support/resistance levels
            - 🎯 **Chief Investment Officer**: Final BUY/SELL/HOLD recommendation

            **📑 Financial Statement Analysis (NEW!)**
            - Upload PDF, Excel, or CSV financial statements
            - Automatic extraction of key metrics
            - Identification of strengths and red flags
            - Crucial points explanation for investors

            ### Data Sources:
            - **Live Data**: Yahoo Finance (real-time prices, fundamentals)
            - **Knowledge Base**: FinanceBench dataset (150+ financial documents)
            - **RAG System**: Retrieval-Augmented Generation with reranking

            ### Technologies:
            - LLM: Groq (Llama 3.3 70B)
            - Embeddings: Sentence Transformers
            - Vector DB: FAISS
            - Reranker: Cross-Encoder

            ---
            **Disclaimer**: This tool provides AI-generated analysis for educational purposes only.
            It is not financial advice. Always consult a qualified financial advisor before making investment decisions.
            """)

# Launch the app
demo.launch(
    share=True,
    server_name="0.0.0.0",
    server_port=7860,
    show_error=True
)

print("✅ Enhanced UI with Financial Statement Analysis launched!")
print("🌐 Local URL: http://localhost:7860")

  with gr.Blocks(css=custom_css, theme=gr.themes.Soft(
  with gr.Blocks(css=custom_css, theme=gr.themes.Soft(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fa1f989497c9ba7e10.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


✅ Enhanced UI with Financial Statement Analysis launched!
🌐 Local URL: http://localhost:7860


In [15]:
# ===================================================================
# CELL 15: CLOSE GRADIO (Run when done)
# ===================================================================
demo.close()
print("✅ Gradio app closed")
print("✅ Port 7860 is now free")

Closing server running on port: 7860
✅ Gradio app closed
✅ Port 7860 is now free


## 🧪 Testing Cells

Use these cells to test individual components

In [16]:
# ===================================================================
# TEST: Financial Statement Analyst (without file)
# ===================================================================

# Test with sample text
sample_statement = """
ACME Corporation
Balance Sheet - Year Ended December 31, 2024

ASSETS
Current Assets:
  Cash and Cash Equivalents: $5,200,000
  Accounts Receivable: $3,800,000
  Inventory: $2,100,000
  Total Current Assets: $11,100,000

Non-Current Assets:
  Property, Plant & Equipment: $15,000,000
  Intangible Assets: $2,500,000
  Total Non-Current Assets: $17,500,000

TOTAL ASSETS: $28,600,000

LIABILITIES
Current Liabilities:
  Accounts Payable: $2,200,000
  Short-term Debt: $1,500,000
  Total Current Liabilities: $3,700,000

Long-term Liabilities:
  Long-term Debt: $8,000,000
  Total Long-term Liabilities: $8,000,000

TOTAL LIABILITIES: $11,700,000

SHAREHOLDERS' EQUITY
  Common Stock: $5,000,000
  Retained Earnings: $11,900,000
  Total Shareholders' Equity: $16,900,000

TOTAL LIABILITIES AND EQUITY: $28,600,000

INCOME STATEMENT - Year Ended December 31, 2024
Total Revenue: $45,000,000
Cost of Goods Sold: $27,000,000
Gross Profit: $18,000,000
Operating Expenses: $10,500,000
Operating Income: $7,500,000
Interest Expense: $640,000
Net Income: $5,144,000
"""

# Test metrics extraction
test_data = {'text': sample_statement, 'tables': [], 'file_type': 'test'}
metrics = statement_parser.extract_financial_metrics(test_data)
print("Extracted Metrics:")
for k, v in metrics.items():
    print(f"  {k}: ${v:,.2f}")

Extracted Metrics:
  revenue: $45,000,000.00
  net_income: $5,144,000.00
  total_assets: $28,600,000.00
  total_liabilities: $11,700,000.00
  operating_income: $7,500,000.00
  gross_profit: $18,000,000.00
  cash: $5,200,000.00


In [17]:
# ===================================================================
# TEST: Full Stock Analysis
# ===================================================================

result = run_analyst_team("Should I invest in AAPL?")
print("\n✅ Test complete!")

INVESTMENT QUESTION: Should I invest in AAPL?
📊 Detected ticker: AAPL

💼 Fundamental Analyst is analyzing...

Based on fundamental analysis, AAPL's valuation is high with a P/E ratio of 36.54, exceeding the growth stock threshold. However, its exceptional ROE of 171.4% and profit margin of 26.9% justify premium valuation. The debt-to-equity ratio of 152.41 is concerning, but AAPL's strong profitability can manage this debt. Revenue growth of 7.9% is moderate. Considering its exceptional profitability and moderate growth, AAPL is fairly valued. Its forward P/E of 29.84 indicates a potential correction. Overall, AAPL is fairly valued, but its high debt levels and rich valuation warrant caution.

------------------------------------------------------------
📈 Market Data Analyst is analyzing...

Based on current market data, AAPL's price is 86.2% into its 52-week range, suggesting it's nearing resistance levels. Historically, this positioning may indicate a potential pullback. 

Valuation-

In [18]:
# ===================================================================
# SYSTEM CHECK
# ===================================================================

print("🔍 CHECKING SYSTEM COMPONENTS\n")

checks = {
    "RAG loaded": len(df_rag) > 100,
    "Reranker loaded": 'reranker' in dir(),
    "Statement parser": 'statement_parser' in dir(),
    "Statement analyst": 'statement_analyst' in dir(),
    "Fundamental agent": 'agent1' in dir(),
    "Market agent": 'agent2' in dir(),
    "Risk agent": 'agent3' in dir(),
    "Technical agent": 'agent4_technical' in dir(),
    "Chief analyst": 'moderator' in dir(),
    "run_analyst_team": 'run_analyst_team' in dir(),
}

for name, status in checks.items():
    print(f"  {'✅' if status else '❌'} {name}")

if all(checks.values()):
    print("\n🎉 All systems ready!")
else:
    print("\n⚠️ Some components missing. Run the cells above.")

🔍 CHECKING SYSTEM COMPONENTS

  ✅ RAG loaded
  ✅ Reranker loaded
  ✅ Statement parser
  ✅ Statement analyst
  ✅ Fundamental agent
  ✅ Market agent
  ✅ Risk agent
  ✅ Technical agent
  ✅ Chief analyst
  ✅ run_analyst_team

🎉 All systems ready!


In [19]:
import re
import numpy as np
import faiss

# ============================================================
# 1) Retrieval
# ============================================================
def retrieve_context_with_ids(query: str, k: int = 5):
    """
    Returns:
      {
        "idxs": [row indices in df_rag],
        "scores": [faiss similarity scores],
        "contexts": [df_rag CONTEXT strings]
      }
    """
    q_emb = embed_model.encode([query])
    q_emb = np.array(q_emb, dtype="float32")

    # Normalize for cosine similarity (works if you normalized doc embeddings too)
    faiss.normalize_L2(q_emb)

    scores, idxs = index_rag.search(q_emb, k)

    idxs = idxs[0].tolist()
    scores = scores[0].tolist()

    contexts = []
    for i in idxs:
        if 0 <= i < len(df_rag):
            contexts.append(str(df_rag.iloc[i]["CONTEXT"]))
        else:
            contexts.append("")

    return {"idxs": idxs, "scores": scores, "contexts": contexts}


# ============================================================
# 2) Keyword relevance (synonym groups)
# ============================================================
def keyword_relevance(text: str, keyword_groups):
    """
    keyword_groups: list[list[str]]
      e.g. [["p/e","pe","price to earnings"], ["earnings","eps"]]

    Score = fraction of groups matched (>=1 synonym per group).
    """
    t = (text or "").lower()
    if not keyword_groups:
        return 0.0

    matched = 0
    for group in keyword_groups:
        group = [str(x).lower() for x in group]
        if any(re.search(r"\b" + re.escape(g) + r"\b", t) for g in group):
            matched += 1

    return matched / len(keyword_groups)


# ============================================================
# 3) Semantic relevance (cosine similarity)
# ============================================================
def cosine_sim(a, b):
    a = np.array(a, dtype="float32")
    b = np.array(b, dtype="float32")
    a = a / (np.linalg.norm(a) + 1e-12)
    b = b / (np.linalg.norm(b) + 1e-12)
    return float(np.dot(a, b))

def semantic_relevance(query: str, context: str):
    q = embed_model.encode([query])[0]
    c = embed_model.encode([context])[0]
    return cosine_sim(q, c)


# ============================================================
# 4) Evaluator (STRICT HIT + debug prints)
# ============================================================
def evaluate_relevance_only(
    test_cases,
    k: int = 5,
    kw_threshold: float = 0.66,
    sem_threshold: float = 0.35,
    debug_top_n: int = 3,
    preview_chars: int = 500
):
    """
    STRICT HIT:
      HIT = (keyword_relevance >= kw_threshold) AND (semantic_relevance >= sem_threshold)

    This avoids false positives (e.g., matching "ratio" in unrelated contexts).
    """
    hit = 0
    avg_kw = 0.0
    avg_sem = 0.0

    for case in test_cases:
        question = case["question"]
        keyword_groups = case.get("keywords", [])

        out = retrieve_context_with_ids(question, k=k)
        combined = "\n\n".join(out["contexts"])

        kw = keyword_relevance(combined, keyword_groups)
        sem = semantic_relevance(question, combined)

        avg_kw += kw
        avg_sem += sem

        is_hit = (kw >= kw_threshold) and (sem >= sem_threshold)
        hit += 1 if is_hit else 0

        print("\n" + "=" * 80)
        print(f"Q: {question}")
        print(f"keyword_relevance: {kw:.2f} | semantic_relevance: {sem:.2f} | HIT: {is_hit}")
        print("-" * 80)
        print("Top retrieved contexts:")

        top = min(debug_top_n, len(out["contexts"]))
        for i in range(top):
            idx = out["idxs"][i]
            score = out["scores"][i]
            ctx = out["contexts"][i]
            print(f"\n#{i+1} | row_idx={idx} | faiss_score={score:.4f}")
            print(ctx[:preview_chars])

    n = len(test_cases)
    print("\n" + "=" * 80)
    print("===== Relevance Summary =====")
    print(f"HitRate (kw>={kw_threshold} AND sem>={sem_threshold}): {hit/n:.2%}")
    print(f"Avg Keyword Relevance: {avg_kw/n:.3f}")
    print(f"Avg Semantic Relevance: {avg_sem/n:.3f}")
    print("=" * 80)


# ============================================================
# 5) Example test cases + RUN
# ============================================================
test_cases = [
    {
        "question": "What is PE ratio?",
        "keywords": [
            ["p/e", "pe", "price to earnings", "price-to-earnings"],
            ["earnings", "eps"]
        ]
    },
    {
        "question": "What is current ratio?",
        "keywords": [
            ["current ratio"],
            ["current assets"],
            ["current liabilities"]
        ]
    },
]

evaluate_relevance_only(
    test_cases,
    k=5,
    kw_threshold=0.66,
    sem_threshold=0.35,
    debug_top_n=3,
    preview_chars=500
)



Q: What is PE ratio?
keyword_relevance: 1.00 | semantic_relevance: 0.52 | HIT: True
--------------------------------------------------------------------------------
Top retrieved contexts:

#1 | row_idx=125 | faiss_score=0.6756
PE RATIO (Price-to-Earnings): Current price / EPS. Shows how much investors pay per $1 earnings. High PE >30 = growth stock. Low PE <15 = value stock. Compare within sector.

#2 | row_idx=126 | faiss_score=0.3673
ROE (Return on Equity): Net Income / Shareholder Equity. Measures profitability. >20% = excellent, 15-20% = good, <10% = poor. AAPL ROE ~150% (exceptional), average tech ~15-25%.

#3 | row_idx=123 | faiss_score=0.3225
V LIVE VALUATION: PE 33.95584, Forward PE 23.983652, Market Cap $667,763,736,576, Price $346.01, Sector Financial Services

Q: What is current ratio?
keyword_relevance: 0.67 | semantic_relevance: 0.32 | HIT: False
--------------------------------------------------------------------------------
Top retrieved contexts:

#1 | row_idx=125 | f