In [12]:
import requests
import pandas as pd
import json
import time
from datetime import datetime, timedelta
import os
from typing import Dict, List, Optional
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import re
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class SECBankDataExtractor:
    def __init__(self, user_agent: str = "akshayavaradu@gmail.com"):
        """
        Initialize the SEC data extractor
        
        Args:
            user_agent: Required user agent for SEC API requests
        """
        self.user_agent = user_agent
        self.base_url = "https://data.sec.gov"
        self.headers = {
            'User-Agent': self.user_agent,
            'Accept-Encoding': 'gzip, deflate',
            'Host': 'data.sec.gov'
        }
        
        # Top 20 US Banks by Assets (CIK numbers)
        self.top_banks = {
            'JPMorgan Chase & Co.': '0000019617',
            'Bank of America Corp': '0000070858',
            'Wells Fargo & Company': '0000072971',
            'Citigroup Inc.': '0000831001',
            'U.S. Bancorp': '0000036104',
            'PNC Financial Services Group Inc': '0000713676',
            'Goldman Sachs Group Inc': '0000886982',
            'Truist Financial Corporation': '0000092230',
            'Capital One Financial Corp': '0000927628',
            'Bank of New York Mellon Corp': '0001390777',
            'Charles Schwab Corporation': '0000316709',
            'TD Group US Holdings LLC': '0001843204',
            'American Express Company': '0000004962',
            'State Street Corporation': '0000093751',
            'Citizens Financial Group Inc': '0001378946',
            'Fifth Third Bancorp': '0000035527',
            'KeyCorp': '0000091576',
            'Regions Financial Corporation': '0000039899',
            'Northern Trust Corporation': '0000073124',
            'Huntington Bancshares Inc': '0000049196'
        }
        
        # Initialize target_metrics as empty - we'll collect all available metrics
        self.target_metrics = []
    
    def get_company_facts(self, cik: str) -> Optional[Dict]:
        """
        Get company facts from SEC API
        
        Args:
            cik: Company CIK number
            
        Returns:
            Dictionary containing company facts or None if error
        """
        try:
            url = f"{self.base_url}/api/xbrl/companyfacts/CIK{cik.zfill(10)}.json"
            response = requests.get(url, headers=self.headers)
            
            if response.status_code == 200:
                return response.json()
            else:
                logger.warning(f"Failed to get company facts for CIK {cik}: {response.status_code}")
                return None
                
        except Exception as e:
            logger.error(f"Error getting company facts for CIK {cik}: {str(e)}")
            return None
    
    def get_company_filings(self, cik: str, start_date: str, end_date: str) -> Optional[Dict]:
        """
        Get company filings from SEC API
        
        Args:
            cik: Company CIK number
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            
        Returns:
            Dictionary containing filings data or None if error
        """
        try:
            url = f"{self.base_url}/api/xbrl/submissions/CIK{cik.zfill(10)}.json"
            response = requests.get(url, headers=self.headers)
            
            if response.status_code == 200:
                data = response.json()
                # Filter filings by date and form type
                filings = []
                recent_filings = data.get('filings', {}).get('recent', {})
                
                if recent_filings:
                    for i in range(len(recent_filings.get('form', []))):
                        form_type = recent_filings['form'][i]
                        filing_date = recent_filings['filingDate'][i]
                        
                        if form_type in ['10-K', '10-Q'] and start_date <= filing_date <= end_date:
                            filings.append({
                                'form': form_type,
                                'filingDate': filing_date,
                                'accessionNumber': recent_filings['accessionNumber'][i],
                                'reportDate': recent_filings.get('reportDate', [''])[i],
                                'primaryDocument': recent_filings.get('primaryDocument', [''])[i]
                            })
                
                return {'filings': filings}
            else:
                logger.warning(f"Failed to get filings for CIK {cik}: {response.status_code}")
                return None
                
        except Exception as e:
            logger.error(f"Error getting filings for CIK {cik}: {str(e)}")
            return None
    
    def extract_financial_metrics(self, company_facts: Dict, start_date: str, end_date: str) -> List[Dict]:
        """
        Extract financial metrics from company facts
        
        Args:
            company_facts: Company facts data from SEC API
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            
        Returns:
            List of dictionaries containing extracted metrics
        """
        metrics_data = []
        
        try:
            facts = company_facts.get('facts', {})
            us_gaap = facts.get('us-gaap', {})
            dei = facts.get('dei', {})
            
            # Get all available metrics from us-gaap
            all_metrics = list(us_gaap.keys())
            
            # Update target_metrics with all available metrics (without duplicates)
            for metric in all_metrics:
                if metric not in self.target_metrics:
                    self.target_metrics.append(metric)
            
            # Get all available periods
            all_periods = set()
            for tag in all_metrics:
                if tag in us_gaap:
                    for unit_type in us_gaap[tag].get('units', {}):
                        for entry in us_gaap[tag]['units'][unit_type]:
                            if 'end' in entry and start_date <= entry['end'] <= end_date:
                                all_periods.add(entry['end'])
            
            # Extract data for each period
            for period_end in sorted(all_periods):
                period_data = {
                    'BANK NAME': company_facts.get('entityName', 'Unknown'),
                    'DATETIME': period_end
                }
                
                # Extract each metric
                for tag in all_metrics:
                    if tag in us_gaap:
                        for unit_type in us_gaap[tag].get('units', {}):
                            for entry in us_gaap[tag]['units'][unit_type]:
                                if entry.get('end') == period_end and 'val' in entry:
                                    # Prefer quarterly data (form 10-Q) over annual, and recent over old
                                    if entry.get('form') in ['10-Q', '10-K']:
                                        period_data[tag] = entry['val']
                                        break
                
                # Only add if we have some meaningful data
                if len(period_data) > 2:  # More than just bank name and datetime
                    metrics_data.append(period_data)
        
        except Exception as e:
            logger.error(f"Error extracting metrics: {str(e)}")
        
        return metrics_data
    
    def download_bank_data(self, start_date: str = "2023-01-01", end_date: str = "2025-12-31") -> pd.DataFrame:
        """
        Download earnings data for all banks
        
        Args:
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            
        Returns:
            DataFrame containing all bank earnings data
        """
        all_data = []
        
        logger.info(f"Starting data download for {len(self.top_banks)} banks from {start_date} to {end_date}")
        
        for i, (bank_name, cik) in enumerate(self.top_banks.items()):
            logger.info(f"Processing {i+1}/{len(self.top_banks)}: {bank_name}")
            
            try:
                # Get company facts
                company_facts = self.get_company_facts(cik)
                if company_facts:
                    # Extract metrics
                    bank_metrics = self.extract_financial_metrics(company_facts, start_date, end_date)
                    all_data.extend(bank_metrics)
                    
                    logger.info(f"Extracted {len(bank_metrics)} records for {bank_name}")
                else:
                    logger.warning(f"No company facts found for {bank_name}")
                
                # Rate limiting - SEC allows 10 requests per second
                time.sleep(0.1)
                
            except Exception as e:
                logger.error(f"Error processing {bank_name}: {str(e)}")
                continue
        
        # Create DataFrame
        df = pd.DataFrame(all_data)
        
        if not df.empty:
            # Convert datetime column
            df['DATETIME'] = pd.to_datetime(df['DATETIME'])
            
            # Sort by company and date
            df = df.sort_values(['BANK NAME', 'DATETIME'])
            
            # Reset index
            df = df.reset_index(drop=True)
            
            logger.info(f"Successfully extracted {len(df)} total records with {len(self.target_metrics)} metrics")
        else:
            logger.warning("No data was extracted")
        
        return df
    
    def save_to_excel(self, df: pd.DataFrame, filename: str = "bank_financial_data.xlsx"):
        """
        Save DataFrame to Excel file with the requested format
        
        Args:
            df: DataFrame to save
            filename: Output filename
        """
        try:
            # Create Excel writer
            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                # Main data sheet
                df.to_excel(writer, sheet_name='Bank_Financial_Data', index=False)
            
            logger.info(f"Data saved to {filename}")
            
        except Exception as e:
            logger.error(f"Error saving to Excel: {str(e)}")

def main():
    """
    Main function to run the SEC bank data extraction
    """
    # Initialize extractor
    # IMPORTANT: Replace with your actual contact information
    extractor = SECBankDataExtractor(user_agent="akshayavaradu@gmail.com")
    
    # Set date range (default 2019-2025)
    start_date = "2019-01-01"
    end_date = "2025-12-31"
    
    print(f"SEC Bank Financial Data Extractor")
    print(f"=================================")
    print(f"Downloading data from {start_date} to {end_date}")
    print(f"Target banks: {len(extractor.top_banks)}")
    print()
    
    # Download data
    df = extractor.download_bank_data(start_date, end_date)
    
    if not df.empty:
        print(f"\nData extraction completed!")
        print(f"Total records: {len(df)}")
        print(f"Companies with data: {df['BANK NAME'].nunique()}")
        print(f"Date range: {df['DATETIME'].min()} to {df['DATETIME'].max()}")
        print(f"Total metrics collected: {len(extractor.target_metrics)}")
        
        # Save to Excel
        output_filename = f"bank_financial_data_{start_date}_{end_date}.xlsx"
        extractor.save_to_excel(df, output_filename)
        
        print(f"\nData saved to: {output_filename}")
        print(f"Columns in the output file: BANK NAME, DATETIME, and all available financial metrics")
        
        # Display sample data
        print(f"\nSample data preview:")
        print(df.iloc[:3].to_string(index=False))
        
    else:
        print("No data was extracted. Please check the logs for errors.")
        print("\nTroubleshooting suggestions:")
        print("1. Verify your User-Agent contains valid company name and email")
        print("2. Check your internet connection")
        print("3. Ensure the date range contains valid reporting periods")
        print("4. Review the logs above for specific error messages")

if __name__ == "__main__":
    main()

2025-06-27 17:28:57,036 - INFO - Starting data download for 20 banks from 2019-01-01 to 2025-12-31
2025-06-27 17:28:57,037 - INFO - Processing 1/20: JPMorgan Chase & Co.


SEC Bank Financial Data Extractor
Downloading data from 2019-01-01 to 2025-12-31
Target banks: 20



2025-06-27 17:28:58,167 - INFO - Extracted 30 records for JPMorgan Chase & Co.
2025-06-27 17:28:58,269 - INFO - Processing 2/20: Bank of America Corp
2025-06-27 17:28:59,624 - INFO - Extracted 54 records for Bank of America Corp
2025-06-27 17:28:59,727 - INFO - Processing 3/20: Wells Fargo & Company
2025-06-27 17:29:01,016 - INFO - Extracted 26 records for Wells Fargo & Company
2025-06-27 17:29:01,117 - INFO - Processing 4/20: Citigroup Inc.
2025-06-27 17:29:02,271 - INFO - Extracted 29 records for Citigroup Inc.
2025-06-27 17:29:02,373 - INFO - Processing 5/20: U.S. Bancorp
2025-06-27 17:29:03,550 - INFO - Extracted 27 records for U.S. Bancorp
2025-06-27 17:29:03,652 - INFO - Processing 6/20: PNC Financial Services Group Inc
2025-06-27 17:29:04,682 - INFO - Extracted 33 records for PNC Financial Services Group Inc
2025-06-27 17:29:04,784 - INFO - Processing 7/20: Goldman Sachs Group Inc
2025-06-27 17:29:05,842 - INFO - Extracted 32 records for Goldman Sachs Group Inc
2025-06-27 17:29:


Data extraction completed!
Total records: 570
Companies with data: 19
Date range: 2019-01-01 00:00:00 to 2025-03-31 00:00:00
Total metrics collected: 3648


2025-06-27 17:29:51,932 - INFO - Data saved to bank_financial_data_2019-01-01_2025-12-31.xlsx



Data saved to: bank_financial_data_2019-01-01_2025-12-31.xlsx
Columns in the output file: BANK NAME, DATETIME, and all available financial metrics

Sample data preview:
               BANK NAME   DATETIME  AccountsPayableAndAccruedLiabilitiesCurrentAndNoncurrent  AccumulatedOtherComprehensiveIncomeLossNetOfTax  AdditionalPaidInCapitalCommonStock  AllowanceForLoanAndLeaseLossesWriteOffs       Assets  AvailableForSaleDebtSecuritiesAccumulatedGrossUnrealizedGainBeforeTax  AvailableForSaleDebtSecuritiesAccumulatedGrossUnrealizedLossBeforeTax  AvailableForSaleDebtSecuritiesAmortizedCostBasis  AvailableForSaleSecuritiesDebtMaturitiesNextRollingTwelveMonthsAmortizedCostBasis  AvailableForSaleSecuritiesDebtMaturitiesNextRollingTwelveMonthsFairValue  AvailableForSaleSecuritiesDebtMaturitiesRollingAfterYearTenAmortizedCostBasis  AvailableForSaleSecuritiesDebtMaturitiesRollingAfterYearTenFairValue  AvailableForSaleSecuritiesDebtMaturitiesRollingYearSixThroughTenAmortizedCostBasis  AvailableForSa

In [11]:
print(type(df))
custom_path = "C:/Users/Akshaya V/git/Earnings research/Earnings_agent/bank_data.xlsx"
# extractor.save_to_parquet(df, custom_path)

# Save to Excel - just one line!
df.to_excel(custom_path, index=False)

print(f"Data saved to bank_data.xlsx")

<class 'pandas.core.frame.DataFrame'>
Data saved to bank_data.xlsx


In [17]:
def read_text_file(name) :
    df=open('note.txt','r')
    return df.read()
def add_2_numbers(a,b):
    return (float(a)+float(b))
def calc(num):
    return ((float(num)*(float(num)-1))/2)
def get_pdf_parameters(ur1: str,ur2: str,kpi: str,quarter: str):
    return {
        "url1": ur1,
        "url2": ur2,
        "key_metric": kpi,
        "quarter": quarter
    }


In [16]:
import requests
import PyPDF2

In [18]:
function_dict={'read_text_file':read_text_file,'add_2_numbers':add_2_numbers,'calc':calc,'get_pdf_parameters':get_pdf_parameters}

In [19]:
fn1= {
    'type':'function',
    "function": {
        "name": "read_text_file",
        "description": "Reads a notepad file and returns its contents",
        "parameters": {
            "type": "object",
            "properties": {
                "name": {
                    "type": "string",
                    "description": "The full path to the notepad file"
                }
            },
            "required": ["name"]
        }
    }
}

fn2= {
    'type':'function',
    "function": {
        "name": "add_2_numbers",
        "description": "Adds 2 numbers",
        "parameters": {
            "type": "object",
            "properties": {
                "a": {
                    "type": "float",
                    "description": "First number to be added"
                },
                "b": {
                    "type": "float",
                    "description": "Second number to be added"
                }
            },
            "required": ["a","b"]
        }
    }
}
fn3= {
    'type':'function',
    "function": {
        "name": "calc",
        "description": "Does a mathematical calculation",
        "parameters": {
            "type": "object",
            "properties": {
                "num": {
                    "type": "float",
                    "description": "Number to which calculation needs to be done"
                },
                
            },
            "required": ['num']
        }
    }
}
 
fn5 = {
    "type": "function",
    "function": {
        "name": "get_pdf_parameters",
        "description": "Extracts PDF file paths and the key metric to search from the user query",
        "parameters": {
            "type": "object",
            "properties": {
                "ur1": {
                    "type": "string",
                    "description": "Provide the document 1 that needs to be read"
                },
                "ur2": {
                    "type": "string",
                    "description": "Provide the document 2 that needs to be read"
                },
                "kpi": {
                    "type": "string",
                    "description": "Provide the kpi that needs to be extracted"
                },
                "quarter": {
                    "type": "string",
                    "description": "Provide the quater for which you need the data "
                }
            },
            "required": ["ur1","ur2","kpi","quarter"]
        }
    }
}

In [20]:
OLLAMA_URL = "http://localhost:11434/api/chat"
url=OLLAMA_URL
data={
    "model":"llama3.2:3b",
    "messages":[
        {
            'role':'user',
            'content':'what is the net income for 2025Q1 from C:/Users/Akshaya V/Downloads/citi.pdf and C:/Users/Akshaya V/Downloads/jpmc.pdf pdf files' ,
        }],
    "stream":False,
    "tools":[fn1,fn2,fn3,fn5]
}
r=requests.post(url,json=data)
r.json()


{'model': 'llama3.2:3b',
 'created_at': '2025-06-15T10:49:23.8298737Z',
 'message': {'role': 'assistant',
  'content': '',
  'tool_calls': [{'function': {'name': 'get_pdf_parameters',
     'arguments': {'kpi': 'net income',
      'quarter': '2025Q1',
      'ur1': 'C:/Users/Akshaya V/Downloads/citi.pdf',
      'ur2': 'C:/Users/Akshaya V/Downloads/jpmc.pdf'}}}]},
 'done_reason': 'stop',
 'done': True,
 'total_duration': 47778027700,
 'load_duration': 6595893000,
 'prompt_eval_count': 454,
 'prompt_eval_duration': 34548329300,
 'eval_count': 58,
 'eval_duration': 6620894300}

In [21]:
f_name=r.json()['message']['tool_calls'][0]['function']['name']
f_args=r.json()['message']['tool_calls'][0]['function']['arguments']
print(f_name)
print(f_args)
print(f_args['kpi'])

get_pdf_parameters
{'kpi': 'net income', 'quarter': '2025Q1', 'ur1': 'C:/Users/Akshaya V/Downloads/citi.pdf', 'ur2': 'C:/Users/Akshaya V/Downloads/jpmc.pdf'}
net income


In [22]:
print("Please extract and return the value of " +str(f_args['kpi'])+" from each document for the quarter "+str(f_args['quarter'])+ " in JSON format.")

Please extract and return the value of net income from each document for the quarter 2025Q1 in JSON format.


In [23]:
def pdf_to_text(path):
    with open(path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        return "\n".join(page.extract_text() or "" for page in reader.pages)

In [24]:
pdf1_text = pdf_to_text(str(f_args['ur1']))
pdf2_text = pdf_to_text(str(f_args['ur2']))
pdf=(pdf1_text+pdf2_text)

In [25]:
 prompt = f"""
    **Task:** Extract {f_args['kpi']} for {f_args['quarter']} from two bank reports.
    The metric may appear in different forms across documents:
    - Short form (e.g., "EPS") 
    - Full form (e.g., "Earnings Per Share")
    - Common variations (e.g., "Diluted EPS", "Basic EPS" for EPS)


    ---

    **JPMorgan Chase (JPMC) Report:**
    {pdf2_text}

    ---

    **Citigroup (Citi) Report:**
    {pdf1_text}

    ---

    **Rules:**
    1. Search for {f_args['quarter']} (accept equivalents: "Q1 2025", "first quarter", "Jan-Mar 2025")
    2. Recognize all forms of the KPI (e.g., both "EPS" and "Earnings Per Share")
    3. Focus on financial highlights and tables first
    4. Return "not found" if unavailable

    **Output Format (JSON):**
    {{
      "jpmc": "<value or 'not found'>",
      "citi": "<value or 'not found'>"
    }}
    
    Key improvements:
    
    1. **Term Variation Handling**:
       - Explicitly mentions short/full forms (EPS vs Earnings Per Share)
       - Includes common financial variants (Diluted/Basic EPS)
    
    2. **Clear Examples**:
       - Shows exactly how different formats should be parsed
       - Demonstrates quarter name variations
    
    3. **Enhanced Output**:
       - Added "unit" field to capture currency consistency
       - Strict JSON formatting requirement
    
    4. **Processing Guidance**:
       - Prioritizes financial highlights/tables
       - Handles different currency symbols
       - Explains conflict resolution for multiple matches
    
    Example Outputs:
    
    1. When KPI is found in both:
    ```json
    {{
      "<auto_bank1>": "<value>",
      "<auto_bank2>": "<value>"
    }}```
    """

In [26]:
print(prompt)


   **Task:** Extract net income for 2025Q1 from two bank reports.
   The metric may appear in different forms across documents:
   - Short form (e.g., "EPS") 
   - Full form (e.g., "Earnings Per Share")
   - Common variations (e.g., "Diluted EPS", "Basic EPS" for EPS)


   ---

   **JPMorgan Chase (JPMC) Report:**
   JPMORGANCHASE REPORTS FIRST-QUARTER 2025 NET INCOME OF $14.6 BILLION ( $5.07  PER SHARE)
FIRST-QUARTER 2025 RESULTS 1
ROE 18%
ROTCE2 21% CET1 Capital Ratios3
Std. 15.4% | Adv.  15.5%
Total Loss-Absorbing Capacity3 $558BStd. RWA3 $1.8T
Cash and marketable securities4 $1.5T
Average loans  $1.3T  
Firmwide 
MetricsnReported revenue of $45.3 billion and managed revenue
of $46.0 billion2
nExpense of $23.6 billion; reported overhead ratio of
52% and managed overhead ratio2 of 51%
nCredit costs of $3.3 billion with $2.3 billion of net
charge-offs and a $973 million net reserve build
nAverage loans up 2% YoY, flat QoQ; average deposits
up 2% YoY, up 1% QoQ
CCB
ROE 31%nAverage dep

In [27]:
final_data = {
    "model": "llama3.2:3b",
    "messages": [
        { 
            "role": "system",
            "content": "You are a bank document processor that automatically identifies financial institutions and extracts data."
        },
        {
            "role": "user",
            "content": prompt
        }
    ],
    "response_format": {
        "type": "json_object",
        "schema": {
            "type": "object",
            "patternProperties": {
                "^[A-Z]{1,5}$": {  # Matches bank abbreviations (1-5 chars)
                    "type": ["string", "null"]
                }
            },
            "additionalProperties": False
        }
    },
    "temperature": 0.1,
    "stream": False
}
final_response = requests.post(OLLAMA_URL, json=final_data)
print(final_response.json())


{'model': 'llama3.2:3b', 'created_at': '2025-06-15T10:55:42.4704482Z', 'message': {'role': 'assistant', 'content': 'Here are the extracted financial highlights and tables from JPMorgan Chase\'s (JPM) 2025Q1 earnings report, formatted according to the specified output requirements:\n\n**JPMorgan Chase**\n\n```json\n{\n  "jpmc": {\n    "EPS": "$4.84",\n    "Diluted EPS": "$4.84",\n    "Basic EPS": "$3.56"\n  },\n  "ROCE": "13.4%",\n  "Return on Average Tangible Common Equity (RoTCE)": "9.1%",\n  "Net Income": "$32.8B",\n  "Revenues": "$121.6B",\n  "Total Assets": "$2.69 Trillion"\n}\n```\n\n**Citi**\n\n```json\n{\n  "citi": {\n    "EPS": "$1.96",\n    "Diluted EPS": "$1.96",\n    "Basic EPS": "$1.58"\n  },\n  "ROCE": "8.0%",\n  "Return on Average Common Equity (ROCE)": "5.8%",\n  "Net Income": "$4.1B",\n  "Revenues": "$21.6B",\n  "Total Assets": "$2.38 Trillion"\n}\n```\n\n**Key Metrics**\n\n```json\n{\n  "net-income-growth-rate": "42%",\n  "revenue-growth-rate": "3%",\n  " expense-reduc

In [28]:

# final_response = requests.post(
#     OLLAMA_URL,
#     json=final_data,
#     timeout=120
# )
# final_response.raise_for_status()  # Raises HTTPError for bad responses

# # 4. Process and validate the response
# response_data = final_response.json()

# if "message" in response_data and "content" in response_data["message"]:
#     try:
#         extracted_data = json.loads(response_data["message"]["content"])
#         print("Extracted Data:")
#         print(json.dumps(extracted_data, indent=2))
#     except json.JSONDecodeError:
#         print("Error: Invalid JSON received")
#         print("Raw response:", response_data["message"]["content"])
# else:
#     print("Unexpected response format:", response_data)



In [29]:
# print(response_data)


In [30]:

followup_messages = [
    {"role": "system", "content": "You are an expert analyst in analysing financial data"},
    {"role": "user", "content": prompt}]

final_data = {
    "model": "llama3.2:3b",
    "messages": followup_messages,
    "temperature": 0.1,  # Lower for less creativity
    "response_format": {"type": "json_object"}, 
    "stream": False,
    "format":{
        "type":"object",
        "properties":{
            "KPI bank1":{
                "type":"string"},
        "KPI bank2":{
                "type":"string"}},"required":["KPI bank1","KPI bank2"]},
}

final_response = requests.post(OLLAMA_URL, json=final_data)
print(final_response.json())


{'model': 'llama3.2:3b', 'created_at': '2025-06-15T11:00:56.7608296Z', 'message': {'role': 'assistant', 'content': '{\n  "KPI bank1": "{ \'EPS\': 1.96, \'QoQ%\': 11, \'YoY%\': 3, \'unit\': \'$\' }",\n  "KPI bank2": "{ \'RoTCE\': 9.1, \'QoQ%\': 300, \'unit\': \'%\' }"\n}\n  '}, 'done_reason': 'stop', 'done': True, 'total_duration': 312227007300, 'load_duration': 59601800, 'prompt_eval_count': 4096, 'prompt_eval_duration': 299750617900, 'eval_count': 79, 'eval_duration': 12374041300}


In [31]:

followup_messages = [
    {"role": "system", "content": "You are an expert analyst in analysing financial data"},
    {"role": "user", "content": prompt}]

final_data = {
    "model": "llama3.2:3b",
    "messages": followup_messages,
    "temperature": 0.1,  # Lower for less creativity
    "response_format": {"type": "json_object"}, 
    "stream": False,
    
}

final_response = requests.post(OLLAMA_URL, json=final_data)
print(final_response.json())


{'model': 'llama3.2:3b', 'created_at': '2025-06-15T11:06:36.5740894Z', 'message': {'role': 'assistant', 'content': 'Here are the extracted financial highlights and tables for JPMorgan Chase and Citigroup, following the specified rules:\n\n**JPMorgan Chase**\n\n* Net Income: $4.8 billion (up 18% from Q1 2024)\n* EPS: $3.52 (up 24% from Q1 2024)\n* Return on Average Tangible Common Equity (RoTCE): 9.5%\n* Total Allowance for Credit Losses: $23.6 billion\n* CET1 Capital Ratio: 13.3%\n\n**Citigroup**\n\n* Net Income: $4.1 billion (down 21% from Q1 2024)\n* EPS: $1.96 (up 24% from Q1 2024)\n* Return on Average Common Equity (ROCE): 8.0%\n* Total Allowance for Credit Losses: $22.8 billion\n* CET1 Capital Ratio: 13.4%\n\nHere is the output in JSON format:\n\n```json\n{\n  "jpmc": {\n    "Net Income": 4800000000,\n    "EPS": 3.52,\n    "RoTCE": 9.5,\n    "Total Allowance for Credit Losses": 23600000000,\n    "CET1 Capital Ratio": 13.3\n  },\n  "citi": {\n    "Net Income": 4100000000,\n    "EPS

In [32]:

followup_messages = [
    {"role": "system", "content": "You are an expert analyst in analysing financial data"},
    {"role": "user", "content": prompt}]

final_data = {
    "model": "llama3.2:3b",
    "messages": followup_messages,
    "response_format": {"type": "json_object"}, 
    "stream": False,
    
}

final_response = requests.post(OLLAMA_URL, json=final_data)
print(final_response.json())


{'model': 'llama3.2:3b', 'created_at': '2025-06-15T11:14:22.2422907Z', 'message': {'role': 'assistant', 'content': 'Here are the results of searching for 2025Q1 (and its equivalents) in Citigroup\'s Q1 2025 press release, along with the requested financial highlights and tables:\n\n**JPMorgan Chase**\n\n1. **EPS**: $4.96\n2. **Diluted EPS**: $4.94\n3. **Net Income**: $28.9B\n4. **Return on Average Tangible Common Equity (RoTCE)**: 8.6%\n5. **Return on Earnings (RoE)**: N/A\n6. **Cost of Credit**: 2.1%\n\n**Table: Q1 2025 Financial Highlights**\n\n| Category | Value |\n| --- | --- |\n| Net Interest Income | $53.9B |\n| Non-Interest Income | $14.8B |\n| Total Revenue | $68.7B |\n| Net Income | $28.9B |\n| Return on Average Tangible Common Equity (RoTCE) | 8.6% |\n\n**Table: Q1 2025 Balance Sheet**\n\n| Account | Value |\n| --- | --- |\n| Total Assets | $3.43T |\n| EOP Loans | $2.63T |\n| EOP Deposits | $1.53T |\n| EOP Equity | $2.85T |\n\n**Table: Q1 2025 Income Statement**\n\n| Category

In [50]:
import requests
import json

# Debugged and improved version
followup_messages = [
    {
        "role": "system", 
        "content": "You are an expert financial analyst that outputs strictly formatted JSON data."
    },
    {
        "role": "user", 
        "content": prompt
    }
]

# Create dynamic property names
kpi_url1 = f"{f_args['kpi']} for {f_args['ur1']}"
kpi_url2 = f"{f_args['kpi']} for {f_args['ur2']}"

final_data = {
    "model": "llama3.2:3b",
    "messages": followup_messages,
    "response_format": {
        "type": "json_object",
        "schema": {
            "type": "object",
            "properties": {
                kpi_url1: {"type": "string"},
                kpi_url2: {"type": "string"}
            },
            "required": [kpi_url1, kpi_url2],
            "additionalProperties": False
        }
    },
    "stream": False
}

try:
    # Make the API request
    final_response = requests.post(
        OLLAMA_URL,
        json=final_data,
        timeout=120
    )
    final_response.raise_for_status()  # Check for HTTP errors
    
    # Process and validate the response
    response_json = final_response.json()
    
    if "message" in response_json and "content" in response_json["message"]:
        try:
            # Parse and pretty-print the JSON response
            parsed_data = json.loads(response_json["message"]["content"])
            print(json.dumps(parsed_data, indent=2))
        except json.JSONDecodeError:
            print("Error: Invalid JSON received")
            print("Raw response:", response_json["message"]["content"])
    else:
        print("Unexpected response format:", response_json)

except requests.exceptions.RequestException as e:
    print(f"API request failed: {str(e)}")
except Exception as e:
    print(f"Unexpected error: {str(e)}")

API request failed: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=120)


providing one merged file (jpmc+citi) as input instead of 2 sepearte text files 

In [13]:
 prompt = f"""
    **Task:** Extract {f_args['kpi']} for {f_args['quarter']} from two bank reports.
    The metric may appear in different forms across documents:
    - Short form (e.g., "EPS") 
    - Full form (e.g., "Earnings Per Share")
    - Common variations (e.g., "Diluted EPS", "Basic EPS" for EPS)


    ---

    **Document**
    {pdf}

    ---

    **Rules:**
    1. Search for {f_args['quarter']} (accept equivalents: "Q1 2025", "first quarter", "Jan-Mar 2025")
    2. Recognize all forms of the KPI (e.g., both "EPS" and "Earnings Per Share")
    3. Focus on financial highlights and tables first
    4. Return "not found" if unavailable

    **Output Format (JSON):**
    {{
      "jpmc": "<value or 'not found'>",
      "citi": "<value or 'not found'>"
    }}
    
    Key improvements:
    
    1. **Term Variation Handling**:
       - Explicitly mentions short/full forms (EPS vs Earnings Per Share)
       - Includes common financial variants (Diluted/Basic EPS)
    
    2. **Clear Examples**:
       - Shows exactly how different formats should be parsed
       - Demonstrates quarter name variations
    
    3. **Enhanced Output**:
       - Added "unit" field to capture currency consistency
       - Strict JSON formatting requirement
    
    4. **Processing Guidance**:
       - Prioritizes financial highlights/tables
       - Handles different currency symbols
       - Explains conflict resolution for multiple matches
    
    Example Outputs:
    
    1. When KPI is found in both:
    ```json
    {{
      "<auto_bank1>": "<value>",
      "<auto_bank2>": "<value>"
    }}```
    """

In [14]:
print(prompt[:400])


   **Task:** Extract net income for 2025Q1 from two bank reports.
   The metric may appear in different forms across documents:
   - Short form (e.g., "EPS") 
   - Full form (e.g., "Earnings Per Share")
   - Common variations (e.g., "Diluted EPS", "Basic EPS" for EPS)


   ---

   **Document**
    
  1 
 Citi CEO Jane Fraser said, “With 
net income of $4.1 billion we 
delivered a strong quarter, 


In [54]:

followup_messages = [
    {"role": "system", "content": "You are an expert analyst in analysing financial data"},
    {"role": "user", "content": prompt}]

final_data = {
    "model": "llama3.2:3b",
    "messages": followup_messages,
    "response_format": {"type": "json_object"}, 
    "stream": False,
    
}

final_response = requests.post(OLLAMA_URL, json=final_data)
print(final_response.json())


{'model': 'llama3.2:3b', 'created_at': '2025-06-14T17:11:39.8894848Z', 'message': {'role': 'assistant', 'content': 'Here are the financial highlights and tables for JPMorgan Chase & Co. and Citigroup Inc. for Q1 2025, formatted according to the specified rules:\n\n**JPMorgan Chase & Co.**\n\n* Net income: $14.6 billion (up 9%)\n* Net revenue: $46.0 billion (up 8%)\n* Noninterest revenue: $22.6 billion (up 17%)\n\n**Discussion of Results**\n\nNet interest income excluding Markets2 was $22.6 billion, down 2%, driven by the impact of lower rates and deposit margin compression as well as lower deposit balances in CCB. These were predominantly offset by higher revolving balances in Card Services, the impact of securities activity including activity in prior quarters, as well as higher wholesale deposit balances.\n\nNoninterest revenue excluding Markets2 was $13.8 billion, up 20%. Excluding the $588 million First Republic-related gain, noninterest revenue excluding Markets2 was up 14%, large

Qwen 2.5 model 

In [15]:

followup_messages = [
    {"role": "system", "content": "You are an expert analyst in analysing financial data"},
    {"role": "user", "content": prompt}]

final_data = {
    "model": "qwen2.5:latest",
    "messages": followup_messages,
    "response_format": {"type": "json_object"}, 
    "stream": False,
    
}

final_response = requests.post(OLLAMA_URL, json=final_data)
print(final_response.json())


{'model': 'qwen2.5:latest', 'created_at': '2025-06-15T04:06:31.2626541Z', 'message': {'role': 'assistant', 'content': '```json\n{\n  "jpmc": "5.07",\n  "citi": "not found"\n}\n```\n\n### Explanation:\n- **JPMorgan Chase (JPMC)**: \n  - Found in the text as "Diluted EPS - $ 5.07".\n  \n- **Citi**:\n  - Not mentioned or found in any form within the provided text.\n\nThe output is strictly JSON formatted and adheres to the specified rules, providing clear financial highlights for JPMorgan Chase while indicating that Citi was not found in the given content.'}, 'done_reason': 'stop', 'done': True, 'total_duration': 2122710646700, 'load_duration': 66169900, 'prompt_eval_count': 4096, 'prompt_eval_duration': 2077748900500, 'eval_count': 121, 'eval_duration': 44775390400}


In [33]:
prompt = f"""
Task: Extract the {f_args['kpi']} for {f_args['quarter']} from the financial reports of JPMorgan Chase and Citigroup.

- Recognize variations like "EPS", "Earnings Per Share", "Diluted EPS", and "Basic EPS".
- Accept quarter names like "Q1 2025", "first quarter", "Jan-Mar 2025".
- Focus only on financial highlights, summary tables, and headline metrics.

---

JPMorgan Chase Report:
{pdf2_text}

---

Citigroup Report:
{pdf1_text}

---

Rules:
1. Search for values specifically tied to {f_args['quarter']}.
2. Only return numeric values if clearly labeled.
3. If no value is found, return "not found".

Respond ONLY in this strict JSON format:
{{
  "jpmc": "<value or 'not found'>",
  "citi": "<value or 'not found'>"
}}
"""


In [34]:
headers = {"Content-Type": "application/json"}
final_data = {
    "model": "llama3.2:3b",
    "messages": [
        {"role": "system", "content": "You are a financial data extraction engine. Output valid JSON only."},
        {"role": "user", "content": prompt}
    ],
    "temperature": 0.1,
    "stream": False
}

response = requests.post(OLLAMA_URL, json=final_data, headers=headers)
print(response.json())


{'model': 'llama3.2:3b', 'created_at': '2025-06-15T12:35:55.4543867Z', 'message': {'role': 'assistant', 'content': '{\n  "jpmc": {\n    "book_value_per_share": 103.90,\n    "tangible_book_value_per_share": 91.52,\n    "capital_ratio": 13.4,\n    "cost_of_credit": 2.7,\n    "net_income": 4.1\n  },\n  "citi": {\n    "return_on_average_common_equity_roce": 8.0,\n    "total_allowance_for_credit_losses": 22.8,\n    "total_acl_on_loans": 18.7,\n    "revenue_increase_rate": 3%,\n    "net_income_attribution_to_non_controlling_interest": 702,\n    "eop_deposits": 3.802\n  }\n}'}, 'done_reason': 'stop', 'done': True, 'total_duration': 430570413900, 'load_duration': 4408826500, 'prompt_eval_count': 4096, 'prompt_eval_duration': 389704881300, 'eval_count': 160, 'eval_duration': 36421045000}


In [37]:
 prompt = f"""
    **Task:** Extract {f_args['kpi']} for {f_args['quarter']} from two bank reports.
    The metric may appear in different forms across documents:
    - Short form (e.g., "EPS") 
    - Full form (e.g., "Earnings Per Share")
    - Common variations (e.g., "Diluted EPS", "Basic EPS" for EPS)


    ---

    **Document**
    <JPMC earnings> {pdf2_text} </JPMC earnings>
    <Citi earnings> {pdf1_text} </Citi earnings>
    

    ---

    **Rules:**
    1. Search for {f_args['quarter']} (accept equivalents: "Q1 2025", "first quarter", "Jan-Mar 2025")
    2. Recognize all forms of the KPI (e.g., both "EPS" and "Earnings Per Share")
    3. Focus on financial highlights and tables first
    4. Return "not found" if unavailable

    **Output Format (JSON):**
    {{
      "jpmc": "<value or 'not found'>",
      "citi": "<value or 'not found'>"
    }}
    
    Key improvements:
    
    1. **Term Variation Handling**:
       - Explicitly mentions short/full forms (EPS vs Earnings Per Share)
       - Includes common financial variants (Diluted/Basic EPS)
    
    2. **Clear Examples**:
       - Shows exactly how different formats should be parsed
       - Demonstrates quarter name variations
    
    3. **Enhanced Output**:
       - Added "unit" field to capture currency consistency
       - Strict JSON formatting requirement
    
    4. **Processing Guidance**:
       - Prioritizes financial highlights/tables
       - Handles different currency symbols
       - Explains conflict resolution for multiple matches
    
    Example Outputs:
    
    1. When KPI is found in both:
    ```json
    {{
      "<auto_bank1>": "<value>",
      "<auto_bank2>": "<value>"
    }}```
    """

prompt[:500]

In [38]:
prompt[:400]

'\n   **Task:** Extract net income for 2025Q1 from two bank reports.\n   The metric may appear in different forms across documents:\n   - Short form (e.g., "EPS") \n   - Full form (e.g., "Earnings Per Share")\n   - Common variations (e.g., "Diluted EPS", "Basic EPS" for EPS)\n\n\n   ---\n\n   **Document**\n   <JPMC earnings> JPMORGANCHASE REPORTS FIRST-QUARTER 2025 NET INCOME OF $14.6 BILLION ( $5.07  PER SHA'

In [39]:
headers = {"Content-Type": "application/json"}
final_data = {
    "model": "llama3.2:3b",
    "messages": [
        {"role": "system", "content": "You are a financial data extraction engine. Output valid JSON only."},
        {"role": "user", "content": prompt}
    ],
    "temperature": 0.1,
    "stream": False
}

response = requests.post(OLLAMA_URL, json=final_data, headers=headers)
print(response.json())


{'model': 'llama3.2:3b', 'created_at': '2025-06-15T12:53:47.1516811Z', 'message': {'role': 'assistant', 'content': 'Here are the extracted financial highlights and tables from the JPMorgan Chase and Citigroup earnings reports, formatted as JSON objects:\n\n**JPMorgan Chase**\n\n```json\n{\n  "jpmc": {\n    "EPS": 3.38,\n    "ROCE": 12.7%\n  }\n}\n```\n\nFinancial Highlights:\n\n* Net income: $30.51B\n* Total revenue: $114.96B\n* Total expenses: $83.45B\n* Return on average tangible common equity (RoTCE): 11.8%\n\n**Citigroup**\n\n```json\n{\n  "citi": {\n    "EPS": 1.96,\n    "ROCE": 8.0%\n  }\n}\n```\n\nFinancial Highlights:\n\n* Net income: $4.1B\n* Total revenue: $21.6B\n* Total expenses: $13.4B\n* Return on average common equity (RoTCE): 9.1%'}, 'done_reason': 'stop', 'done': True, 'total_duration': 498939590200, 'load_duration': 4524877800, 'prompt_eval_count': 4096, 'prompt_eval_duration': 438022291100, 'eval_count': 204, 'eval_duration': 56372788000}


In [40]:
headers = {"Content-Type": "application/json"}
final_data = {
    "model": "gemma3",
    "messages": [
        {"role": "system", "content": "You are a financial data extraction engine. Output valid JSON only."},
        {"role": "user", "content": prompt}
    ],
    "temperature": 0.1,
    "stream": False
}

response = requests.post(OLLAMA_URL, json=final_data, headers=headers)
print(response.json())


{'model': 'gemma3', 'created_at': '2025-06-15T13:02:36.8710572Z', 'message': {'role': 'assistant', 'content': '```json\n{\n  "jpmc": "Revenues $21,596B, Net Income $4.1B, ROCE 8.0%, RoTCE 9.1%, CET1 13.4%, EPS $1.96",\n  "citi": "Revenues $21.6B, Net Income $4.1B, EPS $1.96, RoTCE 9.1%, CET1 13.4%"\n}\n```'}, 'done_reason': 'stop', 'done': True, 'total_duration': 493723661700, 'load_duration': 162720900, 'prompt_eval_count': 4096, 'prompt_eval_duration': 467196945900, 'eval_count': 113, 'eval_duration': 26230050800}


In [50]:
import requests
import PyPDF2
from typing import Dict, Any, Optional

class DocumentProcessor:
    def __init__(self, ollama_url: str = "http://localhost:11434/api/chat"):
        self.ollama_url = ollama_url
        self.tools = self.define_tools()
    def define_tools(self) -> list:
        return [
            {
                "type": "function",
                "function": {
                    "name": "get_pdf_parameters",
                    "description": "Extracts PDF paths and KPIs from a query",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "ur1": {"type": "string", "description": "Path to PDF 1"},
                            "ur2": {"type": "string", "description": "Path to PDF 2"},
                            "kpi": {"type": "string", "description": "Metric to extract (e.g., 'Net Income')"},
                            "quarter": {"type": "string", "description": "Quarter (e.g., '2025Q1')"}
                        },
                        "required": ["ur1", "ur2", "kpi", "quarter"]
                    }
                }
            },
            
        ]


    def invoke_function(self, query: str) -> Optional[Dict[str, Any]]:
 
        data = {
            "model": "llama3.2:3b",
            "messages": [{"role": "user", "content": query}],
            "tools": self.tools,
            "stream": False
        }
        try:
            response = requests.post(self.ollama_url, json=data).json()
            return response
        except Exception as e:
            print(f"Error invoking AI: {e}")
            return None

    
    def model(self, query: str) -> Dict[str, str]:
        """Orchestrates the full workflow: 
        1. Invokes AI to get PDF paths/KPIs 
        2. Extracts text from PDFs 
        3. Returns structured data."""
        # Step 1: Get PDF parameters from AI
        ai_response = self.invoke_function(query)
        if not ai_response:
            return {"error": "AI invocation failed"}

        # Extract function call details
        try:
            func_name = ai_response["message"]["tool_calls"][0]["function"]["name"]
            func_args = (ai_response["message"]["tool_calls"][0]["function"]["arguments"])
        except (KeyError, SyntaxError):
            return {"error": "Failed to parse AI response"}

        # Step 2: Extract text from PDFs
        pdf1_text = self._pdf_to_text(func_args["ur1"])
        pdf2_text = self._pdf_to_text(func_args["ur2"])

        # Step 3: Generate final prompt for metric extraction
        prompt = f"""
        **Task:** Extract {f_args['kpi']} for {f_args['quarter']} from two bank reports.
        The metric may appear in different forms across documents:
        - Short form (e.g., "EPS") 
        - Full form (e.g., "Earnings Per Share")
        - Common variations (e.g., "Diluted EPS", "Basic EPS" for EPS)
    
    
        ---
    
        **Document**
        <JPMC earnings> {pdf2_text} </JPMC earnings>
        <Citi earnings> {pdf1_text} </Citi earnings>
        
    
        ---
    
        **Rules:**
        1. Search for {f_args['quarter']} (accept equivalents: "Q1 2025", "first quarter", "Jan-Mar 2025")
        2. Recognize all forms of the KPI (e.g., both "EPS" and "Earnings Per Share")
        3. Focus on financial highlights and tables first
        4. Return "not found" if unavailable
    
        **Output Format (JSON):**
        {{
          "jpmc": "<value or 'not found'>",
          "citi": "<value or 'not found'>"
        }}
        
        Key improvements:
        
        1. **Term Variation Handling**:
           - Explicitly mentions short/full forms (EPS vs Earnings Per Share)
           - Includes common financial variants (Diluted/Basic EPS)
        
        2. **Clear Examples**:
           - Shows exactly how different formats should be parsed
           - Demonstrates quarter name variations
        
        3. **Enhanced Output**:
           - Added "unit" field to capture currency consistency
           - Strict JSON formatting requirement
        
        4. **Processing Guidance**:
           - Prioritizes financial highlights/tables
           - Handles different currency symbols
           - Explains conflict resolution for multiple matches
        
        Example Outputs:
        
        1. When KPI is found in both:
        ```json
        {{
          "<auto_bank1>": "<value>",
          "<auto_bank2>": "<value>"
        }}```
        
        """
        final_response = self.invoke_function(prompt)
        return final_response.get("message", {})


    def _pdf_to_text(self, path: str) -> str:
        """Converts a PDF file to text."""
        try:
            with open(path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return ""

# Example Usage
if __name__ == "__main__":
    processor = DocumentProcessor()
    query = "What is the net income for 2025Q1 from citi.pdf and jpmc.pdf?"
    result = processor.model(query)
    print(result)

{'role': 'assistant', 'content': 'Here are the extracted financial highlights and tables from the Citi earnings report, formatted as requested:\n\n**Financial Highlights:**\n\n* Net Income Available to Common Shareholders: $<value>\n* Adjusted Diluted Earnings Per Share (EPS): $<value>\n* Return on Average Tangible Common Equity (RoTCE): <value>\n\n**Tables:**\n\n1. **Net Income by Segment**\n```markdown\n| Segment | Net Income |\n| --- | --- |\n| Corporate Lending | $<value> |\n| Markets | $<value> |\n| Wealth | $<value> |\n| USPB | $<value> |\n| All Other | $<value> |\n```\n\n2. **Average Tangible Common Equity (TCE) by Segment**\n```markdown\n| Segment | Average TCE |\n| --- | --- |\n| Services | $24.7B |\n| Markets | $50.4B |\n| Banking | $20.6B |\n| Wealth | $12.3B |\n| USPB | $23.4B |\n| All Other | $37.9B |\n```\n\n3. **Revenue by Segment (non-GAAP)**\n```markdown\n| Segment | Revenue |\n| --- | --- |\n| Corporate Lending | $<value> |\n| Markets | $<value> |\n| Wealth | $<value>

In [1]:
import json
import re
from typing import Dict, Any, Optional, List
import requests
from dataclasses import dataclass
from enum import Enum

class ActionType(Enum):
    READ_TEXT_FILE = "read_text_file"
    SEARCH_TEXT = "search_text"
    EXTRACT_KPI = "extract_kpi"
    VALIDATE_RESULT = "validate_result"
    FINAL_ANSWER = "final_answer"

@dataclass
class ReActStep:
    thought: str
    action: ActionType
    action_input: Dict[str, Any]
    observation: str

class TextFileReader:
    """Tool for reading text file content"""
    
    def read_text_file(self, file_path: str) -> str:
        """Extract text content from text file"""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                return text
        except Exception as e:
            return f"Error reading text file: {str(e)}"

class LLMTextSearcher:
    """Tool for searching KPI content using LLM"""
    
    def __init__(self, llama_client):
        self.llama_client = llama_client
    
    def search_kpi_context(self, text: str, kpi_name: str) -> str:
        """Use LLM to search for KPI-related content in text"""
        
        prompt = f"""
You are an expert financial document analyzer. Your task is to find all sections of text that contain information about the specified KPI.

KPI to search for: {kpi_name}

Document text:
{text}

Instructions:
1. Carefully read through the entire document
2. Identify ALL sections that mention or contain data about "{kpi_name}"
3. For each relevant section, extract the complete context (including surrounding sentences)
4. Look for variations like: net income, Net Income, NET INCOME, earnings, profit, etc.
5. Include tables, bullet points, and any numerical data related to this KPI
6. Return ONLY the relevant text sections, separated by "---SECTION---"
7. If no relevant sections found, return "NO_RELEVANT_SECTIONS_FOUND"

Relevant sections:"""
        
        return self.llama_client.generate_response(prompt, max_tokens=1000)

class LlamaClient:
    """Client for interacting with Llama 3.2 via OpenChat API"""
    
    def __init__(self, api_endpoint: str, model_name: str = "llama3.2:3b"):
        self.api_endpoint = api_endpoint
        self.model_name = model_name
    
    def generate_response(self, prompt: str, max_tokens: int = 500) -> str:
        """Generate response using Ollama-compatible Llama3.2 API"""
        try:
            final_data = {
                "model": self.model_name,      # e.g., "llama3"
                "prompt": prompt,              # Ollama expects "prompt" instead of "messages"
                "stream": False,
                "options": {
                    "temperature": 0.1,
                    "num_predict": max_tokens
                }
            }
            headers = {
                "Content-Type": "application/json"
            }
    
            response = requests.post(self.api_endpoint + "/api/generate", json=final_data, headers=headers)
    
            if response.status_code == 200:
                result = response.json()
                return result['response'].strip()
            else:
                return f"API Error: {response.status_code} - {response.text}"
    
        except Exception as e:
            return f"Error calling Llama API: {str(e)}"



class KPIExtractor:
    """Tool for extracting specific KPI values from text"""
    
    def __init__(self, llama_client):
        self.llama_client = llama_client
    
    def extract_kpi_value(self, context: str, kpi_name: str) -> Dict[str, Any]:
        """Extract specific KPI value from context using LLM"""
        
        prompt = f"""
You are a financial data extraction expert. Extract the specific KPI value from the given context.

KPI Name: {kpi_name}
Context: {context}

Instructions:
1. Find the exact numerical value for "{kpi_name}"
2. Include the unit (e.g., millions, billions, USD, etc.)
3. Include the time period (e.g., Q1 2024, FY 2023, etc.)
4. If multiple values exist, extract all of them
5. Return ONLY valid JSON format

Required JSON format:
{{
    "kpi_name": "{kpi_name}",
    "values": [
        {{
            "value": "numerical_value",
            "unit": "unit_if_any",
            "period": "time_period",
            "context_snippet": "brief_surrounding_text"
        }}
    ],
    "found": true/false,
    "confidence": "high/medium/low"
}}

JSON Response:"""
        
        response = self.llama_client.generate_response(prompt, max_tokens=800)
        
        try:
            # Clean and parse JSON response
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                return json.loads(json_match.group())
            else:
                return {"found": False, "error": "No valid JSON found in response"}
        except json.JSONDecodeError as e:
            return {"found": False, "error": f"JSON parsing error: {str(e)}"}

class ResultValidator:
    """Tool for validating extracted KPI results"""
    
    def __init__(self, llama_client):
        self.llama_client = llama_client
    
    def validate_kpi_extraction(self, original_context: str, extracted_result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate the extracted KPI result against original context"""
        
        prompt = f"""
You are a quality assurance expert for financial data extraction. Validate if the extracted KPI data is accurate.

Original Context:
{original_context}

Extracted Result:
{json.dumps(extracted_result, indent=2)}

Validation Tasks:
1. Check if the extracted values are present in the original context
2. Verify if the units and periods are correct
3. Confirm if all instances of the KPI were captured
4. Assess the overall accuracy

Return validation result in JSON format:
{{
    "is_valid": true/false,
    "accuracy_score": 0.0-1.0,
    "issues_found": ["list of any issues"],
    "missing_values": ["any missed KPI instances"],
    "recommendations": ["suggestions for improvement"]
}}

Validation Result:"""
        
        response = self.llama_client.generate_response(prompt, max_tokens=600)
        
        try:
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                return json.loads(json_match.group())
            else:
                return {"is_valid": False, "error": "No valid JSON found in validation response"}
        except json.JSONDecodeError as e:
            return {"is_valid": False, "error": f"Validation JSON parsing error: {str(e)}"}

class ReActKPIAgent:
    """Main ReAct agent for KPI extraction"""
    
    def __init__(self, llama_api_endpoint: str):
        self.llama_client = LlamaClient(api_endpoint=llama_api_endpoint)
        self.text_reader = TextFileReader()
        self.text_searcher = LLMTextSearcher(self.llama_client)
        self.kpi_extractor = KPIExtractor(self.llama_client)
        self.validator = ResultValidator(self.llama_client)
        self.steps: List[ReActStep] = []
    
    def extract_kpi_from_file(self, file_path: str, kpi_name: str) -> Dict[str, Any]:
        """Main method to extract KPI using ReAct framework"""
        
        # Step 1: Read the text file
        thought = f"I need to extract '{kpi_name}' from the file '{file_path}'. First, I'll read the file content."
        action = ActionType.READ_TEXT_FILE
        action_input = {"file_path": file_path}
        
        observation = self.text_reader.read_text_file(file_path)
        self.steps.append(ReActStep(thought, action, action_input, observation))
        
        if "Error" in observation:
            return {"error": observation, "steps": self.steps}
        
        # Step 2: Search for KPI-related content
        thought = f"File read successfully. Now I'll search for sections containing information about '{kpi_name}'."
        action = ActionType.SEARCH_TEXT
        action_input = {"text": observation, "kpi_name": kpi_name}
        
        kpi_context = self.text_searcher.search_kpi_context(observation, kpi_name)
        self.steps.append(ReActStep(thought, action, action_input, kpi_context))
        
        if "NO_RELEVANT_SECTIONS_FOUND" in kpi_context:
            return {"found": False, "message": f"No sections found containing '{kpi_name}'", "steps": self.steps}
        
        # Step 3: Extract KPI values
        thought = f"Found relevant sections. Now I'll extract the specific '{kpi_name}' values from the context."
        action = ActionType.EXTRACT_KPI
        action_input = {"context": kpi_context, "kpi_name": kpi_name}
        
        extracted_result = self.kpi_extractor.extract_kpi_value(kpi_context, kpi_name)
        self.steps.append(ReActStep(thought, action, action_input, str(extracted_result)))
        
        if not extracted_result.get("found", False):
            return {"found": False, "message": f"Could not extract '{kpi_name}' values", "steps": self.steps}
        
        # Step 4: Validate results
        thought = "KPI values extracted. Now I'll validate the accuracy of the extraction."
        action = ActionType.VALIDATE_RESULT
        action_input = {"original_context": kpi_context, "extracted_result": extracted_result}
        
        validation_result = self.validator.validate_kpi_extraction(kpi_context, extracted_result)
        self.steps.append(ReActStep(thought, action, action_input, str(validation_result)))
        
        # Step 5: Final answer
        thought = "Validation complete. Preparing final answer with extracted KPI data."
        action = ActionType.FINAL_ANSWER
        
        final_result = {
            "kpi_name": kpi_name,
            "extraction_successful": True,
            "extracted_data": extracted_result,
            "validation": validation_result,
            "confidence": extracted_result.get("confidence", "medium"),
            "processing_steps": len(self.steps)
        }
        
        action_input = final_result
        self.steps.append(ReActStep(thought, action, action_input, "Final KPI extraction completed"))
        
        return final_result
    
    def get_processing_steps(self) -> List[Dict[str, Any]]:
        """Return all processing steps for transparency"""
        return [
            {
                "step": i + 1,
                "thought": step.thought,
                "action": step.action.value,
                "action_input": step.action_input,
                "observation": step.observation[:200] + "..." if len(step.observation) > 200 else step.observation
            }
            for i, step in enumerate(self.steps)
        ]

# Usage Example
def main():
    """Example usage of the ReAct KPI extraction system"""
    
    # Initialize the agent
    llama_endpoint = "http://localhost:11434"  # Replace with your Llama API endpoint
    agent = ReActKPIAgent(llama_endpoint)
    
    # Extract KPI from file
    file_path = "C:/Users/Akshaya V/git/CG/citi.txt"
    kpi_name = "Net Income"
    
    print(f"Extracting '{kpi_name}' from '{file_path}'...")
    print("=" * 50)
    
    result = agent.extract_kpi_from_file(file_path, kpi_name)
    
    # Display results
    if result.get("extraction_successful"):
        print(" KPI Extraction Successful!")
        print(f"KPI: {result['kpi_name']}")
        print(f"Confidence: {result['confidence']}")
        
        if "values" in result["extracted_data"]:
            print("\n Extracted Values:")
            for i, value in enumerate(result["extracted_data"]["values"], 1):
                print(f"  {i}. Value: {value.get('value', 'N/A')}")
                print(f"     Unit: {value.get('unit', 'N/A')}")
                print(f"     Period: {value.get('period', 'N/A')}")
                print(f"     Context: {value.get('context_snippet', 'N/A')[:100]}...")
                print()
        
        # Display validation results
        validation = result.get("validation", {})
        print(f" Validation - Valid: {validation.get('is_valid', False)}")
        print(f" Accuracy Score: {validation.get('accuracy_score', 0.0)}")
        
        if validation.get("issues_found"):
            print("  Issues Found:")
            for issue in validation["issues_found"]:
                print(f"   - {issue}")
    
    else:
        print(" KPI Extraction Failed")
        print(f"Reason: {result.get('message', 'Unknown error')}")
    
    # Display processing steps
    print("\n Processing Steps:")
    steps = agent.get_processing_steps()
    for step in steps:
        print(f"Step {step['step']}: {step['action'].upper()}")
        print(f"Thought: {step['thought']}")
        print(f"Observation: {step['observation']}")
        print("-" * 30)

if __name__ == "__main__":
    main()

Extracting 'Net Income' from 'C:/Users/Akshaya V/git/CG/citi.txt'...
✅ KPI Extraction Successful!
KPI: Net Income
Confidence: high

📊 Extracted Values:
  1. Value: $4.1 billion
     Unit: billion USD
     Period: Q1 2025
     Context: Citigroup Inc. today reported net income for the first quarter 2025 of $4.1 billion, or $1.96 per di...

🔍 Validation - Valid: True
🎯 Accuracy Score: 0.95

🔄 Processing Steps:
Step 1: READ_TEXT_FILE
Thought: I need to extract 'Net Income' from the file 'C:/Users/Akshaya V/git/CG/citi.txt'. First, I'll read the file content.
Observation: 1
Citi CEO Jane Fraser said, “With
net income of $4.1 billion we
delivered a strong quarter,
marked by continued
momentum, positive operating
leverage and improved returns
in each of our five business...
------------------------------
Step 2: SEARCH_TEXT
Thought: File read successfully. Now I'll search for sections containing information about 'Net Income'.
Observation: ---SECTION---

For Immediate Release
Citigroup Inc. (

In [2]:
print("hu")

hu


In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.document_loaders import TextLoader
from langchain_community.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent, Tool
from langchain.agents.agent_types import AgentType
from langchain.memory import ConversationBufferMemory
from langchain_core.tools import tool

# 🔧 MultiPromptQA class definition
class MultiPromptQA:
    def __init__(self, file_paths: list, embedding_model: str, llm_model: str):
        self.file_paths = file_paths
        self.embedding_model = embedding_model
        self.llm_model = llm_model

        self.docs = self.load_and_split_all()
        self.db = self.create_vectorstore()
        self.llm = ChatOllama(model=self.llm_model)
        self.retriever = self.create_multi_query_retriever()
        self.qa_chain = RetrievalQA.from_chain_type(llm=self.llm, retriever=self.retriever)

    def load_and_split_all(self):
        all_split_docs = []
        embedding = OllamaEmbeddings(model=self.embedding_model)
        splitter = SemanticChunker(embedding)

        for path in self.file_paths:
            loader = TextLoader(path, encoding='utf-8')
            docs = loader.load()
            split_docs = splitter.split_documents(docs)
            all_split_docs.extend(split_docs)

        return all_split_docs

    def create_vectorstore(self):
        embedding = OllamaEmbeddings(model=self.embedding_model)
        return FAISS.from_documents(self.docs, embedding)

    def create_multi_query_retriever(self):
        return MultiQueryRetriever.from_llm(
            retriever=self.db.as_retriever(),
            llm=self.llm,
            include_original=True
        )

    def ask(self, question: str) -> str:
        return self.qa_chain.run(question)


# ✅ Tool 1: Basic file reader
@tool
def read_file(bank_name: str) -> str:
    """Reads the contents of a local text file. Input should be the name of the bank. Example: 'citi.txt' or 'jpmc.txt'"""
    try:
        file = f"C:/Users/Akshaya V/git/CG/{bank_name}"
        with open(file, 'r', encoding='utf-8') as f:
            content = f.read()
        return content
    except Exception as e:
        return f"Error reading file: {str(e)}"


# ✅ Tool 2: Wrap MultiPromptQA as a Tool
rag_qa = MultiPromptQA(
    file_paths=[
        "C:/Users/Akshaya V/git/Earnings research/Earnings_agent/citi.txt",
        "C:/Users/Akshaya V/git/Earnings research/Earnings_agent/jpmc.txt"
    ],
    embedding_model="qwen2.5:0.5b",
    llm_model="qwen2.5:7b"
)

rag_tool = Tool(
    name="earnings_qa_tool",
    func=rag_qa.ask,
    description="Use this tool to answer questions related to quarterly earnings of Citi and JPMorgan based on financial reports."
)

# ✅ Final tool list
tools = [read_file, rag_tool]


# ✅ Load LLM & memory
llm = ChatOllama(model="llama3.2:3b")
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# ✅ Initialize the ReAct agent
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
    handle_parsing_errors=True
)

# ✅ Ask the agent something
response = agent.run("What was the EPS of JPMorgan in Q4 2024?")
print("\n🤖 Final Answer:", response)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mQuestion: What is the net income of citi.txt?
Thought: I need to read the contents of the file and calculate the net income.
Action:
```
{
  "action": "read_file",
  "action_input": "citi.txt"
}
```
[0mC:/Users/Akshaya V/git/CG/citi.txt

Observation: [36;1m[1;3m1
Citi CEO Jane Fraser said, “With
net income of $4.1 billion we
delivered a strong quarter,
marked by continued
momentum, positive operating
leverage and improved returns
in each of our five businesses.
Services recorded its best first
quarter revenue in a decade.
Markets had a good first quarter
with revenue up 12% driven by
strong client activity and
monetization. Banking was up
12% with M&A revenue nearly
double from what it was last
year. Wealth revenues increased
24% with progress across all
three client segments. USPB
was up 2%, driven mainly by
growth in Branded Cards, and
also saw improved returns. We
returned $2.8 billion in capital
to our shareholders inc

In [None]:
from langchain.tools import Tool
from langchain.agents import initialize_agent, AgentType
from langchain_community.chat_models import ChatOllama

# --- Your existing class (MultiPromptQA) should be above this ---

# Step 1: Initialize the QA system
qa = MultiPromptQA(
    file_paths=[
        "C:/Users/Akshaya V/git/Earnings research/Earnings_agent/citi.txt",
        "C:/Users/Akshaya V/git/Earnings research/Earnings_agent/jpmc.txt"
    ],
    embedding_model="qwen2.5:0.5b",
    llm_model="qwen2.5:7b"
)

# Step 2: Wrap the ask method as a tool
earnings_qa_tool = Tool(
    name="earnings_qa_tool",
    func=qa.ask,
    description="Useful for answering questions about quarterly earnings from Citi and JPMorgan documents. Input should be a complete question."
)

# Step 3: Set up LLM for the agent
llm = ChatOllama(model="qwen2.5:7b")

# Step 4: Create the ReAct agent with tools
agent = initialize_agent(
    tools=[earnings_qa_tool],
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

# Step 5: Ask a question via the agent
response = agent.run("What was JPMorgan's net income in Q4 2024?")
print(response)


  embedding = OllamaEmbeddings(model=self.embedding_model)


In [1]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine
import re

def is_natural_language(text):
    # Must contain at least one sentence (simple heuristic)
    return bool(re.search(r"[A-Za-z]{4,}.*\.", text)) and not is_table_like(text)

def is_table_like(text):
    lines = text.strip().splitlines()
    if len(lines) < 2:
        return False

    table_like = 0
    for line in lines:
        tokens = line.strip().split()
        num_tokens = len(tokens)
        numbers = len([t for t in tokens if re.fullmatch(r"[\d,.%$]+", t)])
        symbols = len([t for t in tokens if re.fullmatch(r"[\d,.%$O/(U)-]+", t)])

        if num_tokens >= 3 and numbers / num_tokens > 0.5:
            table_like += 1
        elif len(re.findall(r"\$\s?\d", line)) > 1:  # multiple dollar values
            table_like += 1
        elif len(re.findall(r"\d{2,},", line)) > 1:
            table_like += 1

    return table_like / len(lines) > 0.4

def extract_text_excluding_tables(pdf_path):
    final_text = []

    for page_layout in extract_pages(pdf_path):
        for element in page_layout:
            if isinstance(element, (LTTextBox, LTTextLine)):
                text = element.get_text().strip()
                if text and is_natural_language(text):
                    final_text.append(text)

    return "\n\n".join(final_text).strip()
clean_text = extract_text_excluding_tables(r"C:\Users\Akshaya V\git\Earnings research\Earnings_agent\jpmc.pdf")
print(clean_text)


JPMorgan Chase & Co.
383 Madison Avenue, New York, NY 10179-0001
NYSE symbol: JPM
www.jpmorganchase.com

JPMORGANCHASE REPORTS FIRST-QUARTER 2025 NET INCOME OF $14.6 BILLION ($5.07 PER SHARE)

Std. RWA3 $1.8T
Cash and marketable securities4 $1.5T
Average loans $1.3T

n Reported revenue of $45.3 billion and managed revenue

n Expense of $23.6 billion; reported overhead ratio of

Services net charge-off rate of 3.58%

Jamie Dimon, Chairman and CEO, commented: “The Firm reported strong 
underlying business and financial results in the first quarter, producing net 
income of $14.6 billion.”

Dimon continued: “In the CIB, Investment Banking fees rose 12% in the first 
quarter, although clients have become more cautious amid an increase in 
market volatility driven by geopolitical and trade-related tensions. Meanwhile, 
we saw increased activity in the Markets business. Markets revenue rose to 
$9.7 billion, an exceptionally strong quarter with record revenue in Equities. In 
CCB, the franch

In [13]:
from langchain.agents import Tool
import pandas as pd
import re
import ast

def extract_bank_metrics(query: str) -> dict:
    """
    Extracts financial metrics from Excel based on a plain-text input string.
    Expected format: {'excel_path': '...', 'sheet_name': '...', 'bank_name': '...', 'quarter_str': '1Q2025', 'metrics': ['NetIncome', 'EPS']}
    """
    try:
        params = ast.literal_eval(query)  # Parse string input into dict
        excel_path = params['excel_path']
        sheet_name = params['sheet_name']
        bank_name = params['bank_name']
        quarter_str = params['quarter_str']
        metrics = params['metrics']
    except Exception as e:
        return {"error": f"Invalid input format. Must be a dict-style string. {str(e)}"}

    def convert_quarter_to_date(q):
        quarter_mapping = {"1Q": "-03-31", "2Q": "-06-30", "3Q": "-09-30", "4Q": "-12-31"}
        match = re.match(r"([1-4]Q)(\d{4})", q.upper())
        if not match:
            raise ValueError(f"Invalid quarter format: {q}")
        q, year = match.groups()
        return f"{year}{quarter_mapping[q]}"

    df = pd.read_excel(excel_path, sheet_name=sheet_name)
    df["Datetime"] = pd.to_datetime(df["Datetime"])
    quarter_date = pd.to_datetime(convert_quarter_to_date(quarter_str))

    result = df[
        (df["CompanyName"].str.upper() == bank_name.upper()) &
        (df["Datetime"] == quarter_date)
    ]

    if result.empty:
        return {"message": "No data found for given inputs."}

    response = {
        "CompanyName": result.iloc[0]["CompanyName"],
        "Quarter": quarter_str,
    }

    for metric in metrics:
        response[metric] = result.iloc[0].get(metric, "Metric not found")

    return response


In [14]:
from langchain.agents import initialize_agent, AgentType, Tool
from langchain_ollama import ChatOllama

llm = ChatOllama(model="llama3.2:3b", temperature=0)

tools = [
    Tool(
        name="extract_bank_metrics",
        func=extract_bank_metrics,
        description="Extracts financial metrics from an Excel file. Input must be a string that represents a dictionary with keys: excel_path, sheet_name, bank_name, quarter_str, and metrics."
    )
]

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

query = "{'excel_path': 'C:/Users/Akshaya V/git/Earnings research/Earnings_agent/50_metrics.xlsx', 'sheet_name': 'Bank_Earnings_Data', 'bank_name': 'JPMORGAN CHASE & CO', 'quarter_str': '1Q2025', 'metrics': ['NetIncome', 'EarningsPerShare']}"
response = agent.run(f"Use extract_bank_metrics with input: {query}")

print(response)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mQuestion: Use extract_bank_metrics with input: {'excel_path': 'C:/Users/Akshaya V/git/Earnings research/Earnings_agent/50_metrics.xlsx', 'sheet_name': 'Bank_Earnings_Data', 'bank_name': 'JPMORGAN CHASE & CO', 'quarter_str': '1Q2025', 'metrics': ['NetIncome', 'EarningsPerShare']}
Thought: I need to call the extract_bank_metrics function with the given input parameters.
Action: extract_bank_metrics
Action Input: {'excel_path': 'C:/Users/Akshaya V/git/Earnings research/Earnings_agent/50_metrics.xlsx', 'sheet_name': 'Bank_Earnings_Data', 'bank_name': 'JPMORGAN CHASE & CO', 'quarter_str': '1Q2025', 'metrics': ['NetIncome', 'EarningsPerShare']}[0m
Observation: [36;1m[1;3m{'CompanyName': 'JPMorgan Chase & Co', 'Quarter': '1Q2025', 'NetIncome': np.float64(14643000000.0), 'EarningsPerShare': np.float64(5.08)}[0m
Thought:[32;1m[1;3mQuestion: What is the output of extract_bank_metrics with input: {'excel_path': 'C:/Users/Akshaya V