In [None]:
from langchain_ollama import ChatOllama
def intent(
llm=ChatOllama(model="llama3.2:3b") 
llm.invoke("""You are a expert in classifying questions into 2 categories. The 2 categories are exact question , needs_clarification.
A question is marked as exact if it has concrete details in 3 categories - company name, quarter & Year , metrics to be analysed. 
It is needs_clarification if the user uses words like analyse, in detail , research, elaborate or if the user doesn't provide specific metrics or company name or year & quarter to be analysed 


In [22]:
from typing import List, Type, Literal 
from pydantic import BaseModel
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage
import json

# ---------------------------
# 1. Define the Pydantic Schema
# ---------------------------
class ParsedRequest(BaseModel):
    intent: str                 # "table", "qualitative", or "in-depth"
    banks: List[str]            # Must match allowed_banks
    quarters: List[str]         # e.g., "1Q2025", "4Q2024"
    metrics: List[str]          # Must match allowed_metrics


# ---------------------------
# 2. Function to Use ChatOllama with Pydantic Schema
# ---------------------------
def parse_with_chatollama(
    llm_model_name: str,
    user_input: str,
    schema: Type[BaseModel],
    allowed_banks: List[str],
    allowed_metrics: List[str]
) -> BaseModel:
    system_prompt = f"""
You are a financial assistant. Your task is to extract structured information from user input and return it in the following JSON format:

{{
  "intent": "table" | "qualitative" | "in-depth",
  "banks": [valid bank names],
  "quarters": ["1Q2025", "4Q2024", "3Q2024"],
  "metrics": [valid metric keys]
}}

Rules:
- Choose "intent" based on context: "table" (structured data), "qualitative" (light summary), "in-depth" (detailed analysis).
- Map any abbreviation or alias to official bank names from this list: {json.dumps(allowed_banks)}
- Extract all mentioned quarters in "1Q2025" format. Include the previous 2 quarters for each.
- Extract only metrics listed here: {json.dumps(allowed_metrics)}.
- Output only the JSON structure as shown above, no explanation or markdown.
"""

    # Load the Ollama model
    llm = ChatOllama(model=llm_model_name)

    # Create and send the prompt
    messages = [
        SystemMessage(content=system_prompt.strip()),
        HumanMessage(content=user_input.strip())
    ]
    response = llm.invoke(messages)

    # Validate using Pydantic
    try:
        raw = response.content.strip()
    
        # Optional: clean triple backticks if LLM returns markdown
        if "```" in raw:
            import re
            match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
            if match:
                raw = match.group(1)
    
        # Step 1: Convert JSON string to Python dict
        parsed_dict = json.loads(raw)
    
        # Step 2: Validate and convert into a Pydantic object
        parsed_model = schema.model_validate(parsed_dict)
    
        # Step 3: Return it (now it has `.model_dump_json()` etc.)
        return parsed_model

    except Exception as e:
        raise ValueError(f"Failed to parse model output: {e}\nRaw Output:\n{response.content}")

    

In [23]:
if __name__ == "__main__":
    allowed_banks = ["JP Morgan Chase", "Bank of America", "Citigroup", "Wells Fargo"]
    allowed_metrics = ["EarningsPerShare", "NetIncome", "TotalRevenue", "ReturnOnEquity"]

    user_input = "Buy me burgers"

    result = parse_with_chatollama(
        llm_model_name="llama3.2:3b",  # Replace with your loaded Ollama model name
        user_input=user_input,
        schema=ParsedRequest,
        allowed_banks=allowed_banks,
        allowed_metrics=allowed_metrics
    )

    print(result.model_dump_json())


{"intent":"in-depth","banks":[],"quarters":["1Q2025","4Q2024","3Q2024"],"metrics":[]}


In [3]:
import os

dic = r'C:\Users\Akshaya V\git\Earnings research\Earnings_agent\PublicReportResearch-main'
full_path = os.path.join(dic, 'docs')

print(full_path)
print(os.listdir(full_path))
print(os.path.join(full_path,os.listdir(full_path)[0]))
for i in os.listdir(full_path):
    print(os.path.join(full_path,i))


C:\Users\Akshaya V\git\Earnings research\Earnings_agent\PublicReportResearch-main\docs
['boa_ppt_2024_q2.pdf', 'boa_ppt_2024_q3.pdf', 'boa_ppt_2024_q4.pdf', 'boa_ppt_2025_q1.pdf', 'boa_result_2025_q1.pdf', 'citi_result_2024_q1.pdf', 'citi_result_2024_q2.pdf', 'citi_result_2024_q3.pdf', 'citi_result_2024_q4.pdf', 'citi_result_2025_q1.pdf']
C:\Users\Akshaya V\git\Earnings research\Earnings_agent\PublicReportResearch-main\docs\boa_ppt_2024_q2.pdf
C:\Users\Akshaya V\git\Earnings research\Earnings_agent\PublicReportResearch-main\docs\boa_ppt_2024_q2.pdf
C:\Users\Akshaya V\git\Earnings research\Earnings_agent\PublicReportResearch-main\docs\boa_ppt_2024_q3.pdf
C:\Users\Akshaya V\git\Earnings research\Earnings_agent\PublicReportResearch-main\docs\boa_ppt_2024_q4.pdf
C:\Users\Akshaya V\git\Earnings research\Earnings_agent\PublicReportResearch-main\docs\boa_ppt_2025_q1.pdf
C:\Users\Akshaya V\git\Earnings research\Earnings_agent\PublicReportResearch-main\docs\boa_result_2025_q1.pdf
C:\Users\Aksha

In [4]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine
import re

def is_natural_language(text):
    # Must contain at least one sentence (simple heuristic)
    return bool(re.search(r"[A-Za-z]{4,}.*\.", text)) and not is_table_like(text)

def is_table_like(text):
    lines = text.strip().splitlines()
    if len(lines) < 2:
        return False

    table_like = 0
    for line in lines:
        tokens = line.strip().split()
        num_tokens = len(tokens)
        numbers = len([t for t in tokens if re.fullmatch(r"[\d,.%$]+", t)])
        symbols = len([t for t in tokens if re.fullmatch(r"[\d,.%$O/(U)-]+", t)])

        if num_tokens >= 3 and numbers / num_tokens > 0.5:
            table_like += 1
        elif len(re.findall(r"\$\s?\d", line)) > 1:  # multiple dollar values
            table_like += 1
        elif len(re.findall(r"\d{2,},", line)) > 1:
            table_like += 1

    return table_like / len(lines) > 0.4

def extract_text_excluding_tables(pdf_path):
    final_text = []

    for page_layout in extract_pages(pdf_path):
        for element in page_layout:
            if isinstance(element, (LTTextBox, LTTextLine)):
                text = element.get_text().strip()
                if text and is_natural_language(text):
                    final_text.append(text)

    return "\n\n".join(final_text).strip()

# for pdf in os.listdir(full_path):
#     print(pdf)
#     text=""
#     text =f"<{pdf}>"
#     clean_text = extract_text_excluding_tables(os.path.join(full_path,pdf))
#     clean_text=text+clean_text
#     text =f"</{pdf}"
#     text=clean_text+text
#     #print(text)


In [5]:
def parse_filename_metadata(filename: str):
    name = os.path.splitext(os.path.basename(filename))[0]
    parts = name.split("_")
    bank_map = {
        "jpm": "JP Morgan Chase",
        "boa": "Bank of America",
        "citi": "Citigroup",
        "gs": "Goldman Sachs",
        "ms": "Morgan Stanley",
    }
    bank_code = parts[0].lower()
    quarter = parts[1].upper() if len(parts) > 1 else "UNKNOWN"
    bank = bank_map.get(bank_code, bank_code.upper())
    return bank, quarter

In [6]:
class ParsedRequest(BaseModel):
    intent: str
    banks: List[str]
    quarters: List[str]
    metrics: List[str]


In [7]:
def create_chunks_with_ollama(text: str, metadata: dict = None):
    embedder = OllamaEmbeddings(model="nomic-embed-text")
    chunker = SemanticChunker(embeddings=embedder, min_chunk_size=2000)

    doc = Document(page_content=text, metadata=metadata or {})
    return chunker.split_documents([doc])


In [8]:
def search_chunks(chunks, parsed_query: ParsedRequest, top_k=5):
    embedder = OllamaEmbeddings(model="nomic-embed-text")
    vectorstore = FAISS.from_documents(chunks, embedder)

    query_text = (
        f"Find information about {', '.join(parsed_query.metrics)} "
        f"for banks like {', '.join(parsed_query.banks)} "
        f"during quarters such as {', '.join(parsed_query.quarters)}"
    )

    return vectorstore.similarity_search(query_text, k=top_k)


In [9]:
def rerank_with_chatollama(chunks, parsed_query: ParsedRequest):
    llm = ChatOllama(model="llama3.2:3b")
    context = "\n\n".join([chunk.page_content for chunk in chunks])

    prompt = (
        f"You are a financial analyst assistant.\n\n"
        f"Query:\n{parsed_query.model_dump_json(indent=2)}\n\n"
        f"Extracted text:\n{context}\n\n"
        f"Please summarize the relevant details in a table or paragraph based on the query intent."
    )

    response = llm([HumanMessage(content=prompt)])
    return response.content


In [10]:
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine
import re
import json
from typing import List
from pydantic import BaseModel
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.vectorstores import FAISS
from langchain_community.chat_models import ChatOllama
from langchain.schema import HumanMessage
import os
import re
from glob import glob
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine
full_path = r"C:\Users\Akshaya V\git\Earnings research\Earnings_agent\PublicReportResearch-main\docs"

all_chunks = []

for pdf in os.listdir(full_path):
    if not pdf.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(full_path, pdf)
    print(f"📄 Processing: {pdf}")

    bank, quarter = parse_filename_metadata(pdf)
    clean_text = extract_text_excluding_tables(pdf_path)

    # Wrap in custom tags for traceability
    tagged_text = f"<{bank} {quarter}>\n{clean_text}\n</{bank} {quarter}>"

    # Metadata
    metadata = {
        "source": pdf,
        "bank": bank,
        "quarter": quarter
    }

    # Chunk with metadata
    chunks = create_chunks_with_ollama(tagged_text, metadata)
    all_chunks.extend(chunks)

# ✅ Print preview
print(f"\nTotal Chunks Created: {len(all_chunks)}")
for i, chunk in enumerate(all_chunks[:3]):
    print(f"\n--- Chunk {i+1} ---")
    print("Metadata:", chunk.metadata)
    print("Content preview:", chunk.page_content[:300], "...\n")

📄 Processing: boa_ppt_2024_q2.pdf
📄 Processing: boa_ppt_2024_q3.pdf
📄 Processing: boa_ppt_2024_q4.pdf
📄 Processing: boa_ppt_2025_q1.pdf
📄 Processing: boa_result_2025_q1.pdf
📄 Processing: citi_result_2024_q1.pdf


Cannot set gray non-stroke color because /'P43' is an invalid float value
Cannot set gray non-stroke color because /'P45' is an invalid float value
Cannot set gray non-stroke color because /'P46' is an invalid float value
Cannot set gray non-stroke color because /'P61' is an invalid float value
Cannot set gray non-stroke color because /'P62' is an invalid float value
Cannot set gray non-stroke color because /'P66' is an invalid float value
Cannot set gray non-stroke color because /'P67' is an invalid float value
Cannot set gray non-stroke color because /'P71' is an invalid float value
Cannot set gray non-stroke color because /'P72' is an invalid float value
Cannot set gray non-stroke color because /'P78' is an invalid float value
Cannot set gray non-stroke color because /'P79' is an invalid float value
Cannot set gray non-stroke color because /'P83' is an invalid float value
Cannot set gray non-stroke color because /'P84' is an invalid float value
Cannot set gray non-stroke color becau

📄 Processing: citi_result_2024_q2.pdf


Cannot set gray non-stroke color because /'P39' is an invalid float value
Cannot set gray non-stroke color because /'P41' is an invalid float value
Cannot set gray non-stroke color because /'P42' is an invalid float value
Cannot set gray non-stroke color because /'P56' is an invalid float value
Cannot set gray non-stroke color because /'P57' is an invalid float value
Cannot set gray non-stroke color because /'P63' is an invalid float value
Cannot set gray non-stroke color because /'P64' is an invalid float value
Cannot set gray non-stroke color because /'P68' is an invalid float value
Cannot set gray non-stroke color because /'P69' is an invalid float value
Cannot set gray non-stroke color because /'P75' is an invalid float value
Cannot set gray non-stroke color because /'P76' is an invalid float value
Cannot set gray non-stroke color because /'P80' is an invalid float value
Cannot set gray non-stroke color because /'P81' is an invalid float value
Cannot set gray non-stroke color becau

📄 Processing: citi_result_2024_q3.pdf


Cannot set gray non-stroke color because /'P40' is an invalid float value
Cannot set gray non-stroke color because /'P42' is an invalid float value
Cannot set gray non-stroke color because /'P43' is an invalid float value
Cannot set gray non-stroke color because /'P47' is an invalid float value
Cannot set gray non-stroke color because /'P48' is an invalid float value
Cannot set gray non-stroke color because /'P52' is an invalid float value
Cannot set gray non-stroke color because /'P53' is an invalid float value
Cannot set gray non-stroke color because /'P57' is an invalid float value
Cannot set gray non-stroke color because /'P58' is an invalid float value
Cannot set gray non-stroke color because /'P64' is an invalid float value
Cannot set gray non-stroke color because /'P65' is an invalid float value
Cannot set gray non-stroke color because /'P69' is an invalid float value
Cannot set gray non-stroke color because /'P70' is an invalid float value
Cannot set gray non-stroke color becau

📄 Processing: citi_result_2024_q4.pdf


Cannot set gray non-stroke color because /'P45' is an invalid float value
Cannot set gray non-stroke color because /'P47' is an invalid float value
Cannot set gray non-stroke color because /'P48' is an invalid float value
Cannot set gray non-stroke color because /'P49' is an invalid float value
Cannot set gray non-stroke color because /'P53' is an invalid float value
Cannot set gray non-stroke color because /'P54' is an invalid float value
Cannot set gray non-stroke color because /'P55' is an invalid float value
Cannot set gray non-stroke color because /'P60' is an invalid float value
Cannot set gray non-stroke color because /'P61' is an invalid float value
Cannot set gray non-stroke color because /'P62' is an invalid float value
Cannot set gray non-stroke color because /'P67' is an invalid float value
Cannot set gray non-stroke color because /'P68' is an invalid float value
Cannot set gray non-stroke color because /'P70' is an invalid float value
Cannot set gray non-stroke color becau

📄 Processing: citi_result_2025_q1.pdf


Cannot set gray non-stroke color because /'P40' is an invalid float value
Cannot set gray non-stroke color because /'P42' is an invalid float value
Cannot set gray non-stroke color because /'P43' is an invalid float value
Cannot set gray non-stroke color because /'P49' is an invalid float value
Cannot set gray non-stroke color because /'P50' is an invalid float value
Cannot set gray non-stroke color because /'P54' is an invalid float value
Cannot set gray non-stroke color because /'P55' is an invalid float value
Cannot set gray non-stroke color because /'P59' is an invalid float value
Cannot set gray non-stroke color because /'P60' is an invalid float value
Cannot set gray non-stroke color because /'P66' is an invalid float value
Cannot set gray non-stroke color because /'P67' is an invalid float value
Cannot set gray non-stroke color because /'P73' is an invalid float value
Cannot set gray non-stroke color because /'P74' is an invalid float value
Cannot set gray non-stroke color becau


Total Chunks Created: 99

--- Chunk 1 ---
Metadata: {'source': 'boa_ppt_2024_q2.pdf', 'bank': 'Bank of America', 'quarter': 'PPT'}
Content preview: <Bank of America PPT>
Lee McEntire 
Good morning. Welcome. Thank you for joining the call to review our second quarter results. Our earnings 
release documents are available on the Investor Relations section of the bankofamerica.com website, and 
they include the earnings presentation that we will m ...


--- Chunk 2 ---
Metadata: {'source': 'boa_ppt_2024_q2.pdf', 'bank': 'Bank of America', 'quarter': 'PPT'}
Content preview: We're not complacent with the 
success you see on this page. We continue to strategically invest in our core businesses. A few examples. While we have the leading retail deposit share in America, we continue to invest and have 
opened 11 new financial centers this quarter in the first half of the ye ...


--- Chunk 3 ---
Metadata: {'source': 'boa_ppt_2024_q2.pdf', 'bank': 'Bank of America', 'quarter': 'PPT'}
Content pr

In [12]:
structured_query = '''{
    "intent": "in-depth",
    "banks": ["JP Morgan Chase", "Bank of America"],
    "quarters": ["1Q2025", "4Q2024", "3Q2024"],
    "metrics": ["EarningsPerShare", "NetIncome"]
}'''

parsed_query = ParsedRequest.model_validate(json.loads(structured_query))

# Search top relevant chunks based on user's intent
matched_chunks = search_chunks(all_chunks, parsed_query, top_k=5)

# Generate final answer with LLM (LLaMA 3.2)
final_answer = rerank_with_chatollama(matched_chunks, parsed_query)
print(final_answer)

Here is a summary of the relevant details in a paragraph:

Bank of America reported its quarterly earnings, with net income of $1.9 billion and revenue of $6.0 billion. The company saw an increase in non-interest expense by 6% driven by investments in technology and operations. However, the revenue was flat due to gains from leveraged finance positions offsetting lower Net Interest Income (NII). Bank of America also reported its market share and relationships with major corporations, including 78% coverage of the Global Fortune 500 and 95% coverage of the U.S. corporate banking market.

Additionally, the company announced plans to expand its network in key markets, focusing on building out branches in cities like Columbus, where it already has a significant presence. This will allow for more efficient operations and better customer engagement, particularly in terms of digital services.


In [16]:
structured_query = result.model_dump_json()

parsed_query = ParsedRequest.model_validate(json.loads(structured_query))

# Search top relevant chunks based on user's intent
matched_chunks = search_chunks(all_chunks, parsed_query, top_k=5)

# Generate final answer with LLM (LLaMA 3.2)
final_answer = rerank_with_chatollama(matched_chunks, parsed_query)
print(final_answer)

Based on the provided text, here is a summary of the relevant details:

**Bank Performance**

* Bank of America (BofA) reported $6.0 billion in revenue, net of interest expense, for 2Q2025.
* Net income was $1.9 billion.
* The bank's efficiency ratio improved due to increased operating leverage from NII growth.

**Business Highlights**

* BofA maintained its position as the number one investment banking firm and won the "U.S. Corporate Banking & Best Bank Award".
* The bank has relationships with 78% of the Global Fortune 500 and 95% of the U.S. Global Fortune 500.
* BofA's average deposits were $100 million per branch, exceeding expectations due to density and capacity.

**Future Plans**

* The bank aims to expand its presence in top markets across the country to cover the American population efficiently and effectively.
* BofA is focusing on building a network in key locations, such as Columbus, Ohio, rather than expanding rapidly.
* The bank expects to continue growing organically a