In [1]:
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTTextLine
import re
import pandas as pd
import json
from typing import List, Type, Literal 
from pydantic import BaseModel
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage
import json
from langchain.vectorstores import FAISS
import os
from glob import glob
from IPython.display import HTML, Markdown 



In [2]:
# Root of the local project checkout; the PDF reports are read from its `docs` sub-folder.
dic = r'C:\Users\Akshaya V\git\Earnings research\Earnings_agent\PublicReportResearch-main'
full_path = os.path.join(dic, 'docs')
# The user question that is pushed through the whole pipeline below.
question="Extract the net revenue of citi bank"
# Ollama model names. NOTE: the helper functions further down spell out the
# embedding model name themselves rather than reading `embedding_model`.
llm_model_name="qwen3:4b"
embedding_model="nomic-embed-text"
sec_excel_path=r'C:/Users/Akshaya V/git/Earnings research/Earnings_agent/PublicReportResearch-main/50_metrics.xlsx'
# Load the workbook through `sec_excel_path` so the location is stated exactly
# once and the frame used for the allow-lists cannot drift from the path that
# is later handed to get_financial_data().
sec_excel=pd.read_excel(sec_excel_path)
# Every column from the third one onward is treated as a metric name.
allowed_metrics: List[str] = sec_excel.columns[2:].unique().tolist()
# Distinct values of the CompanyName column form the bank allow-list.
allowed_banks: List[str] = sec_excel['CompanyName'].unique().tolist()

**Intent of the question - Exact or vague**

In [None]:
# ---------------------------
# 1. Define the Pydantic Schema
# ---------------------------
class ParsedRequest_intent(BaseModel):
    # Shape of the classifier reply: a JSON object with the single key "intent".
    # The class is passed to intent() as `schema`, but that function returns the
    # raw reply text without validating it against this model.
    intent: str                 # label requested by the classifier prompt, e.g. "exact"
 

# ---------------------------
# 2. Function to Use ChatOllama with Pydantic Schema
# ---------------------------
def intent(
    llm_model_name: str,
    user_input: str,
    schema: Type[BaseModel]
) -> str:
    """Ask a local Ollama chat model whether a question is exact or needs clarification.

    Args:
        llm_model_name: Name of the Ollama model handed to ``ChatOllama``.
        user_input: The user's question; surrounding whitespace is stripped
            before it is sent as the human message.
        schema: Pydantic model describing the expected reply. Accepted for
            interface compatibility only; the reply is neither parsed nor
            validated against it here.

    Returns:
        The model's reply text with surrounding whitespace stripped. The
        system prompt asks for a JSON object with the single key ``intent``,
        but the text is returned as-is, so the caller must parse it.
    """
    system_prompt = """You are a expert in classifying questions into 2 categories. The 2 categories are exact question , needs_clarification. A question is marked as exact if it has concrete details in 3 categories - company name, quarter & Year , metrics to be analysed. It is needs_clarification if the user uses words like analyse, in detail , research, elaborate or if the user doesn't provide specific metrics or company name or year & quarter to be analysed . give the output in JSON format srtictly . JSON has one key and it is called intent . For example ) {"intent": "exact"}"""
    # Load the Ollama model
    llm = ChatOllama(model=llm_model_name)

    # Create and send the prompt
    messages = [
        SystemMessage(content=system_prompt.strip()),
        HumanMessage(content=user_input.strip())
    ]
    response = llm.invoke(messages)
    return response.content.strip()



    

# Classify the configured question with the chat model.
result = intent(
    llm_model_name=llm_model_name,
    user_input=question,
    schema=ParsedRequest_intent
)
# Keep only the text after the last "</think>" marker (the whole reply when
# the marker is absent).
intent1 = result.split("</think>")[-1].strip()
# The reply may arrive wrapped in a Markdown code fence even though the prompt
# asks for bare JSON; unwrap it so json.loads does not fail on the backticks.
_fenced_intent = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", intent1, re.DOTALL)
if _fenced_intent:
    intent1 = _fenced_intent.group(1)
# NOTE: this rebinds the name `intent` from the function to the parsed dict
# (e.g. {"intent": "exact"}). Later cells merge this dict into the structured
# query, so the name and the dict type are kept.
intent = json.loads(intent1)
print(intent)


**REPHRASING VAGUE QUESTION**

In [None]:
def vague(model, question, allowed_banks, allowed_metrics):
    """Have an Ollama chat model rewrite a vague question into an explicit one.

    The prompt tells the model to spell out banks, metrics and quarters, to
    fall back to the listed defaults when the user gives none, and to reply
    with the rewritten question only.

    Args:
        model: Name of the Ollama model handed to ``ChatOllama``.
        question: The user's original question, inserted at the end of the prompt.
        allowed_banks: Bank names the model is told to restrict itself to.
        allowed_metrics: Metric names the model is told to restrict itself to.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    # Serialise the allow-lists once so the template below stays readable.
    banks_json = json.dumps(allowed_banks)
    metrics_json = json.dumps(allowed_metrics)

    rewrite_request = f"""
You are an expert in rewriting vague finance-related questions. Your sole task is to **rephrase the user's question** by **expanding it explicitly** along three dimensions:
- Company names (banks)
- Financial metrics
- Quarters and Years

Important:
- DO NOT provide an answer, explanation, or rationale.
- ONLY return the **rewritten question** in plain text.

Defaults (if the user doesn’t specify):
1. Company: Wells Fargo
2. Quarters: 1Q2025, 4Q2024, 3Q2024
3. Metrics: NetIncome, EarningsPerShare, TotalRevenue
4. Add important metrics for senior leadership (e.g., ReturnOnEquity, ROA, CET1Ratio)

Rules:
- Use only official bank names from: {banks_json}
- Convert quarters to format: "1Q2025"
- Use only metrics from: {metrics_json}

Example:
User: Extract the net revenue of citi bank in 2025Q1  
Output: Extract TotalRevenue, NetIncome, EarningsPerShare, and ReturnOnEquity for Citigroup Inc in 1Q2025, 4Q2024, and 3Q2024.

Respond ONLY with the rewritten version of the user’s question.
Here is the question:  {question}
    """

    # The prompt is sent as one plain string, not as a message list.
    reply = ChatOllama(model=model).invoke(rewrite_request)
    return reply.content.strip()

# `intent` holds the parsed classifier reply (a dict such as {"intent": "exact"}),
# so the label has to be read out of it: comparing the dict itself with the
# string "exact" is never equal and would send every question through the
# rephrasing step.
intent_label = intent.get("intent", "") if isinstance(intent, dict) else intent
# The classifier prompt calls the category "exact question" and shows "exact"
# in its example, so both spellings count as an exact question.
if str(intent_label).strip().lower() not in ("exact", "exact question"):
    result = vague(
        model=llm_model_name,
        question=question,
        allowed_banks=allowed_banks,
        allowed_metrics=allowed_metrics
    )
    # Keep only the text after the last "</think>" marker, then replace the
    # working question with the rewritten one for the rest of the notebook.
    question = result.split("</think>")[-1].strip()
    print(question)

**Getting Pydantic schema inputs**

In [None]:
# ---------------------------
# 1. Define the Pydantic Schema
# ---------------------------
class ParsedRequest(BaseModel):
    # Structured form of a user question: which banks, quarters and metrics to
    # look up. The fields are plain string lists; nothing in this class checks
    # them against allowed_banks / allowed_metrics.
    # NOTE: a later cell defines ParsedRequest again with an extra `intent`
    # field, and that definition replaces this one from then on.
                 
    banks: List[str]            # expected to match allowed_banks (not enforced here)
    quarters: List[str]         # e.g., "1Q2025", "4Q2024"
    metrics: List[str]          # expected to match allowed_metrics (not enforced here)


# ---------------------------
# 2. Function to Use ChatOllama with Pydantic Schema
# ---------------------------
def parse_with_chatollama(
        llm_model_name: str,
        user_input: str,
        schema: Type[BaseModel],
        allowed_banks: List[str],
        allowed_metrics: List[str]
    ) -> str:
    """Ask a local Ollama chat model to extract banks, quarters and metrics from a question.

    Args:
        llm_model_name: Name of the Ollama model handed to ``ChatOllama``.
        user_input: The (possibly rephrased) question; surrounding whitespace
            is stripped before it is sent as the human message.
        schema: Pydantic model describing the expected reply. Accepted for
            interface compatibility only; the reply is neither parsed nor
            validated against it here.
        allowed_banks: Official bank names, embedded in the prompt as JSON.
        allowed_metrics: Permitted metric keys, embedded in the prompt as JSON.

    Returns:
        The model's reply text with surrounding whitespace stripped. The
        prompt asks for a JSON object with ``banks``, ``quarters`` and
        ``metrics`` keys, but the text is returned as-is, so the caller must
        parse it.
    """
    system_prompt = f"""
You are a financial assistant. Your task is to extract structured information from user input and return it in the following JSON format:

{{
  
  "banks": [valid bank names],
  "quarters": ["1Q2025", "4Q2024", "3Q2024"],
  "metrics": [valid metric keys]
}}

Rules:
- Map any abbreviation or alias to official bank names from this list: {json.dumps(allowed_banks)}
- Extract all mentioned quarters in "1Q2025" format. Include the previous 2 quarters for each.
- Extract only metrics listed here: {json.dumps(allowed_metrics)}.
- Output only the JSON structure as shown above, no explanation or markdown.
"""

    # Load the Ollama model
    llm = ChatOllama(model=llm_model_name)

    # Create and send the prompt
    messages = [
        SystemMessage(content=system_prompt.strip()),
        HumanMessage(content=user_input.strip())
    ]
    response = llm.invoke(messages)
    return response.content.strip()

    

In [None]:

# Extract banks, quarters and metrics from the (possibly rephrased) question.
result = parse_with_chatollama(
    llm_model_name=llm_model_name,
    user_input=question,
    schema=ParsedRequest,
    allowed_banks=allowed_banks,
    allowed_metrics=allowed_metrics
)
# Keep only the text after the last "</think>" marker (the whole reply when
# the marker is absent).
response = result.split("</think>")[-1].strip()
print(response)

# The reply may arrive wrapped in a Markdown code fence even though the prompt
# forbids markdown; unwrap it so json.loads does not fail on the backticks.
_fenced_params = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", response, re.DOTALL)
input_params = json.loads(_fenced_params.group(1) if _fenced_params else response)
print(input_params)

**PDF Reader RAG**

In [None]:
class ParsedRequest(BaseModel):
    # Full structured query: the classifier's intent label plus the extracted
    # banks, quarters and metrics.
    # NOTE: this redefines (and from here on replaces) the ParsedRequest class
    # declared in an earlier cell, which had no `intent` field.
    intent: str            # classifier label taken from the intent step
    banks: List[str]       # bank names used to build the retrieval query
    quarters: List[str]    # quarter labels in the "1Q2025" style requested by the prompts
    metrics: List[str]     # metric names used to build the retrieval query

def is_natural_language(text):
    """Return True when `text` looks like prose rather than a table.

    Prose is approximated as: some line contains a run of at least four ASCII
    letters followed later on that same line by a period, and the block is not
    classified as table-like by is_table_like().
    """
    sentence_hit = re.search(r"[A-Za-z]{4,}.*\.", text)
    if sentence_hit is None:
        # No sentence-like line at all; the table check is not needed.
        return False
    return not is_table_like(text)

def is_table_like(text):
    """Heuristically decide whether a block of text looks like tabular data.

    A line counts as table-like when the first of these that applies holds:
      * it has at least three whitespace-separated tokens and more than half
        of them consist only of digits, ``,``, ``.``, ``%`` and ``$``;
      * it contains more than one dollar amount (``$`` optionally followed by
        one whitespace character, then a digit);
      * it contains more than one run of two or more digits followed by a comma.

    Args:
        text: The text block to inspect.

    Returns:
        True when more than 40% of the block's lines are table-like. Blocks
        with fewer than two lines (after stripping) are never table-like.
    """
    lines = text.strip().splitlines()
    if len(lines) < 2:
        return False

    table_like = 0
    for line in lines:
        tokens = line.split()
        numeric_tokens = sum(1 for token in tokens if re.fullmatch(r"[\d,.%$]+", token))

        # The length guard also protects the ratio from dividing by zero on
        # blank lines.
        if len(tokens) >= 3 and numeric_tokens / len(tokens) > 0.5:
            table_like += 1
        elif len(re.findall(r"\$\s?\d", line)) > 1:  # multiple dollar values
            table_like += 1
        elif len(re.findall(r"\d{2,},", line)) > 1:  # multiple comma-grouped numbers
            table_like += 1

    return table_like / len(lines) > 0.4

def extract_text_excluding_tables(pdf_path):
    """Collect the prose-like text blocks of a PDF into one string.

    Every layout element that is an ``LTTextBox`` or ``LTTextLine`` has its
    text stripped; non-empty blocks that pass is_natural_language() are kept
    in page/element order.

    Args:
        pdf_path: Path of the PDF handed to pdfminer's ``extract_pages``.

    Returns:
        The kept blocks joined by blank lines, stripped at both ends
        (an empty string when nothing qualifies).
    """
    kept_blocks = []

    for layout in extract_pages(pdf_path):
        text_elements = (
            item for item in layout if isinstance(item, (LTTextBox, LTTextLine))
        )
        for item in text_elements:
            block = item.get_text().strip()
            if not block:
                continue
            if is_natural_language(block):
                kept_blocks.append(block)

    joined = "\n\n".join(kept_blocks)
    return joined.strip()

def parse_filename_metadata(filename: str):
    """Derive ``(bank, quarter)`` from a file name shaped like ``<code>_<quarter>[_...].<ext>``.

    The directory part and the extension are ignored. The first
    underscore-separated piece is looked up, case-insensitively, in a small
    table of known bank codes; unknown codes are returned upper-cased. The
    second piece, upper-cased, is the quarter, or ``"UNKNOWN"`` when the name
    contains no underscore.
    """
    known_banks = {
        "jpm": "JP Morgan Chase",
        "boa": "Bank of America",
        "citi": "Citigroup",
        "gs": "Goldman Sachs",
        "ms": "Morgan Stanley",
    }

    stem, _extension = os.path.splitext(os.path.basename(filename))
    code, *remaining = stem.split("_")
    code = code.lower()

    if code in known_banks:
        bank = known_banks[code]
    else:
        bank = code.upper()

    quarter = remaining[0].upper() if remaining else "UNKNOWN"
    return bank, quarter
def create_chunks_with_ollama(
    text: str,
    metadata: dict = None,
    embedding_model_name: str = "nomic-embed-text",
    min_chunk_size: int = 2000,
):
    """Split `text` into semantic chunks using Ollama embeddings.

    Args:
        text: The full document text to split.
        metadata: Metadata attached to the source document; an empty dict is
            used when omitted.
        embedding_model_name: Ollama embedding model used by the chunker.
            Defaults to the model name that used to be hard-coded here.
        min_chunk_size: Value forwarded to ``SemanticChunker(min_chunk_size=...)``.
            Defaults to the previously hard-coded 2000.

    Returns:
        The list produced by ``SemanticChunker.split_documents`` for the
        single wrapped document.
    """
    embedder = OllamaEmbeddings(model=embedding_model_name)
    chunker = SemanticChunker(embeddings=embedder, min_chunk_size=min_chunk_size)

    doc = Document(page_content=text, metadata=metadata or {})
    return chunker.split_documents([doc])
def search_chunks(chunks, parsed_query: ParsedRequest, top_k=5, embedding_model_name: str = "nomic-embed-text"):
    """Return the chunks most similar to a structured query.

    A FAISS index is built from `chunks` on every call and searched with a
    sentence assembled from the query's metrics, banks and quarters.

    Args:
        chunks: Documents to index.
        parsed_query: Structured query providing ``metrics``, ``banks`` and
            ``quarters`` lists of strings.
        top_k: Number of matches requested from the similarity search.
        embedding_model_name: Ollama embedding model used for both indexing
            and querying. Defaults to the name that used to be hard-coded here.

    Returns:
        The documents returned by ``similarity_search``, or an empty list when
        there is nothing to index.
    """
    if not chunks:
        # NOTE(review): FAISS.from_documents appears unable to build an index
        # from zero documents (it fails with an obscure error) - confirm
        # against the installed version. No documents means no matches.
        return []

    embedder = OllamaEmbeddings(model=embedding_model_name)
    vectorstore = FAISS.from_documents(chunks, embedder)

    query_text = (
        f"Find information about {', '.join(parsed_query.metrics)} "
        f"for banks like {', '.join(parsed_query.banks)} "
        f"during quarters such as {', '.join(parsed_query.quarters)}"
    )

    return vectorstore.similarity_search(query_text, k=top_k)
def rerank_with_chatollama(chunks, parsed_query: ParsedRequest):
    """Ask the ``llama3.2:3b`` Ollama model to summarise chunks for a query.

    Despite the name, nothing is re-ordered: the chunk texts are joined with
    blank lines and sent with the JSON-serialised query in a single human
    message that requests a summary.

    Args:
        chunks: Documents whose ``page_content`` forms the context.
        parsed_query: Structured query, serialised with ``model_dump_json``.

    Returns:
        The content of the model's reply.
    """
    llm = ChatOllama(model="llama3.2:3b")
    context = "\n\n".join(chunk.page_content for chunk in chunks)

    # Assemble the prompt from its four sections, in order.
    sections = [
        "You are a financial analyst assistant.\n\n",
        f"Query:\n{parsed_query.model_dump_json(indent=2)}\n\n",
        f"Extracted text:\n{context}\n\n",
        "Please summarize the relevant details in a table or paragraph based on the query intent.",
    ]
    message = HumanMessage(content="".join(sections))

    reply = llm.invoke([message])
    return reply.content
# Chunks from every PDF in the docs folder, accumulated in processing order.
all_chunks = []

# os.listdir returns names in arbitrary order; sorting makes the chunk order
# (and therefore the preview below) the same on every run.
for pdf in sorted(os.listdir(full_path)):
    if not pdf.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(full_path, pdf)
    print(f" Processing: {pdf}")

    # Bank and quarter come from the file name, not from the PDF contents.
    bank, quarter = parse_filename_metadata(pdf)
    clean_text = extract_text_excluding_tables(pdf_path)

    # Wrap in custom tags for traceability
    tagged_text = f"<{bank} {quarter}>\n{clean_text}\n</{bank} {quarter}>"

    # Metadata
    metadata = {
        "source": pdf,
        "bank": bank,
        "quarter": quarter
    }

    # Chunk with metadata
    chunks = create_chunks_with_ollama(tagged_text, metadata)
    all_chunks.extend(chunks)

# Print preview
print(f"\nTotal Chunks Created: {len(all_chunks)}")
for i, chunk in enumerate(all_chunks[:3]):
    print(f"\n--- Chunk {i+1} ---")
    print("Metadata:", chunk.metadata)
    print("Content preview:", chunk.page_content[:300], "...\n")

In [None]:
# structured_query = '''{
#     "intent": "in-depth",
#     "banks": ["JP Morgan Chase", "Bank of America"],
#     "quarters": ["1Q2025", "4Q2024", "3Q2024"],
#     "metrics": ["EarningsPerShare", "NetIncome"]
# }'''
# Merge the classifier result with the extracted parameters into one query.
# On a key clash the value from `input_params` wins.
print(intent, input_params)
structured_query = dict(intent)
structured_query.update(input_params)
print(structured_query)

# Keep a JSON text copy as well; the Excel extraction cells read it back.
structured_query_json = json.dumps(structured_query, indent=2)
print(structured_query_json)

# Validate the merged query against the ParsedRequest schema.
query_payload = json.loads(structured_query_json)
parsed_query = ParsedRequest.model_validate(query_payload)

# Retrieve the five chunks closest to the structured query.
matched_chunks = search_chunks(all_chunks, parsed_query, top_k=5)

# Announce each match, then print the matched texts concatenated back to back
# (no separator is inserted between chunks).
for position in range(1, len(matched_chunks) + 1):
    print(f"\n--- Matched Chunk {position} ---")
final_ans = "".join(chunk.page_content for chunk in matched_chunks)
print(final_ans)


**EXCEL EXTRACTION**

In [None]:


def convert_to_quarter(date_str):
    """Format a date as ``<quarter>Q<year>``, e.g. ``"1Q2025"``.

    The input is parsed with ``pd.to_datetime``; months 1-3 map to quarter 1,
    4-6 to quarter 2, and so on.
    """
    stamp = pd.to_datetime(date_str)
    # Adding 2 before the floor division maps month 1..12 onto quarter 1..4.
    quarter_number = (stamp.month + 2) // 3
    return f"{quarter_number}Q{stamp.year}"

def _to_native(value):
    """Turn a numpy scalar into the matching built-in Python value; pass anything else through."""
    # numpy scalars report ndim == 0 and expose .item(); plain Python values,
    # None and pandas containers do not satisfy both conditions.
    if getattr(value, "ndim", None) == 0 and hasattr(value, "item"):
        return value.item()
    return value


def get_financial_data(excel_path: str, input_data: dict) -> dict:
    """Look up requested metrics per bank and quarter in the metrics workbook.

    The workbook is read with ``pd.read_excel``; column names are stripped and
    a ``Quarter`` column is derived from ``Datetime`` via convert_to_quarter().

    Args:
        excel_path: Path of the Excel file. It must provide ``CompanyName``
            and ``Datetime`` columns.
        input_data: Mapping with optional ``banks``, ``quarters`` and
            ``metrics`` lists; a missing key is treated as an empty list.

    Returns:
        ``{bank: {quarter: {metric: value}}}``. Banks with no rows are
        omitted, as are quarters with no rows for a bank. When several rows
        match, the first one is used. A metric that is not a column maps to
        ``None``. Values are converted from numpy scalars to built-in Python
        types so the result can be passed to ``json.dumps``.
    """
    # Load Excel
    df = pd.read_excel(excel_path)

    # Clean columns and parse quarter
    df.columns = df.columns.str.strip()
    df['Quarter'] = df['Datetime'].apply(convert_to_quarter)
    # Stripped company names are the same for every bank; compute them once.
    company_names = df['CompanyName'].str.strip()

    # Input filters
    banks = input_data.get("banks", [])
    quarters = input_data.get("quarters", [])
    metrics = input_data.get("metrics", [])

    result = {}

    for bank in banks:
        bank_df = df[company_names == bank.strip()]
        if bank_df.empty:
            continue

        bank_result = {}
        for quarter in quarters:
            quarter_df = bank_df[bank_df['Quarter'] == quarter]
            if quarter_df.empty:
                continue

            row = quarter_df.iloc[0]  # Take first match
            bank_result[quarter] = {
                # `metric in row` tests the row's column labels.
                metric: (_to_native(row[metric]) if metric in row else None)
                for metric in metrics
            }

        result[bank] = bank_result

    return result


In [None]:
# Re-read the structured query from its JSON text form as a plain dict.
input_json = structured_query_json
input1 = json.loads(input_json)

# Show the container types of the query and of its bank list.
for _probe in (input1, input1.get('banks')):
    print(type(_probe))

# Pull the requested metrics out of the metrics workbook.
excel_path = sec_excel_path
output = get_financial_data(excel_path, input1)

# Keep an indented JSON rendering of the result (it is stored, not printed).
table_output = json.dumps(output, indent=2)


In [None]:
# Indented JSON rendering of the extracted metrics. This repeats the final
# statement of the previous cell, so it only changes anything when `output`
# has been changed in between.
table_output=json.dumps(output, indent=2)

**Final Table and Text**

In [None]:


def generate_financial_html_response(llm_model_name: str, parsed_query: dict, extracted_text: str, user_question: str = None) -> str:
    """Ask an Ollama chat model for an HTML report built from the query and the retrieved text.

    Args:
        llm_model_name: Name of the Ollama model handed to ``ChatOllama``.
        parsed_query: The structured request. It is embedded in the prompt as
            indented JSON (a pydantic model is serialised with
            ``model_dump_json``; anything else with ``json.dumps``).
        extracted_text: Report text retrieved for the query.
        user_question: The question the narrative should answer. When omitted,
            the notebook-level ``question`` variable is used, which is what
            this function always read before the parameter existed.

    Returns:
        The content of the model's reply, unmodified (any ``<think>`` preamble
        the model emits is still included).
    """
    if user_question is None:
        # Backward-compatible fallback to the notebook-level question.
        user_question = question

    # Interpolating the dict directly would print its Python repr (single
    # quotes, None/True) inside the ```json fence; emit real JSON instead.
    if hasattr(parsed_query, "model_dump_json"):
        request_json = parsed_query.model_dump_json(indent=2)
    else:
        request_json = json.dumps(parsed_query, indent=2, default=str)

    llm = ChatOllama(model=llm_model_name)

    prompt = f"""
You are a professional financial analyst creating an internal report for Wells Fargo senior leadership.
Your response must contain three clearly formatted parts in HTML TAgs with embedded CSS to make it visually appealing and boardroom-ready.

### Instructions:
1. **Part 1**: Present the input JSON request as a table and add HTML Table tags (no changes in values).
2. **Part 2**: Use the extracted text to create a detailed narrative summary based on the below question asked. Structure this as a qualitative analysis that highlights trends, anomalies, risks, and growth areas.
3. **Part 3**: Ensure the writing style is impressive to senior executives — use formal, insightful, and concise language.

### Formatting Rules:
- Use <table>, <thead>, <tbody>, <tr>, <th>, <td> for tables
- Use <p>, <h2>, <h3> for textual sections
- Use inline CSS to match Wells Fargo branding (deep red: #b31b1b, gold: #ffd700, and professional fonts)
- Make layout visually appealing (padding, borders, alternating row colors, aligned text)
- Output ONLY valid HTML tags (no Markdown or commentary)

---

JSON Request:
```json
{request_json}
```

Extracted Report Text:
{extracted_text}

User question :
{user_question}
"""

    response = llm.invoke([HumanMessage(content=prompt.strip())])
    return response.content

# Example usage
# html_output = generate_financial_html_response("qwen2.5:7b", parsed_query, extracted_text)
# display(HTML(html_output))


In [None]:
# Use the plain-dict form of the structured query for the report prompt.
parsed_query = input1

# Build the report from the concatenated matched chunks. The function defined
# above is generate_financial_html_response; the previously called name
# generate_detailed_html_report is not defined in this notebook, so the call
# raised NameError.
html_output = generate_financial_html_response(llm_model_name, parsed_query, final_ans)

# Render the HTML inline. Only the text after the last "</think>" marker is
# shown, as with the other model replies in this notebook; `html_output`
# itself keeps the raw reply.
display(HTML(html_output.split("</think>")[-1].strip()))

In [None]:
# Cell output: the part of the reply after the last "</think>" marker (the
# whole reply if the marker is absent), with surrounding whitespace removed.
(html_output.split("</think>")[-1].strip())

In [None]:
# Cell output: the raw model reply, including anything before "</think>".
html_output