In [5]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.document_loaders import TextLoader
from langchain_community.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import RetrievalQA

# 1. Load and split docs
loader = TextLoader("citi.txt", encoding='utf-8')
docs = loader.load()
embedding = OllamaEmbeddings(model="nomic-embed-text:v1.5")
splitter = SemanticChunker(embedding)
split_docs = splitter.split_documents(docs)

# 2. Vector DB
db = FAISS.from_documents(split_docs, embedding)

# 3. Base LLM
llm = ChatOllama(model="llama3.2:3b")

# 4. Multi-query retriever
multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=db.as_retriever(), llm=llm
)

# 5. QA chain with multi-query retriever
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=multi_query_retriever)

# 6. Ask a question
response = qa_chain.run("What is the EPS of citi bank in 2025 Q1?")
print(response)


ValueError: Error raised by inference API HTTP code: 404, {"error":"model \"nomic-embed-text:v1.5\" not found, try pulling it first"}

📄 Text File (citi.txt)
   ↓
🧩 Split into Chunks
   ↓
🔢 Embedded into Vectors
   ↓
🗃️ Stored in FAISS
   ↓
❓ You Ask a Question
   ↓
🔍 Relevant Chunks Retrieved
   ↓
🧠 Sent to LLM (LLaMA 3.2)
   ↓
💬 Final Answer Generated


In [4]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.document_loaders import TextLoader
from langchain_community.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import RetrievalQA

class MultiPromptQA:
    def __init__(self, file_path: str, embedding_model: str, llm_model: str):
        self.file_path = file_path
        self.embedding_model = embedding_model
        self.llm_model = llm_model

        self.docs = self.load_and_split()
        self.db = self.create_vectorstore()
        self.llm = ChatOllama(model=self.llm_model)
        self.retriever = self.create_multi_query_retriever()
        self.qa_chain = RetrievalQA.from_chain_type(llm=self.llm, retriever=self.retriever)

    def load_and_split(self):
        loader = TextLoader(self.file_path, encoding='utf-8')
        docs = loader.load()

        embedding = OllamaEmbeddings(model=self.embedding_model)
        splitter = SemanticChunker(embedding)
        split_docs = splitter.split_documents(docs)

        return split_docs

    def create_vectorstore(self):
        embedding = OllamaEmbeddings(model=self.embedding_model)
        return FAISS.from_documents(self.docs, embedding)

    def create_multi_query_retriever(self):
        return MultiQueryRetriever.from_llm(
            retriever=self.db.as_retriever(),
            llm=self.llm,
            include_original=True
        )

    def ask(self, question: str) -> str:
        return self.qa_chain.run(question)


qa = MultiPromptQA(
    file_path="C:/Users/Akshaya V/git/Earnings research/Earnings_agent/citi.txt",
    embedding_model="qwen2.5:0.5b",
    llm_model="qwen2.5:7b"
)

response = qa.ask("What is the EPS of citi in 2025Q1?")
print(response)


The diluted EPS (Earnings Per Share) for Citigroup Inc. in Q1 2025 was $1.96 per share.


In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.document_loaders import TextLoader
from langchain_community.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import RetrievalQA

class MultiPromptQA:
    def __init__(self, file_paths: list, embedding_model: str, llm_model: str):
        self.file_paths = file_paths
        self.embedding_model = embedding_model
        self.llm_model = llm_model

        self.docs = self.load_and_split_all()
        self.db = self.create_vectorstore()
        self.llm = ChatOllama(model=self.llm_model)
        self.retriever = self.create_multi_query_retriever()
        self.qa_chain = RetrievalQA.from_chain_type(llm=self.llm, retriever=self.retriever)

    def load_and_split_all(self):
        all_split_docs = []
        embedding = OllamaEmbeddings(model=self.embedding_model)
        splitter = SemanticChunker(embedding)

        for path in self.file_paths:
            loader = TextLoader(path, encoding='utf-8')
            docs = loader.load()
            split_docs = splitter.split_documents(docs)
            all_split_docs.extend(split_docs)

        return all_split_docs

    def create_vectorstore(self):
        embedding = OllamaEmbeddings(model=self.embedding_model)
        return FAISS.from_documents(self.docs, embedding)

    def create_multi_query_retriever(self):
        return MultiQueryRetriever.from_llm(
            retriever=self.db.as_retriever(),
            llm=self.llm,
            include_original=True
        )

    def ask(self, question: str) -> str:
        return self.qa_chain.run(question)
qa = MultiPromptQA(
    file_paths=[
        "C:/Users/Akshaya V/git/Earnings research/Earnings_agent/citi.txt",
        "C:/Users/Akshaya V/git/Earnings research/Earnings_agent/jpmc.txt"
    ],
    embedding_model="qwen2.5:0.5b",
    llm_model="qwen2.5:7b"
)




In [None]:
response = qa.ask("Compare the EPS of Citi and JPMorgan in 2025Q1.")
print(response)

In [None]:
$$ 
RAG + ReACT 
$$

In [14]:
# Ensure required packages are installed:
# pip install langchain pypdf faiss-cpu

from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent, Tool
from langchain.agents.agent_types import AgentType
from langchain.memory import ConversationBufferMemory
from langchain.tools import tool

# 🔧 MultiPromptQA class definition with PDF support
class MultiPromptQA:
    def __init__(self, file_paths: list, embedding_model: str, llm_model: str):
        self.file_paths = file_paths
        self.embedding_model = embedding_model
        self.llm_model = llm_model

        self.docs = self.load_and_split_all()
        self.db = self.create_vectorstore()
        self.llm = ChatOllama(model=self.llm_model)
        self.retriever = self.create_multi_query_retriever()
        self.qa_chain = RetrievalQA.from_chain_type(llm=self.llm, retriever=self.retriever)

    def load_and_split_all(self):
        all_split_docs = []
        embedding = OllamaEmbeddings(model=self.embedding_model)
        splitter = SemanticChunker(embedding)

        for path in self.file_paths:
            loader = PyPDFLoader(path) if path.endswith(".pdf") else TextLoader(path, encoding='utf-8')
            docs = loader.load()
            split_docs = splitter.split_documents(docs)
            all_split_docs.extend(split_docs)

        return all_split_docs

    def create_vectorstore(self):
        embedding = OllamaEmbeddings(model=self.embedding_model)
        return FAISS.from_documents(self.docs, embedding)

    def create_multi_query_retriever(self):
        return MultiQueryRetriever.from_llm(
            retriever=self.db.as_retriever(),
            llm=self.llm,
            include_original=True
        )

    def ask(self, question: str) -> str:
        return self.qa_chain.run(question)


# ✅ Tool 1: Read a text or PDF file
@tool
def read_file(bank_file_name: str) -> str:
    """Reads the contents of a local text or PDF file. Example: 'citi.pdf' or 'jpmc.txt'"""
    try:
        file_path = f"C:/Users/Akshaya V/git/CG/{bank_file_name}"
        if bank_file_name.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path, encoding='utf-8')
        pages = loader.load()
        return "\n\n".join([p.page_content for p in pages])
    except Exception as e:
        return f"Error reading file: {str(e)}"


# ✅ Tool 2: Wrap MultiPromptQA as a Tool
rag_qa = MultiPromptQA(
    file_paths=[
        "C:/Users/Akshaya V/git/Earnings research/Earnings_agent/citi.pdf",
        "C:/Users/Akshaya V/git/Earnings research/Earnings_agent/jpmc.pdf"
    ],
    embedding_model="qwen2.5:0.5b",
    llm_model="qwen2.5:7b"
)

rag_tool = Tool(
    name="earnings_qa_tool",
    func=rag_qa.ask,
    description="Use this tool to answer questions related to quarterly earnings from Citi and JPMorgan PDF reports."
)

# ✅ Final tool list
tools = [read_file, rag_tool]

# ✅ Load LLM & memory
llm = ChatOllama(model="llama3.2:3b")
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# ✅ Initialize the ReAct agent
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
    handle_parsing_errors=True
)

# ✅ Ask the agent something
response = agent.run("Compare the EPS, net income of citi in Q12025?")
print("\n🤖 Final Answer:", response)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mQuestion: Compare the EPS and net income of Citi in Q12025?
Thought: I can use the earnings_qa_tool to answer this question.
Action:
```
{
  "action": "earnings_qa_tool",
  "action_input": {
    "company": "Citi",
    "quarter": "Q1",
    "year": "2025"
  }
}
```
[0m

ToolException: Too many arguments to single-input tool earnings_qa_tool.
                Consider using StructuredTool instead. Args: ['Citi', 'Q1', '2025']

In [6]:
import os
import requests
from datetime import datetime
from bs4 import BeautifulSoup

# List of top US banks (CIKs are accurate)
top_20_banks = [
    ("JPMorgan Chase & Co.", "0000019617"),
    ("Bank of America Corporation", "0000070858"),
    ("Citigroup Inc.", "0000831001"),
    ("Wells Fargo & Company", "0000072971")
    # Add more as needed
]

search_url = "https://www.sec.gov/cgi-bin/browse-edgar"
headers = {"User-Agent": "Mozilla/5.0 (compatible; SECFetcher/1.0)"}


def download_10k_filings(cik, company_name, years=5):
    current_year = datetime.now().year
    target_dir = f"./sec_filings/{company_name.replace(' ', '_')}"
    os.makedirs(target_dir, exist_ok=True)

    for year in range(current_year - 1, current_year - years - 1, -1):
        print(f"\n🔍 Fetching {company_name} 10-K for {year}...")
        params = {
            "action": "getcompany",
            "CIK": cik,
            "type": "10-K",
            "dateb": f"{year}1231",
            "owner": "exclude",
            "count": "100",
            "output": "atom"
        }

        try:
            resp = requests.get(search_url, params=params, headers=headers)
            if resp.status_code != 200:
                print(f"❌ Failed to fetch data for {year}")
                continue

            soup = BeautifulSoup(resp.content, "lxml-xml")  # <-- FIX: use lxml-xml for XML parsing
            entries = soup.find_all("entry")

            if not entries:
                print(f"⚠️ No entries found for {year}")
                continue

            for entry in entries:
                filing_date = entry.find("filing-date").text
                doc_url = entry.find("filing-href").text.replace("-index.htm", ".txt")

                print(f"⬇️  Downloading 10-K from {filing_date}")
                filing_resp = requests.get(doc_url, headers=headers)
                if filing_resp.status_code == 200:
                    file_path = os.path.join(target_dir, f"{company_name.replace(' ', '_')}_{filing_date}.txt")
                    with open(file_path, "w", encoding="utf-8") as f:
                        f.write(filing_resp.text)
                else:
                    print("⚠️  Could not download 10-K.")
                break  # Only download the most recent 10-K per year

        except Exception as e:
            print(f"❗ Error: {e}")


# Trigger downloads for each bank
for name, cik in top_20_banks:
    download_10k_filings(cik, name)

print("\n✅ Download attempt complete.")



🔍 Fetching JPMorgan Chase & Co. 10-K for 2024...
❗ Error: Couldn't find a tree builder with the features you requested: lxml-xml. Do you need to install a parser library?

🔍 Fetching JPMorgan Chase & Co. 10-K for 2023...
❗ Error: Couldn't find a tree builder with the features you requested: lxml-xml. Do you need to install a parser library?

🔍 Fetching JPMorgan Chase & Co. 10-K for 2022...
❗ Error: Couldn't find a tree builder with the features you requested: lxml-xml. Do you need to install a parser library?

🔍 Fetching JPMorgan Chase & Co. 10-K for 2021...
❗ Error: Couldn't find a tree builder with the features you requested: lxml-xml. Do you need to install a parser library?

🔍 Fetching JPMorgan Chase & Co. 10-K for 2020...
❗ Error: Couldn't find a tree builder with the features you requested: lxml-xml. Do you need to install a parser library?

🔍 Fetching Bank of America Corporation 10-K for 2024...
❗ Error: Couldn't find a tree builder with the features you requested: lxml-xml. Do

In [None]:
# ✅ Fix: Replace Tool with StructuredTool for multi-input handling
from langchain.tools import StructuredTool
from typing import Optional

# 🧠 Fix MultiPromptQA ask method to accept kwargs
class MultiPromptQA:
    def __init__(self, file_paths: list, embedding_model: str, llm_model: str):
        self.file_paths = file_paths
        self.embedding_model = embedding_model
        self.llm_model = llm_model

        self.docs = self.load_and_split_all()
        self.db = self.create_vectorstore()
        self.llm = ChatOllama(model=self.llm_model)
        self.retriever = self.create_multi_query_retriever()
        self.qa_chain = RetrievalQA.from_chain_type(llm=self.llm, retriever=self.retriever)

    def load_and_split_all(self):
        all_split_docs = []
        embedding = OllamaEmbeddings(model=self.embedding_model)
        splitter = SemanticChunker(embedding)

        for path in self.file_paths:
            loader = PyPDFLoader(path) if path.endswith(".pdf") else TextLoader(path, encoding='utf-8')
            docs = loader.load()
            split_docs = splitter.split_documents(docs)
            all_split_docs.extend(split_docs)

        return all_split_docs

    def create_vectorstore(self):
        embedding = OllamaEmbeddings(model=self.embedding_model)
        return FAISS.from_documents(self.docs, embedding)

    def create_multi_query_retriever(self):
        return MultiQueryRetriever.from_llm(
            retriever=self.db.as_retriever(),
            llm=self.llm,
            include_original=True
        )

    def ask(self, question: str) -> str:
        return self.qa_chain.run(question)

# ✅ Use StructuredTool to pass multiple inputs properly
rag_tool = StructuredTool.from_function(
    name="earnings_qa_tool",
    func=rag_qa.ask,
    description="Answer questions related to earnings from Citi and JPMorgan PDF reports. Accepts a question as input.",
    args_schema=None  # Only one string input needed: question
)

# 🔧 Re-initialize tools and agent with StructuredTool
tools = [read_file, rag_tool]

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
    handle_parsing_errors=True
)

# ✅ Try again
response = agent.run("Compare the EPS of Citi for Q12025")
print("\n🤖 Final Answer:", response)


In [13]:
# ✅ MCP agent with RAG + MultiQueryRetriever + Dynamic Plotting
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain.chains.router import MultiPromptChain, RouterChain
from langchain.chains.llm import LLMChain
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.agents import Tool, initialize_agent, AgentType
from langchain.memory import ConversationBufferMemory
import matplotlib.pyplot as plt
import json

# ✅ Load models
llm = ChatOllama(model="llama3.2:3b")
embedding = OllamaEmbeddings(model="qwen2.5:0.5b")

# ✅ Build a MultiQueryRetriever-backed RAG chain
def build_multi_rag_chain(file_path):
    docs = PyPDFLoader(file_path).load()
    chunks = SemanticChunker(embedding).split_documents(docs)
    db = FAISS.from_documents(chunks, embedding)
    retriever = MultiQueryRetriever.from_llm(
        retriever=db.as_retriever(),
        llm=llm,
        include_original=True
    )
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# ✅ Define expert chains with bank-specific prompts
citi_rag = build_multi_rag_chain("C:/Users/Akshaya V/git/Earnings research/Earnings_agent/citi.pdf")
jpmc_rag = build_multi_rag_chain("C:/Users/Akshaya V/git/Earnings research/Earnings_agent/jpmc.pdf")

citi_prompt = PromptTemplate.from_template(
    "You are an expert in Citi earnings reports. Answer this question using Citi data only:\n{input}"
)
jpmc_prompt = PromptTemplate.from_template(
    "You are an expert in JPMorgan earnings reports. Answer this question using JPMorgan data only:\n{input}"
)

citi_chain = LLMChain(prompt=citi_prompt, llm=llm)
jpmc_chain = LLMChain(prompt=jpmc_prompt, llm=llm)

# ✅ Router to pick the right chain based on query
router_prompt = PromptTemplate.from_template(
    "Route this question to either 'citi' or 'jpmc' based on bank mentioned:\n{input}\nRoute:"
)
router_chain = LLMChain(prompt=router_prompt, llm=llm)

# ✅ MCP chain
mcp_chain = MultiPromptChain(
    router_chain=router_chain,

    destination_chains={
        "citi": RetrievalQA.from_chain_type(llm=llm, retriever=citi_rag.retriever),
        "jpmc": RetrievalQA.from_chain_type(llm=llm, retriever=jpmc_rag.retriever),
    },
    default_chain=RetrievalQA.from_chain_type(llm=llm, retriever=citi_rag.retriever),

    verbose=True
)

# ✅ Plot tool with dynamic input

def plot_dynamic_chart(data: dict, plot_type: str = "bar") -> str:
    try:
        banks = list(data.keys())
        metrics = list(data[banks[0]].keys())
        num_metrics = len(metrics)
        x = range(len(banks))

        fig, ax = plt.subplots()
        width = 0.8 / num_metrics

        for i, metric in enumerate(metrics):
            values = [data[bank][metric] for bank in banks]
            pos = [p + i * width for p in x]

            if plot_type == "bar":
                ax.bar(pos, values, width, label=metric)
            elif plot_type == "line":
                ax.plot([p + width * i for p in x], values, label=metric, marker='o')
            elif plot_type == "scatter":
                ax.scatter([p + width * i for p in x], values, label=metric)
            else:
                return f"Unsupported plot type: {plot_type}"

        ax.set_title("Bank Earnings Comparison")
        ax.set_xticks([p + width * (num_metrics - 1) / 2 for p in x])
        ax.set_xticklabels(banks)
        ax.set_ylabel("Values")
        ax.legend()
        plt.tight_layout()

        filename = f"earnings_plot_{plot_type}.png"
        plt.savefig(filename)
        return f"Chart saved as '{filename}'"
    except Exception as e:
        return f"Plotting error: {str(e)}"

# ✅ Tool wrapper for chart generator
def user_plot_tool(input: str) -> str:
    try:
        parsed = json.loads(input)
        return plot_dynamic_chart(parsed["data"], parsed.get("plot_type", "bar"))
    except Exception as e:
        return f"Invalid input for plotting: {str(e)}"

plot_tool = Tool(
    name="Earnings Chart Generator",
    func=user_plot_tool,
    description="Generate charts (bar, line, scatter) comparing earnings metrics across banks. Accepts JSON input with plot_type and data."
)

# ✅ Load tools into a ReAct agent (MCP + plot)
tools = [
    Tool(name="MCP Earnings RAG", func=mcp_chain.run, description="Ask any question about Citi or JPMorgan earnings."),
    plot_tool
]

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
    handle_parsing_errors=True
)

# ✅ Example usage
response = agent.run("What was the net income of JPMorgan in Q1 2025?")
# response = agent.run('{"plot_type": "bar", "data": {"Citi": {"EPS": 1.2}, "JPMorgan": {"EPS": 1.6}}}')


  mcp_chain = MultiPromptChain(


AttributeError: 'LLMChain' object has no attribute 'get'

In [None]:
import os
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.chat_models import ChatOllama
from langchain_community.document_loaders import PyPDFLoader, UnstructuredExcelLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import OllamaEmbeddings
from bs4 import BeautifulSoup

class RAGWithOllamaEmbed:
    def __init__(self, data_dir="./data", ollama_llm_model="qwen2:7b", ollama_embed_model="nomic-embed-text"):
        self.data_dir = data_dir
        self.ollama_llm_model = ollama_llm_model
        self.ollama_embed_model = ollama_embed_model
        self.embeddings = OllamaEmbeddings(model=self.ollama_embed_model)
        self.documents = []
        self.vectorstore = None
        self.qa_chain = None

    def load_documents(self):
        for file in os.listdir(self.data_dir):
            path = os.path.join(self.data_dir, file)
            if file.endswith(".pdf"):
                loader = PyPDFLoader(path)
                self.documents.extend(loader.load())
            elif file.endswith(".xlsx") or file.endswith(".xls"):
                loader = UnstructuredExcelLoader(path)
                self.documents.extend(loader.load())

    def build_vectorstore(self):
        chunker = SemanticChunker(
            self.embeddings,
            breakpoint_threshold_type="percentile",
            breakpoint_threshold_amount=90
        )
        chunks = chunker.split_documents(self.documents)
        self.vectorstore = FAISS.from_documents(chunks, self.embeddings)

    def setup_qa_chain(self):
        retriever = self.vectorstore.as_retriever()
        llm = ChatOllama(model=self.ollama_llm_model)
        self.qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=False)

    def answer_as_html(self, question: str) -> str:
        if not self.qa_chain:
            raise Exception("Call setup_qa_chain() before asking questions.")
        answer = self.qa_chain.run(question)

        # Convert to HTML
        soup = BeautifulSoup("", "html.parser")
        div = soup.new_tag("div")
        p = soup.new_tag("p")
        p.string = answer
        div.append(p)
        soup.append(div)
        return str(soup)

# Example usage
if __name__ == "__main__":
    rag = RAGWithOllamaEmbed(
        data_dir="./data",
        ollama_llm_model="qwen2:7b",
        ollama_embed_model="nomic-embed-text"  # Make sure this is pulled with Ollama
    )
    rag.load_documents()
    rag.build_vectorstore()
    rag.setup_qa_chain()

    question = "List the key financial observations from the reports."
    html = rag.answer_as_html(question)

    with open("response_ollama_embed.html", "w", encoding="utf-8") as f:
        f.write(html)

    print("✅ Response saved to response_ollama_embed.html")
