### Config

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

os.environ['GROQ_API_KEY'] = os.getenv('GROQ_API_KEY')
os.environ['LANGCHAIN_API_KEY'] = os.getenv('LANGCHAIN_API_KEY')
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ['LANGCHAIN_PROJECT'] = "BuffetBot"
os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
FINANCIAL_MODELING_API_KEY = os.getenv("FINANCIAL_MODELING_PREP_API_KEY")

### Knowledge Base (Don't touch)

In [2]:
import os
import numpy as np
from langchain.embeddings import OllamaEmbeddings 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS



pdf_files = ["charlie_munger.pdf", "intelligent_investor.pdf", "buffet_essays.pdf"]

documents = []

for pdf_file in pdf_files:
    loader = PyPDFLoader(pdf_file)
    documents.extend(loader.load())


NameError: name 'PyPDFLoader' is not defined

In [3]:
from uuid import uuid4

index = faiss.IndexFlatL2(len(OpenAIEmbeddings().embed_query("hello world")))

vector_store = FAISS.load_local(
    "knowledge_base", embeddings, allow_dangerous_deserialization=True
)


uuids = [str(uuid4()) for _ in range(len(documents))]
vector_store.add_documents(documents=documents, id=uuids)


NameError: name 'FAISS' is not defined

In [59]:
vector_store.save_local("knowledge_base")

### Playground

In [1]:
import os
import numpy as np
from langchain.embeddings import OllamaEmbeddings 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [2]:
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY') ### PLACE YOUR OPENAI API KEY HERE
embeddings = OpenAIEmbeddings()

In [3]:
embeddings = OpenAIEmbeddings()

vector_store = FAISS.load_local(
    "knowledge_base", embeddings, allow_dangerous_deserialization=True
)
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 5})


#### Universal Prompt

In [36]:
from langchain.prompts import ChatPromptTemplate
#------------------------------
# Prompt for Generating Answer from HyDE. Keep {context} and {question} to not break it.
template = """Answer the following question based on this context:

{context}

Question: {question}
"""
#------------------------------

universal_prompt = ChatPromptTemplate.from_template(template)

### Naive RAG

In [37]:

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from langchain_core.runnables import RunnablePassthrough


llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Chain
naive_rag_chain = (
    {"context": retriever | format_docs, 
     "question": RunnablePassthrough()} 
    | universal_prompt
    | {"prompt": lambda x: x.to_string(), "response": llm}
    | (lambda x: (x["prompt"], x["response"].content))
)

#### RAG Fusion

In [38]:
from langchain.prompts import ChatPromptTemplate

#------------------------------
# RAG-Fusion Question Generating Prompt
template = """You are a financial assistant that takes questions for Warren Buffet and generates multiple queries, so that Warren understands them properly \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
#------------------------------

prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [39]:

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_rag_fusion 
    | ChatOpenAI(temperature=0)
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [40]:
generate_queries.invoke("how to invest like you")

['1. What investment strategies does Warren Buffet recommend for long-term success?',
 '2. How does Warren Buffet evaluate potential investment opportunities?',
 '3. What are the key principles that Warren Buffet follows when making investment decisions?',
 "4. How can individual investors apply Warren Buffet's value investing approach to their own portfolios?"]

In [8]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60, n=5):

    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
            doc_str = dumps(doc)
            # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]
            # Update the score of the document using the RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of tuples, each containing the document and its fused score
    return reranked_results[:5]

retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion

In [52]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    if isinstance(docs, tuple) and len(docs) == 2:
        # Assuming the second element of the tuple contains the documents
        docs = docs[1]
    
    if isinstance(docs, list):
        formatted_docs = []
        for item in docs:
            if isinstance(item, tuple) and len(item) == 2 and hasattr(item[0], 'page_content'):
                # This is the case for RAG Fusion output
                formatted_docs.append(item[0].page_content)
            elif hasattr(item, 'page_content'):
                # This is the case for regular Document objects
                formatted_docs.append(item.page_content)
            else:
                # Fallback for any other type of item
                formatted_docs.append(str(item))
        return "\n\n".join(formatted_docs)
    else:
        return str(docs)

rag_fusion_chain = (
    {"context": retrieval_chain_rag_fusion | format_docs, 
     "question": RunnablePassthrough()} 
    | universal_prompt
    | {"prompt": lambda x: x.to_string(), "response": llm}
    | (lambda x: (x["prompt"], x["response"].content))
)

#### HyDE

In [10]:
from langchain.prompts import ChatPromptTemplate

#------------------------------
# Prompt for generating HyDE paragraph
template = """You are Warren Buffet. Answer this question with a passage using your principles: {question}
Passage:"""
#------------------------------


prompt_hyde = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_docs_for_retrieval = (
    prompt_hyde | ChatOpenAI(temperature=0) | StrOutputParser() 
)

# Run
question = "how to invest like you?"
generate_docs_for_retrieval.invoke({"question":question})

'Investing like Warren Buffet requires a disciplined approach and a long-term perspective. One of my key principles is to invest in companies with strong competitive advantages and a proven track record of success. I believe in focusing on the fundamentals of a business, rather than trying to time the market or chase short-term gains.\n\nAnother important aspect of investing like me is to do thorough research and analysis before making any investment decisions. I always look for companies with a solid management team, a sustainable business model, and a history of generating consistent profits.\n\nI also believe in the power of patience and staying true to your investment thesis, even when the market is volatile or uncertain. By taking a long-term view and staying focused on the fundamentals, I have been able to achieve significant success in the world of investing.\n\nIn conclusion, to invest like Warren Buffet, it is important to focus on quality companies, do your homework, and have

In [57]:
# Retrieve
retrieval_chain = generate_docs_for_retrieval | retriever 
retireved_docs = retrieval_chain.invoke({"question":question})
retireved_docs

def format_docs(docs):
    if isinstance(docs, tuple) and len(docs) == 2:
        # Assuming the second element of the tuple contains the documents
        docs = docs[1]
    
    if isinstance(docs, list):
        return "\n\n".join(doc.page_content if hasattr(doc, 'page_content') else str(doc) for doc in docs)
    else:
        return str(docs)

hyde_rag_chain = (
    {"context": retrieval_chain | format_docs, 
     "question": RunnablePassthrough()} 
    | universal_prompt
    | {"prompt": lambda x: x.to_string(), "response": llm}
    | (lambda x: (x["prompt"], x["response"].content))
)

### Testing

In [43]:
question = "how do I invest like you?"

In [48]:
naive_rag_chain.invoke(question)

('Human: Answer the following question based on this context:\n\nInstead, recognize that investing intelligently is about controlling\nthe controllable. You can’t control whether the stocks or funds you buy\nwill outperform the market today, next week, this month, or this year; inthe short run, your returns will always be hostage to Mr. Market andhis whims. But you cancontrol:\n•your brokerage costs, by trading rarely, patiently, and cheaply\n•your ownership costs, by refusing to buy mutual funds with\nexcessive annual expenses\n•your expectations, by using realism, not fantasy, to forecast your\nreturns\n7\n•your risk, by deciding how much of your total assets to put at\nhazard in the stock market, by diversifying, and by rebalancing\n•your tax bills, by holding stocks for at least one year and, when-\never possible, for at least five years, to lower your capital-gains lia-\nbility\n• and, most of all, your own behavior.\nIf you listen to financial TV, or read most market columnists, 

In [53]:
rag_fusion_chain.invoke(question)

('Human: Answer the following question based on this context:\n\n(Document(metadata={\'source\': \'warren_buffet_portfolio.pdf\', \'page\': 234}, page_content=\'5. Ibid.\\n6. Ibid.\\n7. Buffett, "Superinvestors."\\n8. Ibid.\\n9. Ibid.\\n  \'), 0.06504494976203068)\n\n(Document(metadata={\'source\': \'warren_buffet_portfolio.pdf\', \'page\': 39}, page_content="Page 31\\nIs It a Good Investment?\\nTo ascertain the probability of achieving a return on your initial \\nstake, Buffett encourages you to keep four primary factors clearly in mind:\\n1. The certainty with which the long-term economic \\ncharacteristics of the business can be evaluated.\\n2. The certainty with which management can be evaluated, both \\nas to its ability to realize the full potential of the business and to wisely employ its cash flows.\\n3. The certainty with which management can be counted on to \\nchannel the rewards from the business to the shareholders rather \\nthan to itself.\\n4. The purchase price of the b

In [54]:
a =hyde_rag_chain.invoke(question)

In [55]:
a[0]



# Your Test Code here

In [56]:
import time
import pandas as pd

# Define all your questions
questions = [
    # Novice
    "As a beginner, should I invest in stocks or bonds? Why?",
    #"What's the first step I should take to start investing?",
    #"How much money do I need to start investing?",
    #"What's the difference between saving and investing?",
    #"Can you explain what a mutual fund is and why it might be good for beginners?",
    # Intermediate
    "How do I evaluate if a company's stock is overvalued or undervalued?",
    #"What's your opinion on diversification versus concentrated investing?",
    #"How important is a company's management when considering an investment?",
    #"Can you explain the concept of 'margin of safety' and why it's important?",
    #"What are some key financial ratios I should look at when analyzing a company?",
    # Advanced
    #"How do you approach intrinsic value calculation for a company with inconsistent earnings?",
    #"What's your view on using derivatives for hedging in a long-term focused portfolio?",
    #"How would you evaluate the competitive advantage of a technology company in a rapidly changing industry?",
    #"Can you discuss the implications of current macroeconomic trends on value investing strategies?",
    #"What's your approach to position sizing in a portfolio, especially for high-conviction investments?",
    # Breadth of Topic
    #"How do you compare investing in real estate versus stocks?",
    #"What role should bonds play in an investment portfolio?",
    # Real-world Scenarios
    #"The stock market has just dropped 20% in a month. What should an investor do?",
    #"You've just inherited $100,000. How would you advise investing it?",
    # Buffett-Specific Knowledge
    #"Can you explain your '20-slot punch card' approach to investing?",
    #"How do you apply the concept of 'economic moat' when evaluating a company?",
    # Open-ended vs. Specific Questions
    #"What do you think are the most important qualities of a successful investor?",
    #"What is the price-to-earnings (P/E) ratio, and how do you use it to evaluate a stock?",
    # Time-sensitive vs. Timeless Questions
    #"How might the current trend towards renewable energy affect traditional oil and gas investments in the next 5-10 years?",
    #"Why is it important for an investor to have a long-term perspective, and how can one cultivate this mindset?"
]

def run_method(method, questions):
    results = []
    for question in questions:
        full_prompt = question
        start_time = time.time()
        prompt, response = method.invoke(full_prompt)  # Updated to use dictionary input
        end_time = time.time()
        response_time = end_time - start_time
        results.append({
            'prompt': prompt,
            'response': response,
            'response_time': response_time
        })
    return results

# Run each method
naive_rag_results = run_method(naive_rag_chain, questions)
rag_fusion_results = run_method(rag_fusion_chain, questions)
hyde_results = run_method(hyde_rag_chain, questions)

# Prepare data for CSV
data = []
for q, nr, rf, h in zip(questions, naive_rag_results, rag_fusion_results, hyde_results):
    data.append({
        'Question': q,
        'Naive RAG Prompt': nr['prompt'],
        'Naive RAG Response': nr['response'],
        'Naive RAG Response Time': nr['response_time'],
        'RAG Fusion Prompt': rf['prompt'],
        'RAG Fusion Response': rf['response'],
        'RAG Fusion Response Time': rf['response_time'],
        'HyDE Prompt': h['prompt'],
        'HyDE Response': h['response'],
        'HyDE Response Time': h['response_time']
    })

# Save to CSV
df = pd.DataFrame(data)
df.to_csv('detailed_method_responses.csv', index=False)

# Print summary
print("Average response times:")
print(f"Naive RAG: {sum(r['response_time'] for r in naive_rag_results) / len(naive_rag_results):.2f} seconds")
print(f"RAG Fusion: {sum(r['response_time'] for r in rag_fusion_results) / len(rag_fusion_results):.2f} seconds")
print(f"HyDE: {sum(r['response_time'] for r in hyde_results) / len(hyde_results):.2f} seconds")


Average response times:
Naive RAG: 17.11 seconds
RAG Fusion: 13.03 seconds
HyDE: 10.52 seconds
