In [1]:
import os
import re
import PyPDF2
import json

import nltk
nltk.download('punkt')

from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Load environment variables from the .envrc file (the path is relative to the
# directory the notebook is run from).
load_dotenv('../.envrc')

# OpenAI client used by call_llm. The API key is read from the environment
# rather than written into the notebook; os.getenv returns None if it is unset.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/whysocurious/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from elasticsearch import Elasticsearch

# Create an Elasticsearch client instance pointing at localhost:9200 over plain HTTP
es = Elasticsearch(
    [{'scheme': 'http', 'host': 'localhost', 'port': 9200}]
)

# Name of the index that the following cells create, populate and query
index_name = 'enhanced_stock_analyzer'

In [3]:
# Mapping for one indexed chunk. Identifier-like fields are stored as `keyword`,
# free-text fields as `text`. Add more fields to either group if needed.
index_mapping = {
    'mappings': {
        'properties': {
            **{field: {'type': 'keyword'} for field in ('source', 'year', 'chunk_id')},
            **{field: {'type': 'text'} for field in ('content', 'summary', 'key_topics')},
        }
    }
}

# Create the index only when it is missing, so the cell can be re-run safely
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_mapping)


  if not es.indices.exists(index=index_name):


In [4]:
def _split_metadata(item):
    """Replace item['metadata'] with parsed 'summary' and 'key_topics' entries.

    The metadata string is expected to hold two lines, in this order:
        summary: <free text>
        key_topics: <comma-separated topics>
    The fixed-length prefixes are sliced off, the topics are split on commas,
    and the original 'metadata' key is removed. The item is modified in place.
    """
    metdt = item['metadata'].split('\n')
    item['summary'] = metdt[0][len("summary:"):].strip()
    item['key_topics'] = [
        topic.strip()
        for topic in metdt[1][len("key_topics:"):].strip().split(',')
    ]
    del item['metadata']


# Read the JSON files explicitly as UTF-8: without `encoding`, open() uses the
# platform's locale encoding, and the indexed text contains non-ASCII
# punctuation such as curly quotes.
letters_path = '../data/buffet_letters.json'
with open(letters_path, 'r', encoding='utf-8') as json_file:
    processed_letters = json.load(json_file)

report_path = '../data/annual_reports.json'
with open(report_path, 'r', encoding='utf-8') as json_file:
    processed_reports = json.load(json_file)


for item in processed_letters:
    _split_metadata(item)

for item in processed_reports:
    _split_metadata(item)

    # Fold the ticker into the source label, then drop the separate field
    item['source'] = item['ticker'] + " annual " + item['source']
    del item['ticker']


# Single corpus that gets indexed; the bare len() is shown as the cell output
all_data = processed_letters + processed_reports
len(all_data)

1625

In [5]:
from elasticsearch.helpers import bulk

def generate_actions(data):
    """Lazily produce one bulk-indexing action per chunk in `data`.

    Every action targets `index_name`, uses the chunk's 'chunk_id' as the
    document `_id`, and copies the six indexed fields into `_source`.
    Keys of the chunk other than those six are not included.
    """
    indexed_fields = ('source', 'year', 'chunk_id', 'content', 'summary', 'key_topics')
    for chunk in data:
        yield {
            '_index': index_name,
            '_id': chunk['chunk_id'],
            '_source': {field: chunk[field] for field in indexed_fields},
        }


# Bulk index the data. Each action carries the chunk_id as `_id` (see
# generate_actions). The returned tuple is shown as the cell output:
# (number of successfully executed actions, list of errors).
bulk(es, generate_actions(all_data))

  bulk(es, generate_actions(all_data))


(1625, [])

In [6]:
# Display the mapping dict built earlier in this notebook (the local Python
# object, not a mapping fetched back from Elasticsearch).
index_mapping                   

{'mappings': {'properties': {'source': {'type': 'keyword'},
   'year': {'type': 'keyword'},
   'chunk_id': {'type': 'keyword'},
   'content': {'type': 'text'},
   'summary': {'type': 'text'},
   'key_topics': {'type': 'text'}}}}

In [7]:

def search_documents(query, index=index_name, top_k=5):
    """
    Keyword retrieval of the chunks that best match a query.

    Runs a fuzzy `multi_match` query across the content, source, year, summary
    and key_topics fields and returns the hits in the order Elasticsearch
    returned them.

    Parameters:
    - query (str): The user's question or query.
    - index (str): The name of the Elasticsearch index to search.
    - top_k (int): Maximum number of hits to request.

    Returns:
    - List of dictionaries, one per hit, with the keys 'source', 'year',
      'chunk_id', 'content', 'summary' and 'key_topics'. 'year', 'summary'
      and 'key_topics' default to '' when the stored document lacks them.
    """
    searched_fields = ['content', 'source', 'year', 'summary', 'key_topics']
    returned_fields = ['source', 'year', 'chunk_id', 'content', 'summary', 'key_topics']

    match_clause = {
        'query': query,
        'fields': searched_fields,
        'fuzziness': 'AUTO',
    }
    search_body = {
        'size': top_k,
        'query': {'multi_match': match_clause},
        '_source': returned_fields,
    }

    response = es.search(index=index, body=search_body)

    # Flatten each hit to the stored document fields the rest of the notebook uses
    return [
        {
            'source': doc['source'],
            'year': doc.get('year', ''),
            'chunk_id': doc['chunk_id'],
            'content': doc['content'],
            'summary': doc.get('summary', ''),
            'key_topics': doc.get('key_topics', ''),
        }
        for doc in (hit['_source'] for hit in response['hits']['hits'])
    ]


In [8]:
# Sample question reused by the demo cells that follow; the trailing comment
# keeps an alternative question for manual experiments.
query = "Explain Warren Buffett's investment strategy focusing on companies with strong moats." #"What is Warren Buffett's future outlook in terms of investing as per the letters after year 2020 ?"

# Retrieve up to 10 matching chunks; the bare `res` shows them as the cell output
res = search_documents(query, index=index_name, top_k=10)
res

  response = es.search(index=index, body=search_body)


[{'source': 'MOTHERSON annual report',
  'year': '2023',
  'chunk_id': 'MOTHERSON_2023_chunk_8',
  'content': 'Three pillars of growth Customer trust is reflected in the highest-ever revenues that Motherson recorded in FY23 and its strong order book of approximately USD 70 billion. disclosed in March 2023, which provides visibility for the next 5-6 years. “When customers trust us, they give us more opportunities,” Sehgal says. “And we create these new solutions in three ways. Either we do it organically by ourselves, through a partnership or, if that is not possible, through an acquisition.” Organic growth comes in the form of more orders from OEMs across new car models and variants. It also means customers asking Motherson to enter new product segments. As per projected forecasts, automotive production growth in emerging countries is to be double that of developed markets. Emerging markets account for more than 50% of the group’s total revenues in work hard to ensure the companies are

In [9]:
def build_prompt(query, search_results):
    """
    Builds a prompt for the LLM using the query and search results.

    The prompt consists of a fixed instruction block, one "Source/Content"
    entry per retrieved document (in the order given), and the question.

    Parameters:
    - query (str): The user's question or query.
    - search_results (list): List of retrieved documents. Each document must
      provide 'source' and 'content'; 'year' is optional and rendered as an
      empty string when missing. Other keys are ignored.

    Returns:
    - The formatted prompt string.
    """
    # Instruction to the LLM. Every sentence ends with a space so the implicit
    # string concatenation does not glue two sentences together.
    instruction = (
        "You are a financial analyst assistant with deep knowledge of Warren Buffett's investment principles. "
        "Using the provided context, answer the user's question. "
        "Use only the facts from the context when answering the question. "
        "If the context is insufficient, let the user know. "
        "Provide clear, concise explanations, and include relevant insights from Warren Buffett's letters or the company's annual reports.\n\n"
    )

    # Build context from search results; only source, year and content are
    # rendered, so documents without summary/key_topics are accepted too.
    context = "".join(
        f"Source: {result['source'].capitalize()} ({result.get('year', '')})\n"
        f"Content: {result['content']}\n\n"
        for result in search_results
    )

    # Assemble the prompt
    prompt = f"{instruction}Context:\n{context}\nQuestion: {query}\nAnswer:"
    return prompt


In [10]:
# Show the full prompt text built from the sample query and its search results
print (build_prompt(query, res))

You are a financial analyst assistant with deep knowledge of Warren Buffett's investment principles. Using the provided context, answer the user's question. Use only the facts from the context when answering the question.If the context is insufficient, let the user know. Provide clear, concise explanations, and include relevant insights from Warren Buffett's letters or the company's annual reports.

Context:
Source: Motherson annual report (2023)
Content: Three pillars of growth Customer trust is reflected in the highest-ever revenues that Motherson recorded in FY23 and its strong order book of approximately USD 70 billion. disclosed in March 2023, which provides visibility for the next 5-6 years. “When customers trust us, they give us more opportunities,” Sehgal says. “And we create these new solutions in three ways. Either we do it organically by ourselves, through a partnership or, if that is not possible, through an acquisition.” Organic growth comes in the form of more orders from

In [11]:
def call_llm(prompt, model='gpt-4o'):
    """
    Send a single-turn prompt to the chat completions API and return the reply.

    Parameters:
    - prompt (str): Full prompt text, sent as one user message.
    - model (str): Name of the chat model to use.

    Returns:
    - str: The first choice's message text with surrounding whitespace
      stripped, or an empty string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        max_tokens=1500,
        temperature=0.5
    )
    # `message.content` is nullable in the API response; calling .strip() on
    # None would raise AttributeError, so fall back to an empty string.
    content = response.choices[0].message.content
    answer = (content or "").strip()
    return answer

# Example prompt: built from the sample query and the chunks retrieved for it
prompt = build_prompt(query, res) #"Explain Warren Buffett's investment strategy focusing on companies with strong moats."

# Get the response from the default model and print the answer text
response = call_llm(prompt)
print(response)


Warren Buffett's investment strategy emphasizes investing in companies with strong and enduring competitive advantages, often referred to as "moats." Here are the key aspects of this strategy:

1. **Understanding the Business**: Buffett looks for businesses that are simple and easy to understand. This allows for better prediction of future performance and risks. For example, he has invested in companies like Coca-Cola and See's Candy, whose business models are straightforward and have remained relatively unchanged over the years.

2. **Favorable Long-Term Economics**: The company should have favorable long-term economic prospects. This includes the ability to generate consistent and high returns on capital employed (ROCE). For instance, See's Candy, despite operating in an unexciting industry with low growth, has produced extraordinary results due to its durable competitive advantage.

3. **Strong Moat**: A truly great business must have an enduring moat that protects its high returns 

In [12]:
def rag_pipeline(query, use_semantic_search=False):
    """
    Answers a query end to end: retrieve context, build the prompt, ask the LLM.

    Parameters:
    - query (str): The user's question or query.
    - use_semantic_search (bool): When True, documents are retrieved with
      `semantic_search`; otherwise with `search_documents`.
      NOTE(review): `semantic_search` is not defined in the cells visible
      here - confirm it exists before enabling this flag.

    Returns:
    - The LLM's answer, or a message describing the error if the LLM call raised.
    """
    # Retrieval: only the selected retriever's name is looked up
    retrieve = semantic_search if use_semantic_search else search_documents
    prompt = build_prompt(query, retrieve(query))

    # Generation: a failure is reported in the returned text instead of raised
    try:
        return call_llm(prompt)
    except Exception as e:
        return f"An error occurred while generating the answer: {e}"


# Run the whole pipeline on the sample query using the default (non-semantic) retrieval
print (rag_pipeline(query))

  response = es.search(index=index, body=search_body)


Warren Buffett's investment strategy emphasizes the importance of investing in companies with strong, enduring competitive advantages, often referred to as "moats." Here are the key elements of his strategy:

1. **Understandable Business**: Buffett and his partner Charlie Munger prefer to invest in businesses they understand well. This means they avoid industries prone to rapid and continuous change, as these can preclude investment certainty.

2. **Favorable Long-term Economics**: They look for businesses with favorable long-term economic prospects. This includes companies that can sustain high returns on invested capital over time.

3. **Able and Trustworthy Management**: Buffett values competent and honest management. However, he prefers businesses that do not rely solely on exceptional managers for their success, as the business should be able to thrive even if the current management team changes.

4. **Sensible Price**: The price paid for the business must make sense in relation t

In [16]:
def simulate_queries(questions_data):
    """Run every evaluation question through retrieval and record the outcome.

    `questions_data` is a list of items, each with a 'chunk_id' and a list of
    'questions' written for that chunk. Each question is passed to
    `search_documents` with its default settings.

    Returns a list with one dict per question:
    - 'question': the question text
    - 'true_chunk_id': the chunk the question belongs to
    - 'retrieved_chunk_ids': chunk ids of the retrieved documents, in ranked order
    """
    return [
        {
            'question': question,
            'true_chunk_id': item['chunk_id'],
            'retrieved_chunk_ids': [hit['chunk_id'] for hit in search_documents(question)],
        }
        for item in tqdm(questions_data)
        for question in item['questions']
    ]

def compute_hit_rate(query_results):
    """Fraction of queries whose true chunk appears among the retrieved chunks.

    Parameters:
    - query_results (list): Dicts with 'true_chunk_id' and 'retrieved_chunk_ids',
      as produced by simulate_queries.

    Returns:
    - float: hits / total, or 0.0 when `query_results` is empty (instead of
      raising ZeroDivisionError).
    """
    total = len(query_results)
    if total == 0:
        return 0.0
    # A membership test per result is cheap, so no progress bar is shown here
    hits = sum(
        1
        for result in query_results
        if result['true_chunk_id'] in result['retrieved_chunk_ids']
    )
    return hits / total

def compute_mrr(query_results):
    """Mean reciprocal rank of the true chunk over all queries.

    A query contributes 1 / rank, where rank is the 1-based position of its
    'true_chunk_id' in 'retrieved_chunk_ids', or 0 when the true chunk was not
    retrieved.

    Parameters:
    - query_results (list): Dicts with 'true_chunk_id' and 'retrieved_chunk_ids',
      as produced by simulate_queries.

    Returns:
    - float: sum of reciprocal ranks / total, or 0.0 when `query_results` is
      empty (instead of raising ZeroDivisionError).
    """
    total = len(query_results)
    if total == 0:
        return 0.0
    rr_sum = 0
    # A list lookup per result is cheap, so no progress bar is shown here
    for result in query_results:
        try:
            rank = result['retrieved_chunk_ids'].index(result['true_chunk_id']) + 1
        except ValueError:
            continue  # True chunk not in retrieved results
        rr_sum += 1 / rank
    return rr_sum / total




# Evaluation set: items with a 'chunk_id' and the 'questions' written for it.
# Read explicitly as UTF-8 so the load does not depend on the platform's
# locale encoding.
questions_path = '../data/questions_data.json'
with open(questions_path, 'r', encoding='utf-8') as json_file:
    questions_data = json.load(json_file)


# Runs one search per question, so this is the slow step of the cell;
# the two printed numbers are hit rate and MRR, in that order.
query_results = simulate_queries(questions_data)
print (compute_hit_rate(query_results), compute_mrr(query_results))


  response = es.search(index=index, body=search_body)
100%|██████████| 1625/1625 [09:17<00:00,  2.92it/s]

0.512 0.32761025641025626



