In [1]:
import os
import re
import PyPDF2
import json

import nltk
# Recent NLTK releases (3.9+) load the sentence tokenizer from the 'punkt_tab'
# resource, while older releases use 'punkt'. Fetch both so that
# nltk.sent_tokenize works with whichever NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')

from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Read the variables defined in the project's .envrc (e.g. OPENAI_API_KEY)
# into the process environment.
load_dotenv('../.envrc')

# OpenAI client shared by the metadata-generation code in this notebook.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/whysocurious/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:

# def read_letters(letters_folder):
#     letters = []
#     for filename in os.listdir(letters_folder):
#         if filename.endswith('.txt'):
#             filepath = os.path.join(letters_folder, filename)
#             try:
#                 # Try opening with utf-8 encoding
#                 with open(filepath, 'r', encoding='utf-8') as file:
#                     content = file.read()
#                     letters.append({
#                         'filename': filename,
#                         'content': content
#                     })
#             except UnicodeDecodeError:
#                 # If utf-8 fails, try with ISO-8859-1 encoding (Latin-1)
#                 with open(filepath, 'r', encoding='ISO-8859-1') as file:
#                     content = file.read()
#                     letters.append({
#                         'filename': filename,
#                         'content': content
#                     })
#     return letters

# letters_folder = '../data/letters'
# letters_data = read_letters(letters_folder)

# def read_reports(reports_folder):
#     reports = []
#     for filename in os.listdir(reports_folder):
#         if filename.endswith('.pdf'):
#             filepath = os.path.join(reports_folder, filename)
#             with open(filepath, 'rb') as file:
#                 reader = PyPDF2.PdfReader(file)
#                 text = ''
#                 for page_num in range(len(reader.pages)):
#                     page = reader.pages[page_num]
#                     text += page.extract_text()
#                 reports.append({
#                     'filename': filename,
#                     'content': text
#                 })
#     return reports

# reports_folder = '../data/reports'
# reports_data = read_reports(reports_folder)


def read_papers(papers_folder):
    """Extract the text and document metadata of every PDF in a folder.

    Args:
        papers_folder: Directory to scan (not recursive). Only entries whose
            name ends with '.pdf' are read.

    Returns:
        A list with one dict per PDF, holding 'filename', 'metadata' (the
        reader's document-information object) and 'content' (the text of all
        pages concatenated in page order).
    """
    papers = []
    for filename in os.listdir(papers_folder):
        if not filename.endswith('.pdf'):
            continue
        filepath = os.path.join(papers_folder, filename)
        with open(filepath, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Metadata belongs to the document, not to a page: read it once
            # here instead of inside the page loop, so a PDF with zero pages
            # gets its own metadata rather than raising NameError or silently
            # reusing the value left over from the previous file.
            metadata = reader.metadata
            # Defensive `or ''`: keeps the join safe should a page yield no text.
            text = ''.join(page.extract_text() or '' for page in reader.pages)
        papers.append({
            'filename': filename,
            'metadata': metadata,
            'content': text
        })
    return papers

# Folder holding the research-paper PDFs, given relative to the working directory.
papers_folder = '../data/papers'
# Parse every PDF in that folder; each entry has 'filename', 'metadata' and 'content'.
papers_data = read_papers(papers_folder)


In [3]:
def preprocess_text(text):
    """Collapse each run of whitespace into a single space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()


def chunk_text(text, max_tokens=700):
    """Group consecutive sentences into chunks of at most `max_tokens` words.

    Despite the parameter name, size is measured in whitespace-separated
    words, not in model tokens.

    Args:
        text: Text to split; sentences are detected with nltk.sent_tokenize.
        max_tokens: Word budget per chunk.

    Returns:
        A list of non-empty chunk strings in document order. A single sentence
        longer than the budget is kept whole as its own (oversized) chunk.
    """
    chunks = []
    current_sentences = []
    current_words = 0  # running total, so the chunk is not re-split per sentence
    for sentence in nltk.sent_tokenize(text):
        sentence_words = len(sentence.split())
        # Flush only when something has been collected: otherwise an oversized
        # first sentence would emit an empty chunk.
        if current_sentences and current_words + sentence_words > max_tokens:
            chunks.append(' '.join(current_sentences).strip())
            current_sentences = []
            current_words = 0
        current_sentences.append(sentence)
        current_words += sentence_words
    if current_sentences:
        chunks.append(' '.join(current_sentences).strip())
    return chunks


In [4]:
# processed_letters = []
# for letter in letters_data:
#     text = preprocess_text(letter['content'])
#     chunks = chunk_text(text)
#     for i, chunk in enumerate(chunks):
#         processed_letters.append({
#             'source': 'letter',
#             'year': letter['filename'].split(" - ")[1][:4],
#             'chunk_id': f"{letter['filename'].split(' - ')[1][:4]}_chunk_{i}",
#             'content': chunk
#         })

# processed_reports = []
# for report in reports_data:
#     text = preprocess_text(report['content'])
#     chunks = chunk_text(text)
#     for i, chunk in enumerate(chunks):
#         processed_reports.append({
#             'source': 'report',
#             'ticker': report['filename'].split("_")[0],
#             'year': report['filename'].split("_")[1][:4],
#             'chunk_id': f"{report['filename'].split('.')[0]}_chunk_{i}",
#             'content': chunk
#         })


# Split every paper into chunks and tag each chunk with an id derived from
# the paper's file name.
processed_papers = []
for paper in papers_data:
    # os.path.splitext removes only the final extension. The previous
    # split('.')[0] cut the name at the FIRST dot, so files such as
    # '2301.00001.pdf' and '2301.00002.pdf' both became '2301' and produced
    # colliding chunk ids.
    paper_id = os.path.splitext(paper['filename'])[0]
    text = preprocess_text(paper['content'])
    chunks = chunk_text(text)
    for i, chunk in enumerate(chunks):
        processed_papers.append({
            'source': 'paper',
            'chunk_id': f"{paper_id}_chunk_{i}",
            'content': chunk
        })


In [5]:
# print (len(processed_letters), len(processed_reports))
# Number of chunks produced from all papers
print(len(processed_papers))

668


In [6]:
# def generate_metadata(chunk_text):
#     prompt = (
#         "Analyze the following text and provide a brief summary, key topics, and sentiment:\n\n"
#         f"{chunk_text}\n\n"
#         "Summary:"
#     )

#     # Create the chat completion
#     response = client.chat.completions.create(
#         model='gpt-4o-mini',  # Specify the model
#         messages=[
#             # The conversation history, starting with the user's prompt
#             {"role": "user", "content": prompt}
#         ],
#         max_tokens=150,        # Limit the response tokens
#         temperature=0.25,      # Control the randomness
#         n=1,                   # Number of responses to generate
#         stop=None              # When to stop generating tokens
#     )

#     # Extract the assistant's reply
#     metadata_text = response.choices[0].message.content.strip()
#     return metadata_text

# for item in tqdm(processed_letters + processed_reports):
#     metadata = generate_metadata(item['content'])
#     item['metadata'] = metadata


In [7]:
# Source-specific instructions that precede the chunk in the prompt. Every
# entry ends with the blank line that separates it from the "Text:" section.
# A source that is not listed here gets the generic prompt instead.
_METADATA_PROMPT_INTROS = {
    'letter': (
        "You are analyzing a chunk of text from a Warren Buffett annual shareholder letter. "
        "These letters often include Buffett's investment philosophies, discussions on company performance, reflections on the economy, and advice to investors. "
        # Trailing space added: this sentence used to run straight into the next one.
        "Please analyze the following text and provide a very brief summary in less than 20 words and a maximum of 3-5 key topics. "
        "output the results as a dictionary in plain text without any code block formatting or escape characters, the dictionary should contain the following keys: 'summary', 'key_topics'.\n\n"
    ),
    'report': (
        "You are analyzing a chunk of text from a company's annual report. "
        "Annual reports typically contain financial statements, management discussions, risk factors, market analysis, and future outlooks. "
        "Please analyze the following text and provide a very brief summary in less than 20 words and a maximum of 3-5 key topics. "
        "output the results as a dictionary in plain text without any code block formatting or escape characters, the dictionary should contain the following keys: 'summary', 'key_topics'.\n\n"
    ),
    'paper': (
        "You are an expert financial analyst and researcher specializing in trading strategies and behavioral finance. "
        "Analyze the following excerpt from a research paper and provide a very brief summary in less than 20 words and a maximum of 3-5 key topics. "
        "Output the results as a dictionary in plain text without any code block formatting or escape characters, the dictionary should contain the following keys: 'summary', 'key_topics'.\n\n"
    ),
}


def _build_metadata_prompt(chunk_text, source):
    """Return the prompt text for one chunk, using the intro registered for `source`."""
    intro = _METADATA_PROMPT_INTROS.get(source)
    if intro is None:
        return (
            "Analyze the following text and provide a brief summary, key topics.\n\n"
            f"Text:\n{chunk_text}\n\n"
            "Please provide a list of Summary and Key topics as response."
        )
    return (
        intro
        + f"Text:\n{chunk_text}\n\n"
        + "Output: summary: <summary>, key_topics: <key_topics>"
    )


def generate_metadata(chunk_text, source):
    """Ask the chat model for a short summary and key topics of one text chunk.

    Args:
        chunk_text: The chunk to analyze.
        source: 'letter', 'report' or 'paper' selects a tailored prompt; any
            other value falls back to a generic prompt.

    Returns:
        The model's reply as a stripped string, or '' when the reply carries
        no text content.
    """
    response = client.chat.completions.create(
        model='gpt-4o-mini',  # Specify the model
        messages=[
            # Single-turn conversation: only the user's prompt
            {"role": "user", "content": _build_metadata_prompt(chunk_text, source)}
        ],
        max_tokens=200,        # Limit the response tokens
        temperature=0.5,       # Control the randomness
        n=1,                   # Number of responses to generate
        stop=None              # When to stop generating tokens
    )
    # message.content is optional in the API response; fall back to '' so that
    # .strip() cannot raise AttributeError on a reply without text.
    metadata_text = (response.choices[0].message.content or '').strip()
    return metadata_text


In [8]:

# Annotate every paper chunk with model-generated metadata.
for item in tqdm(processed_papers):
    # Skip chunks that already carry metadata (e.g. from an earlier or
    # interrupted run), so re-executing this cell does not repeat the API
    # call for them. This matches how the report chunks are handled.
    if 'metadata' not in item:
        metadata = generate_metadata(item['content'], source='paper')
        item['metadata'] = metadata

# Persist the chunks together with their metadata.
output_path = '../data/research_papers.json'
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(processed_papers, json_file, indent=4)

100%|██████████| 668/668 [15:03<00:00,  1.35s/it]


In [9]:
# Display the first paper chunk to eyeball its fields
processed_papers[0]

{'source': 'paper',
 'chunk_id': 'ssrn-4599565_chunk_0',
 'content': 'DETECTING LEAD-LAGRELATIONSHIPS IN STOCK RETURNS AND PORTFOLIO STRATEGIES∗ Álvaro Cartea†‡Mihai Cucuringu∗§†¶Qi Jin∗‡∥ June 9, 2024 Click here for the most recent version ABSTRACT We propose a method to detect linear and nonlinear lead-lag relationships in stock returns. Our approach uses pairwise Lévy-area and cross-correlation of returns to rank the assets from leaders to followers. We use the rankings to construct a portfolio that longs or shorts the followers based on the previous returns of the leaders, and the stocks are ranked every time the portfolio is rebalanced. The portfolio also takes an offsetting position on the SPY ETF so that the initial value of the portfolio is zero. Our data spans from 1963 to 2022, and we use an average of over 500 stocks to construct portfolios for each trading day. The annualized returns of our lead-lag portfolios are over 20 %, and the returns outperform all lead-lag benchmark

In [None]:

# Annotate every letter chunk with model-generated metadata.
# NOTE: `processed_letters` is only built by code that is currently commented
# out earlier in this notebook, so on a fresh kernel this cell raises
# NameError until that code is restored.
for item in tqdm(processed_letters):
    # Skip chunks that already carry metadata (e.g. from an earlier or
    # interrupted run), so re-executing this cell does not repeat the API
    # call for them. This matches how the report chunks are handled.
    if 'metadata' not in item:
        metadata = generate_metadata(item['content'], source='letter')
        item['metadata'] = metadata

# Persist the chunks together with their metadata.
output_path = '../data/buffet_letters.json'
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(processed_letters, json_file, indent=4)

100%|██████████| 693/693 [13:41<00:00,  1.18s/it]


In [17]:
# Second name for the same list object (not a copy): changes made through
# either name are visible through both.
# NOTE(review): `tmp` is not referenced anywhere else in the visible notebook.
tmp = processed_reports

In [16]:
# `item` is whatever the most recently executed `for item in ...` loop left
# behind; this shows which fields that chunk dict currently has.
item.keys()

dict_keys(['source', 'ticker', 'year', 'chunk_id', 'content'])

In [19]:
# Fill in metadata only for report chunks that do not have any yet
for item in tqdm(processed_reports):
    if 'metadata' not in item:
        metadata = generate_metadata(item['content'], source='report')
        item['metadata'] = metadata

# Write all report chunks, including their metadata, to disk
output_path = '../data/annual_reports.json'
with open(output_path, 'w') as json_file:
    json.dump(processed_reports, json_file, indent=4)

100%|██████████| 932/932 [03:56<00:00,  3.94it/s] 


In [32]:
# Matches "summary: ... key_topics: ..." whether the model put both fields on
# one line (as the prompt's output template shows) or on separate lines, with
# or without quotes around the keys, and regardless of letter case.
_METADATA_PATTERN = re.compile(
    r"""summary['"]?\s*:\s*(?P<summary>.*?)[\s,]*['"]?key_topics['"]?\s*:\s*(?P<topics>.*)""",
    re.IGNORECASE | re.DOTALL,
)


def _split_metadata(metadata_text):
    """Return (summary, key_topics) parsed from one raw model reply.

    A reply that does not contain both fields is kept whole as the summary,
    with an empty topic list, instead of raising IndexError.
    """
    match = _METADATA_PATTERN.search(metadata_text)
    if match is None:
        return metadata_text.strip(), []
    summary = match.group('summary').strip(" \t\r\n'\"")
    # Topics may be separated by commas or by line breaks; drop list
    # decoration (quotes, brackets, bullets) around each one.
    topics = [
        topic.strip(" \t\r'\"[]{}-*")
        for topic in re.split(r'[,\n]', match.group('topics'))
    ]
    return summary, [topic for topic in topics if topic]


for item in processed_letters:
    # Items without 'metadata' were already converted by an earlier run of
    # this cell; skipping them makes the cell safe to re-execute.
    if 'metadata' not in item:
        continue
    item['summary'], item['key_topics'] = _split_metadata(item['metadata'])
    del item['metadata']

for item in processed_reports:
    if 'metadata' in item:
        item['summary'], item['key_topics'] = _split_metadata(item['metadata'])
        del item['metadata']

    # Fold the ticker into the source label exactly once; 'ticker' is gone
    # after the first run, so a re-run must not touch 'source' again.
    if 'ticker' in item:
        item['source'] = item['ticker'] + " annual " + item['source']
        del item['ticker']

In [57]:
from elasticsearch import Elasticsearch

# Create an Elasticsearch client instance
# Client for the Elasticsearch instance at http://localhost:9200
es = Elasticsearch(
    hosts=[{'scheme': 'http', 'host': 'localhost', 'port': 9200}],
)

# Index that the documents from this notebook are written to
index_name = 'enhanced_stock_analyzer'

In [58]:
# Field types for the index: identifier-like fields are exact-match keywords,
# free-text fields are analyzed text. Add more fields to either tuple if needed.
index_mapping = {
    'mappings': {
        'properties': {
            **{field: {'type': 'keyword'} for field in ('source', 'year', 'chunk_id')},
            **{field: {'type': 'text'} for field in ('content', 'summary', 'key_topics')},
        }
    }
}

# Create the index, with the mapping in `index_mapping`, only if it does not exist yet.
# NOTE: when the index already exists this cell does nothing, so later edits
# to `index_mapping` are not applied to it.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_mapping)


  if not es.indices.exists(index=index_name):


In [59]:
from elasticsearch.helpers import bulk

def generate_actions(data):
    """Yield one bulk-index action per item, using the item's chunk_id as document id.

    Every action targets the module-level `index_name` and copies the fields
    source, year, chunk_id, content, summary and key_topics into `_source`.
    """
    source_fields = ('source', 'year', 'chunk_id', 'content', 'summary', 'key_topics')
    for record in data:
        yield {
            '_index': index_name,
            '_id': record['chunk_id'],
            '_source': {field: record[field] for field in source_fields},
        }

# Letters and reports go into the same index: build one combined list
all_data = [*processed_letters, *processed_reports]

# Index everything; the call's return value is shown as the cell output
bulk(es, generate_actions(all_data))

  bulk(es, generate_actions(all_data))


(1625, [])

In [53]:
# Display one letter chunk to eyeball its fields
processed_letters[10]

{'source': 'letter',
 'year': '2018',
 'chunk_id': '2018_chunk_1',
 'content': 'Fornearly three decades, the initial paragraph featured the percentage change in Berkshire’s per-share book value. It’snow time to abandon that practice.The fact is that the annual change in Berkshire’s book value – which makes its farewell appearance on page2 – is a metric that has lost the relevance it once had. Three circumstances have made that so. First, Berkshire hasgradually morphed from a company whose assets are concentrated in marketable stocks into one whose major valueresides in operating businesses. Charlie and I expect that reshaping to continue in an irregular manner. Second, whileourequity holdingsare valued at market prices, accounting rules require our collection of operating companiesto be included in book value at an amount far below their current value, a mismark that has grown in recent years. Third, itis likely that – over time – Berkshire will be a significant repurchaser of its shar

In [54]:
# Display one report chunk to eyeball its fields
processed_reports[10]

{'source': 'AMARAJABAT annual report',
 'year': '2022',
 'chunk_id': 'AMARAJABAT_2022_chunk_10',
 'content': 'Some of them were: l Connecting conveyors from formation to finishing sections to eliminate movement in the process (ABD Unit 1) l COS unloading automation by ROBOl Launched a Real-time battery traceability system (ABD-I) l Introduced a Paperless system for elimination of Log sheets (ABD-II) l Invested in Battery Readings automation in Formation section (MVRLA) l Commissioned an auto finishing line to improve productivity & reduce fatigue (LVRLA) l Started an auto scheduling system from barcode (ABD-II) l Initiated Multi-layer formation (SBD-1) Improving Productivity The Company carried on with its dedicated and focused energy conservation efforts through upgrading of process technology, effective production scheduling and various energy-saving initiatives including installation of energy efficient equipment. Few initiatives are: l Implemented superior energy- saving practices 

In [60]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorize the chunk contents
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all chunks with a single call instead of one model.encode() per
# chunk: given a list, the library batches the texts internally and returns
# one embedding row per input, in input order.
# NOTE(review): this model truncates long inputs (its max sequence length is
# reportedly 256 word pieces), so chunks of up to 700 words may be only
# partially embedded. Confirm whether a smaller chunk size is needed.
embeddings = model.encode([item['content'] for item in all_data])
for item, embedding in zip(all_data, embeddings):
    item['embedding'] = embedding

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [92]:
import psycopg2
import numpy as np

# Connect to PostgreSQL
# Connection settings come from the standard libpq environment variables
# (they can live in the same .envrc this notebook already loads) instead of
# being hard-coded; the password in particular must not be committed with the
# notebook. psycopg2 drops parameters that are None, so an unset PGPASSWORD
# simply means no password is supplied from here.
conn = psycopg2.connect(
    dbname=os.getenv('PGDATABASE', 'buffet_wisdom_v1'),
    user=os.getenv('PGUSER', 'whysocurious'),
    password=os.getenv('PGPASSWORD'),
    host=os.getenv('PGHOST', 'localhost'),
    port=os.getenv('PGPORT', '5432'),
)

try:
    # `with conn` runs the block as ONE transaction: it commits if everything
    # succeeds and rolls back on any error, so a failed insert no longer
    # leaves the table dropped and empty. The cursor is closed on exit.
    with conn, conn.cursor() as cursor:
        # The table is rebuilt from scratch on every run.
        cursor.execute("DROP TABLE IF EXISTS embeddings")
        # Create table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS embeddings (
                chunk_id TEXT PRIMARY KEY,
                source TEXT,
                year TEXT,
                summary TEXT,
                keytopics TEXT[],
                embedding FLOAT[]
            );
        """)

        # Insert embeddings
        # NOTE: ON CONFLICT DO NOTHING silently skips chunks whose chunk_id
        # was already inserted earlier in this same run.
        rows = [
            (
                item['chunk_id'],
                item['source'],
                item['year'],
                item['summary'],
                item['key_topics'],
                item['embedding'].tolist(),  # Convert numpy array to list
            )
            for item in all_data
        ]
        cursor.executemany("""
            INSERT INTO embeddings (chunk_id, source, year, summary, keytopics, embedding)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON CONFLICT (chunk_id) DO NOTHING;
        """, rows)
finally:
    # Always release the connection, even when a statement above failed.
    conn.close()
