In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import json
import nltk
import matplotlib.pyplot as plt
from bertopic import BERTopic
from hdbscan import HDBSCAN
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
nltk.download('punkt')

def clean_text(text):
    # Replace \n with a space
    text = re.sub(r'\n', ' ', text)

    # Replace encoded apostrophes and quotes with plain text equivalents
    text = text.replace('\x92', "'")  # Right single quote
    text = text.replace('\x93', '"')  # Left double quote
    text = text.replace('\x94', '"')  # Right double quote
    text = text.replace('\xa0', ' ')  # Non-breaking space

    # Remove other \x.. sequences
    text = re.sub(r'\\x[0-9A-Fa-f]{2}', '', text)
    
    # Fix spacing issues around periods and capitalization
    text = re.sub(r'\s*\.\s*', '. ', text)
    text = re.sub(r'(Item\s*\d+[A-Za-z]*)\.', r'\1. ', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    text = re.sub(r'(?<=[.!?])(?=\S)', r' ', text)
    
    return text.strip()

def split_sentences_with_nltk(text):
    # Initial cleaning
    text = clean_text(text)

    # Use nltk's sentence tokenizer
    sentences = re.split(r'(?<=[.,;])\s*', text)

    # Post-process to remove redundant or empty sentences
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # sentences = [sent_tokenize(abstract) for abstract in text]
    return sentences

[nltk_data] Downloading package punkt to /Users/ajaymdn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def converttotext(url):
    headers = {'User-Agent':'Sample Company Name AdminContact@<sample company domain>.com','Accept-Encoding':'gzip, deflate','Host':'www.sec.gov'}

    r = requests.get(url, headers=headers)

    raw_10k = r.text

    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')

    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

    document = {}
    
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            document[doc_type] = raw_10k[doc_start:doc_end]

    regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|3|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|3|7A|7|8))')

    matches = regex.finditer(document['10-K'])

    test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

    test_df.columns = ['item', 'start', 'end']
    test_df['item'] = test_df.item.str.lower()
    test_df.head()
    test_df.replace('&#160;',' ',regex=True,inplace=True)
    test_df.replace('&nbsp;',' ',regex=True,inplace=True)
    test_df.replace(' ','',regex=True,inplace=True)
    test_df.replace('\.','',regex=True,inplace=True)
    test_df.replace('>','',regex=True,inplace=True)

    pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
    pos_dat.set_index('item', inplace=True)

    item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]
    item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')

    # item_3_raw = document['10-K'][pos_dat['start'].loc['item3']:pos_dat['start'].loc['item7']]
    # item_3_content = BeautifulSoup(item_3_raw, 'lxml')

    # item_7_raw = document['10-K'][pos_dat['start'].loc['item7']:pos_dat['start'].loc['item7a']]
    # item_7_content = BeautifulSoup(item_7_raw, 'lxml')

    # item_7a_raw = document['10-K'][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]
    # item_7a_content = BeautifulSoup(item_7a_raw, 'lxml')

    item_1a_text = item_1a_content.get_text()
    # item_3_text = item_3_content.get_text()
    # item_7_text = item_7_content.get_text()
    # item_7a_text = item_7a_content.get_text()

    sentences = split_sentences_with_nltk(item_1a_text)
    # sentences += split_sentences_with_nltk(item_3_text)
    # sentences += split_sentences_with_nltk(item_7_text)
    # sentences += split_sentences_with_nltk(item_7a_text)
    
    return sentences

In [4]:
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019317000070/0000320193-17-000070.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019318000145/0000320193-18-000145.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019319000119/0000320193-19-000119.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019320000096/0000320193-20-000096.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019321000105/0000320193-21-000105.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/0000320193-22-000108.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/0000320193-23-000106.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/0000320193-24-000123.txt")

target_year = 2019

with open(f"../Edgar/10K_URL_TECH/All_10K_Filings_{target_year}.json", "r") as read_file:
    companies = json.load(read_file)

sentences = []
for company in companies:
    # if company['company'] == "AAPL":
        try:
            print(f"Company: {company['company']}, Sentence count: {len(sentences)}")
            sentences += converttotext(company['url'])

        except Exception as e:
            print(f"Error processing {company['company']}: {e}")

# NVDA
# test1 = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/0000320193-22-000108.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1045810/000104581022000036/0001045810-22-000036.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/789019/000156459022026876/0001564590-22-026876.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1652044/000165204422000019/0001652044-22-000019.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1018724/000101872422000005/0001018724-22-000005.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1326801/000132680122000018/0001326801-22-000018.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1065280/000106528022000036/0001065280-22-000036.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1341439/000156459022023675/0001564590-22-023675.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/723125/000072312522000048/0000723125-22-000048.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1090872/000109087222000026/0001090872-22-000026.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/51143/000155837022001584/0001558370-22-001584.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/47217/000004721722000068/0000047217-22-000068.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1166691/000116669122000009/0001166691-22-000009.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/829224/000082922422000058/0000829224-22-000058.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/64803/000006480322000008/0000064803-22-000008.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1318605/000095017022000796/0000950170-22-000796.txt")


# print(test)
# print(sentences)

Company: AAPL, Sentence count: 0
Company: AMZN, Sentence count: 851
Company: NVDA, Sentence count: 1738
Company: MSFT, Sentence count: 2447
Company: ORCL, Sentence count: 2521
Company: CSCO, Sentence count: 3616
Company: META, Sentence count: 4764
Company: INTC, Sentence count: 6696
Company: GOOGL, Sentence count: 6698
Company: NFLX, Sentence count: 6700
Company: T, Sentence count: 7610
Error processing T: Length mismatch: Expected axis has 0 elements, new values have 3 elements
Company: MU, Sentence count: 7610
Company: A, Sentence count: 9044
Company: IBM, Sentence count: 9906
Company: HPQ, Sentence count: 9908
Company: CMCSA, Sentence count: 11275
Company: INTU, Sentence count: 11894
Company: ADTN, Sentence count: 12958
Company: CPT, Sentence count: 14075
Company: CI, Sentence count: 14590
Company: UI, Sentence count: 14636
Company: ONDS, Sentence count: 16369
Company: BR, Sentence count: 17582
Error processing BR: 'item1a'
Company: ARR, Sentence count: 17582
Company: TSLA, Sentence

In [7]:
import torch
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Assuming 'sentences' is your list of text from 10-K files
# Replace with your actual data if different
# sentences = [your_data_here]

# Embedding model
embedding_model = SentenceTransformer("all-MiniLM-L12-v2").to(device)
embeddings = embedding_model.encode(sentences, show_progress_bar=True)

# UMAP for dimensionality reduction
umap_model = UMAP(
    n_neighbors=15, 
    n_components=15,  # Reduced from 20 to better separate niche topics
    min_dist=0.05, 
    metric='cosine', 
    low_memory=False
)

# HDBSCAN for clustering
hdbscan_model = HDBSCAN(
    min_cluster_size=5,  # Lowered from 10 for niche topics
    min_samples=3,      # Lowered from 5 for less dense clusters
    metric='euclidean', 
    cluster_selection_method='leaf', 
    prediction_data=True
)

# BERTopic model with adjustments
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    min_topic_size=5,      # Lowered from 10 for niche topics
    nr_topics=None
)

# Fit the model
topics, probs = topic_model.fit_transform(sentences, embeddings=embeddings)

Using device: cpu


Batches: 100%|██████████| 881/881 [00:52<00:00, 16.93it/s]


In [None]:
topic_info = topic_model.get_topic_info()
topic_info.head(52)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11898,-1_addition_increase_patent_that,"[addition, increase, patent, that, ability, co...",[If we are unable to obtain the rights necessa...
1,0,49,0_result_as_and_,"[result, as, and, , , , , , , ]","[As a result,, As a result,, As a result,]"
2,1,46,1_either_ones_effect_which,"[either, ones, effect, which, negatively, adve...",[either of which could have an adverse effect ...
3,2,44,2_lindes_linde_impact_evaluates,"[lindes, linde, impact, evaluates, employs, co...",[all of which may negatively impact Linde’s fi...
4,3,41,3_stockholders_activism_controlling_stockholder,"[stockholders, activism, controlling, stockhol...","[stockholders,, our stockholders,, our stockho..."
5,4,41,4_returns_funds_fund_benchmarks,"[returns, funds, fund, benchmarks, performance...",[which increased from $50 billion to $250 bill...
6,5,40,5_regulatory_authorities_inspection_regulators,"[regulatory, authorities, inspection, regulato...","[regulatory,, Regulatory,, regulatory,]"
7,6,40,6_gene_therapy_rna_aav,"[gene, therapy, rna, aav, capsid, vectors, nov...",[• delays or failures in developing gene ther...
8,7,39,7_internationally_international_doing_inherent,"[internationally, international, doing, inhere...",[there are risks inherent in doing business in...
9,8,37,8_mineral_resource_mineralized_reserve,"[mineral, resource, mineralized, reserve, caut...",[See “Cautionary Note to United States Investo...


In [11]:
# Get topic information
topic_info = topic_model.get_topic_info()
print("All Topics:")
print(topic_info.head(52))  # Note: head() returns a value, so print it directly

# Define ESG keywords for filtering
esg_keywords = ["environment", "sustainability", "climate", "emissions", 
                "social", "diversity", "community", "labor", 
                "governance", "ethics", "compliance", "board"]

# Function to identify ESG topics
def is_esg_topic(topic_words):
    # topic_words is already a list of strings, so use it directly
    overlap = set(topic_words).intersection(set(esg_keywords))
    return len(overlap) > 0

# Add ESG classification and filter
topic_info['is_esg'] = topic_info['Representation'].apply(is_esg_topic)
esg_topics = topic_info[topic_info['is_esg']]
print("\nESG-Related Topics:")
print(esg_topics[['Topic', 'Count', 'Representation']])

# Visualize top 20 topics (or adjust as needed)
topic_model.visualize_barchart(top_n_topics=20, title="Top 20 Topics by Document Count")

# Optional: Inspect document-topic assignments
doc_info = topic_model.get_document_info(sentences)  # Assign to a variable to use it
print("\nDocument Info Sample:")
print(doc_info.head())

All Topics:
    Topic  Count                                               Name  \
0      -1  11898                   -1_addition_increase_patent_that   
1       0     49                                   0_result_as_and_   
2       1     46                         1_either_ones_effect_which   
3       2     44                    2_lindes_linde_impact_evaluates   
4       3     41    3_stockholders_activism_controlling_stockholder   
5       4     41                    4_returns_funds_fund_benchmarks   
6       5     40     5_regulatory_authorities_inspection_regulators   
7       6     40                             6_gene_therapy_rna_aav   
8       7     39     7_internationally_international_doing_inherent   
9       8     37             8_mineral_resource_mineralized_reserve   
10      9     36                      9_facebook_mobile_web_stories   
11     10     36          10_fluctuate_quarterly_quarter_fluctuated   
12     11     35            11_employees_custodians_employee_atwi