In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import json
import nltk
import matplotlib.pyplot as plt
from bertopic import BERTopic
from hdbscan import HDBSCAN
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
nltk.download('punkt')

def clean_text(text):
    # Replace \n with a space
    text = re.sub(r'\n', ' ', text)

    # Replace encoded apostrophes and quotes with plain text equivalents
    text = text.replace('\x92', "'")  # Right single quote
    text = text.replace('\x93', '"')  # Left double quote
    text = text.replace('\x94', '"')  # Right double quote
    text = text.replace('\xa0', ' ')  # Non-breaking space

    # Remove other \x.. sequences
    text = re.sub(r'\\x[0-9A-Fa-f]{2}', '', text)
    
    # Fix spacing issues around periods and capitalization
    text = re.sub(r'\s*\.\s*', '. ', text)
    text = re.sub(r'(Item\s*\d+[A-Za-z]*)\.', r'\1. ', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    text = re.sub(r'(?<=[.!?])(?=\S)', r' ', text)
    
    return text.strip()

def split_sentences_with_nltk(text):
    # Initial cleaning
    text = clean_text(text)

    # Use nltk's sentence tokenizer
    sentences = re.split(r'(?<=[.,;])\s*', text)

    # Post-process to remove redundant or empty sentences
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # sentences = [sent_tokenize(abstract) for abstract in text]
    return sentences

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ajaym\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def converttotext(url):
    headers = {'User-Agent':'Sample Company Name AdminContact@<sample company domain>.com','Accept-Encoding':'gzip, deflate','Host':'www.sec.gov'}

    r = requests.get(url, headers=headers)

    raw_10k = r.text

    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')

    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

    document = {}
    
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            document[doc_type] = raw_10k[doc_start:doc_end]

    regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|3|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|3|7A|7|8))')

    matches = regex.finditer(document['10-K'])

    test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

    test_df.columns = ['item', 'start', 'end']
    test_df['item'] = test_df.item.str.lower()
    test_df.head()
    test_df.replace('&#160;',' ',regex=True,inplace=True)
    test_df.replace('&nbsp;',' ',regex=True,inplace=True)
    test_df.replace(' ','',regex=True,inplace=True)
    test_df.replace('\.','',regex=True,inplace=True)
    test_df.replace('>','',regex=True,inplace=True)

    pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
    pos_dat.set_index('item', inplace=True)

    item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]
    item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')

    # item_3_raw = document['10-K'][pos_dat['start'].loc['item3']:pos_dat['start'].loc['item7']]
    # item_3_content = BeautifulSoup(item_3_raw, 'lxml')

    # item_7_raw = document['10-K'][pos_dat['start'].loc['item7']:pos_dat['start'].loc['item7a']]
    # item_7_content = BeautifulSoup(item_7_raw, 'lxml')

    # item_7a_raw = document['10-K'][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]
    # item_7a_content = BeautifulSoup(item_7a_raw, 'lxml')

    item_1a_text = item_1a_content.get_text()
    # item_3_text = item_3_content.get_text()
    # item_7_text = item_7_content.get_text()
    # item_7a_text = item_7a_content.get_text()

    sentences = split_sentences_with_nltk(item_1a_text)
    # sentences += split_sentences_with_nltk(item_3_text)
    # sentences += split_sentences_with_nltk(item_7_text)
    # sentences += split_sentences_with_nltk(item_7a_text)
    
    return sentences

In [4]:
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019317000070/0000320193-17-000070.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019318000145/0000320193-18-000145.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019319000119/0000320193-19-000119.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019320000096/0000320193-20-000096.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019321000105/0000320193-21-000105.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/0000320193-22-000108.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/0000320193-23-000106.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/0000320193-24-000123.txt")

target_year = 2024

with open(f"../Edgar/10K_URL_TECH/All_10K_Filings_{target_year}.json", "r") as read_file:
    companies = json.load(read_file)

sentences = []
for company in companies:
    if company['company'] == "AAPL":
            try:
                print(f"Company: {company['company']}, Sentence count: {len(sentences)}")
                sentences += converttotext(company['url'])

            except Exception as e:
                print(f"Error processing {company['company']}: {e}")

# NVDA
# test1 = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/0000320193-22-000108.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1045810/000104581022000036/0001045810-22-000036.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/789019/000156459022026876/0001564590-22-026876.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1652044/000165204422000019/0001652044-22-000019.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1018724/000101872422000005/0001018724-22-000005.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1326801/000132680122000018/0001326801-22-000018.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1065280/000106528022000036/0001065280-22-000036.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1341439/000156459022023675/0001564590-22-023675.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/723125/000072312522000048/0000723125-22-000048.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1090872/000109087222000026/0001090872-22-000026.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/51143/000155837022001584/0001558370-22-001584.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/47217/000004721722000068/0000047217-22-000068.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1166691/000116669122000009/0001166691-22-000009.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/829224/000082922422000058/0000829224-22-000058.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/64803/000006480322000008/0000064803-22-000008.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1318605/000095017022000796/0000950170-22-000796.txt")


# print(test)
# print(sentences)

Company: AAPL, Sentence count: 0


In [5]:
import torch
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [6]:
embedding_model = SentenceTransformer("all-MiniLM-L12-v2").to(device)
embeddings = embedding_model.encode(sentences, show_progress_bar=True)

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

In [10]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

umap_model = UMAP(n_neighbors=15, n_components=20, min_dist=0.05, metric='cosine', low_memory=False)
hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=5, metric='euclidean', cluster_selection_method='leaf', prediction_data=True)
# sentence_model = SentenceTransformer("all-MiniLM-L12-v2")
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    min_topic_size=10,
    nr_topics=None
)
topics, probs = topic_model.fit_transform(sentences, embeddings=embeddings)

In [11]:
topic_model.get_topic_info().head(52)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,300,-1_the_to_or_and,"[the, to, or, and, of, companys, company, may,...",[Changes to the Company’s products and service...
1,0,57,0_information_data_security_confidential,"[information, data, security, confidential, ac...",[The Company has implemented systems and proce...
2,1,48,1_investigations_legal_litigation_against,"[investigations, legal, litigation, against, g...",[and certain of these arrangements are current...
3,2,45,2_thirdparty_content_applications_software,"[thirdparty, content, applications, software, ...",[software and digital content supplier relatio...
4,3,45,3_adversely_materially_affect_could,"[adversely, materially, affect, could, busines...",[all of which could materially adversely affec...
5,4,30,4_results_condition_operations_financial,"[results, condition, operations, financial, of...",[results of operations and financial condition...
6,5,27,5_components_prepayments_supply_longterm,"[components, prepayments, supply, longterm, ag...",[vendor non-trade receivables and prepayments ...
7,6,27,6_competition_competitive_competitors_markets,"[competition, competitive, competitors, market...","[competition,, competition,, the Company faces..."
8,7,24,7_foreign_exchange_dollar_currencies,"[foreign, exchange, dollar, currencies, intern...",[a strengthening of foreign currencies relativ...
9,8,24,8_addition_in_result_as,"[addition, in, result, as, aggregate, individu...","[In addition,, In addition,, In addition,]"


In [88]:
topic_model.visualize_topics()

In [89]:
topic_model.get_document_info(test)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,0,0_of_the_and_our,"[of, the, and, our, to, in, we, or, for, produ...",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,of - the - and - our - to - in - we - or - for...,0.875034,True
1,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,0,0_of_the_and_our,"[of, the, and, our, to, in, we, or, for, produ...",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,of - the - and - our - to - in - we - or - for...,1.0,True
2,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,0,0_of_the_and_our,"[of, the, and, our, to, in, we, or, for, produ...",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,of - the - and - our - to - in - we - or - for...,1.0,True
3,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,1,1_and_of_our_the,"[and, of, our, the, to, in, or, we, for, may]",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,and - of - our - the - to - in - or - we - for...,1.0,True
4,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,2,2_and_of_the_to,"[and, of, the, to, in, our, or, we, for, year]",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,and - of - the - to - in - our - or - we - for...,1.0,True
5,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,2,2_and_of_the_to,"[and, of, the, to, in, our, or, we, for, year]",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,and - of - the - to - in - our - or - we - for...,1.0,True
6,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,3,3_and_of_our_the,"[and, of, our, the, to, in, or, we, for, on]",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,and - of - our - the - to - in - or - we - for...,1.0,True
7,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,3,3_and_of_our_the,"[and, of, our, the, to, in, or, we, for, on]",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,and - of - our - the - to - in - or - we - for...,1.0,True
8,"ITEM 1A. RISK FACTORSIn evaluating NVIDIA, the...",1,1_and_of_our_the,"[and, of, our, the, to, in, or, we, for, may]",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,and - of - our - the - to - in - or - we - for...,1.0,True
9,"ITEM 1A. RISK FACTORSIn evaluating NVIDIA, the...",4,4_and_our_to_of,"[and, our, to, of, or, the, in, we, may, for]","[ITEM 1A. RISK FACTORSIn evaluating NVIDIA, th...",and - our - to - of - or - the - in - we - may...,1.0,True
