In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import json
import nltk
import matplotlib.pyplot as plt
from bertopic import BERTopic
from hdbscan import HDBSCAN
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
nltk.download('punkt')

def clean_text(text):
    # Replace \n with a space
    text = re.sub(r'\n', ' ', text)

    # Replace encoded apostrophes and quotes with plain text equivalents
    text = text.replace('\x92', "'")  # Right single quote
    text = text.replace('\x93', '"')  # Left double quote
    text = text.replace('\x94', '"')  # Right double quote
    text = text.replace('\xa0', ' ')  # Non-breaking space

    # Remove other \x.. sequences
    text = re.sub(r'\\x[0-9A-Fa-f]{2}', '', text)
    
    # Fix spacing issues around periods and capitalization
    text = re.sub(r'\s*\.\s*', '. ', text)
    text = re.sub(r'(Item\s*\d+[A-Za-z]*)\.', r'\1. ', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    text = re.sub(r'(?<=[.!?])(?=\S)', r' ', text)
    
    return text.strip()

def split_sentences_with_nltk(text):
    # Initial cleaning
    text = clean_text(text)

    # Use nltk's sentence tokenizer
    sentences = sent_tokenize(text)

    # Post-process to remove redundant or empty sentences
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # sentences = [sent_tokenize(abstract) for abstract in text]
    return sentences

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ajaym\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# headers = {'User-Agent':'Sample Company Name AdminContact@<sample company domain>.com','Accept-Encoding':'gzip, deflate','Host':'www.sec.gov'}

# r = requests.get("https://www.sec.gov/Archives/edgar/data/320193/000032019319000119/0000320193-19-000119.txt", headers=headers)

# raw_10k = r.text

# doc_start_pattern = re.compile(r'<DOCUMENT>')
# doc_end_pattern = re.compile(r'</DOCUMENT>')
# type_pattern = re.compile(r'<TYPE>[^\n]+')

# doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
# doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

# doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

# document = {}

# for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
#     if doc_type == '10-K':
#         document[doc_type] = raw_10k[doc_start:doc_end]

# # print(document['10-K'][:500])

# regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|2|3|6|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|2|3|6|7A|7|8))')

# matches = regex.finditer(document['10-K'])

# # for match in matches:
# #     print(match)

# # for match in matches:
# #     print(match.group(), match.start(), match.end())

# test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

# test_df.columns = ['item', 'start', 'end']
# test_df['item'] = test_df.item.str.lower()

# test_df.head()

# test_df.replace('&#160;',' ',regex=True,inplace=True)
# test_df.replace('&nbsp;',' ',regex=True,inplace=True)
# test_df.replace(' ','',regex=True,inplace=True)
# test_df.replace('\.','',regex=True,inplace=True)
# test_df.replace('>','',regex=True,inplace=True)

# # test_df.head()
# # print(test_df)

# pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
# pos_dat.set_index('item', inplace=True)

# # pos_dat

# item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]
# item_1b_raw = document['10-K'][pos_dat['start'].loc['item1b']:pos_dat['start'].loc['item2']]
# item_2_raw = document['10-K'][pos_dat['start'].loc['item2']:pos_dat['start'].loc['item3']]
# item_3_raw = document['10-K'][pos_dat['start'].loc['item3']:pos_dat['start'].loc['item6']]
# item_6_raw = document['10-K'][pos_dat['start'].loc['item6']:pos_dat['start'].loc['item7']]
# item_7_raw = document['10-K'][pos_dat['start'].loc['item7']:pos_dat['start'].loc['item7a']]
# item_7a_raw = document['10-K'][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]
# item_8_raw = document['10-K'][pos_dat['start'].loc['item8']:]

# item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')
# item_1b_content = BeautifulSoup(item_1b_raw, 'lxml')
# item_2_content = BeautifulSoup(item_2_raw, 'lxml')
# item_3_content = BeautifulSoup(item_3_raw, 'lxml')
# item_6_content = BeautifulSoup(item_6_raw, 'lxml')
# item_7_content = BeautifulSoup(item_7_raw, 'lxml')
# item_7a_content = BeautifulSoup(item_7a_raw, 'lxml')
# item_8_content = BeautifulSoup(item_8_raw, 'lxml')

# item_1a_text = item_1a_content.get_text()
# item_1b_text = item_1b_content.get_text()
# item_2_text = item_2_content.get_text()
# item_3_text = item_3_content.get_text()
# item_6_text = item_6_content.get_text()
# item_7_text = item_7_content.get_text()
# item_7a_text = item_7a_content.get_text()
# item_8_text = item_8_content.get_text()

# text_1a = clean_text(item_1a_text)
# text_1b = clean_text(item_1b_text)
# text_2 = clean_text(item_2_text)
# text_3 = clean_text(item_3_text)
# text_6 = clean_text(item_6_text)
# text_7 = clean_text(item_7_text)
# text_7a = clean_text(item_7a_text)
# text_8 = clean_text(item_8_text)

# sentences = split_sentences_with_nltk(item_1a_text)
# sentences += split_sentences_with_nltk(item_1b_text)
# sentences += split_sentences_with_nltk(item_2_text)
# sentences += split_sentences_with_nltk(item_3_text)
# sentences += split_sentences_with_nltk(item_6_text)
# sentences += split_sentences_with_nltk(item_7_text)
# sentences += split_sentences_with_nltk(item_7a_text)
# sentences += split_sentences_with_nltk(item_8_text)

In [5]:
def converttotext(url):
    headers = {'User-Agent':'Sample Company Name AdminContact@<sample company domain>.com','Accept-Encoding':'gzip, deflate','Host':'www.sec.gov'}

    r = requests.get(url, headers=headers)

    raw_10k = r.text

    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')

    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

    document = {}
    
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            document[doc_type] = raw_10k[doc_start:doc_end]

    # regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')
    regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|2|3|6|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|2|3|6|7A|7|8))')

    matches = regex.finditer(document['10-K'])

    # for match in matches:
    #     print(match)

    test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

    test_df.columns = ['item', 'start', 'end']
    test_df['item'] = test_df.item.str.lower()
    # test_df.head()
    test_df.replace('&#160;',' ',regex=True,inplace=True)
    test_df.replace('&nbsp;',' ',regex=True,inplace=True)
    test_df.replace(' ','',regex=True,inplace=True)
    test_df.replace('\.','',regex=True,inplace=True)
    test_df.replace('>','',regex=True,inplace=True)

    # test_df.head()

    pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
    pos_dat.set_index('item', inplace=True)

    item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]
    item_1b_raw = document['10-K'][pos_dat['start'].loc['item1b']:pos_dat['start'].loc['item2']]
    item_2_raw = document['10-K'][pos_dat['start'].loc['item2']:pos_dat['start'].loc['item3']]
    item_3_raw = document['10-K'][pos_dat['start'].loc['item3']:pos_dat['start'].loc['item6']]
    item_6_raw = document['10-K'][pos_dat['start'].loc['item6']:pos_dat['start'].loc['item7']]
    item_7_raw = document['10-K'][pos_dat['start'].loc['item7']:pos_dat['start'].loc['item7a']]
    item_7a_raw = document['10-K'][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]
    item_8_raw = document['10-K'][pos_dat['start'].loc['item8']:]

    item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')
    item_1b_content = BeautifulSoup(item_1b_raw, 'lxml')
    item_2_content = BeautifulSoup(item_2_raw, 'lxml')
    item_3_content = BeautifulSoup(item_3_raw, 'lxml')
    item_6_content = BeautifulSoup(item_6_raw, 'lxml')
    item_7_content = BeautifulSoup(item_7_raw, 'lxml')
    item_7a_content = BeautifulSoup(item_7a_raw, 'lxml')
    item_8_content = BeautifulSoup(item_8_raw, 'lxml')
    
    # item_1a_content.prettify()
    # item_7_content.prettify()
    # item_7a_content.prettify()

    item_1a_text = item_1a_content.get_text()
    item_1b_text = item_1b_content.get_text()
    item_2_text = item_2_content.get_text()
    item_3_text = item_3_content.get_text()
    item_6_text = item_6_content.get_text()
    item_7_text = item_7_content.get_text()
    item_7a_text = item_7a_content.get_text()
    item_8_text = item_8_content.get_text()

    text_1a = clean_text(item_1a_text)
    text_1b = clean_text(item_1b_text)
    text_2 = clean_text(item_2_text)
    text_3 = clean_text(item_3_text)
    text_6 = clean_text(item_6_text)
    text_7 = clean_text(item_7_text)
    text_7a = clean_text(item_7a_text)
    text_8 = clean_text(item_8_text)

    sentences = split_sentences_with_nltk(text_1a)
    # sentences += split_sentences_with_nltk(text_1b)
    # sentences += split_sentences_with_nltk(text_2)
    # sentences += split_sentences_with_nltk(text_3)
    # sentences += split_sentences_with_nltk(text_6)
    sentences += split_sentences_with_nltk(text_7)
    sentences += split_sentences_with_nltk(text_7a)
    # sentences += split_sentences_with_nltk(text_8)
    
    return sentences

In [31]:
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019317000070/0000320193-17-000070.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019318000145/0000320193-18-000145.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019319000119/0000320193-19-000119.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019320000096/0000320193-20-000096.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019321000105/0000320193-21-000105.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/0000320193-22-000108.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/0000320193-23-000106.txt")
# sentences = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/0000320193-24-000123.txt")

target_year = 2020

with open(f"../Edgar/10K_URL_TECH/All_10K_Filings_{target_year}.json", "r") as read_file:
    companies = json.load(read_file)

sentences = []
for company in companies:
            try:
                sentences += converttotext(company['url'])

            except Exception as e:
                print(f"Error processing {company['company']}: {e}")

# NVDA
# test1 = converttotext("https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/0000320193-22-000108.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1045810/000104581022000036/0001045810-22-000036.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/789019/000156459022026876/0001564590-22-026876.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1652044/000165204422000019/0001652044-22-000019.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1018724/000101872422000005/0001018724-22-000005.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1326801/000132680122000018/0001326801-22-000018.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1065280/000106528022000036/0001065280-22-000036.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1341439/000156459022023675/0001564590-22-023675.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/723125/000072312522000048/0000723125-22-000048.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1090872/000109087222000026/0001090872-22-000026.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/51143/000155837022001584/0001558370-22-001584.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/47217/000004721722000068/0000047217-22-000068.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1166691/000116669122000009/0001166691-22-000009.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/829224/000082922422000058/0000829224-22-000058.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/64803/000006480322000008/0000064803-22-000008.txt")
# test1 += converttotext("https://www.sec.gov/Archives/edgar/data/1318605/000095017022000796/0000950170-22-000796.txt")


# print(test)
# print(sentences)

Error processing T: 'item1b'
Error processing HIG: 'item1b'


In [32]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L12-v2")
embeddings = embedding_model.encode(sentences, show_progress_bar=True)

Batches:   0%|          | 0/438 [00:00<?, ?it/s]

In [33]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

umap_model = UMAP(n_neighbors=5, n_components=15, min_dist=0.01, metric='cosine', low_memory=False)
hdbscan_model = HDBSCAN(min_cluster_size=3, min_samples=2, metric='euclidean', cluster_selection_method='leaf', prediction_data=True)
# sentence_model = SentenceTransformer("all-MiniLM-L12-v2")
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, min_topic_size=3, nr_topics=50)
topics, probs = topic_model.fit_transform(sentences, embeddings=embeddings)

In [34]:
topic_model.get_topic_info().head(52)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2839,-1_and_of_the_our,"[and, of, the, our, to, in, or, we, as, for]",[We and others are subject to a variety of law...
1,0,2297,0_or_our_and_may,"[or, our, and, may, to, could, we, business, b...","[Events that adversely affect that trust, incl..."
2,1,1112,1_2019_2018_revenue_expenses,"[2019, 2018, revenue, expenses, operating, cas...",[2019 Compared to 2018 Cost of energy generati...
3,2,919,2_percent_compared_prioryear_2019,"[percent, compared, prioryear, 2019, 2018, 15,...","[1% in 2019 compared to 2018., 5% in 2019 comp..."
4,3,896,3_billion_notes_senior_million,"[billion, notes, senior, million, 2019, due, 2...","[0 billion and $4., 3 billion and 2., 5 billio..."
5,4,822,4_statements_financial_the_accounting,"[statements, financial, the, accounting, of, c...",[See Note 4 Long-term Debt in the accompanying...
6,5,799,5_and_our_services_advertising,"[and, our, services, advertising, products, to...","[These statements include, among other things,..."
7,6,520,6_tax_income_taxes_the,"[tax, income, taxes, the, in, provision, juris...",[Income Taxes Our effective tax rate is subjec...
8,7,453,7_stock_common_class_shares,"[stock, common, class, shares, share, repurcha...","[3 million shares of common stock for $1., 0 m..."
9,8,407,8_interest_credit_debt_securities,"[interest, credit, debt, securities, rate, cas...",[Interest Rate Sensitivity Our exposure to cha...


In [88]:
topic_model.visualize_topics()

In [89]:
topic_model.get_document_info(test)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,0,0_of_the_and_our,"[of, the, and, our, to, in, we, or, for, produ...",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,of - the - and - our - to - in - we - or - for...,0.875034,True
1,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,0,0_of_the_and_our,"[of, the, and, our, to, in, we, or, for, produ...",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,of - the - and - our - to - in - we - or - for...,1.0,True
2,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,0,0_of_the_and_our,"[of, the, and, our, to, in, we, or, for, produ...",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,of - the - and - our - to - in - we - or - for...,1.0,True
3,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,1,1_and_of_our_the,"[and, of, our, the, to, in, or, we, for, may]",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,and - of - our - the - to - in - or - we - for...,1.0,True
4,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,2,2_and_of_the_to,"[and, of, the, to, in, our, or, we, for, year]",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,and - of - the - to - in - our - or - we - for...,1.0,True
5,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,2,2_and_of_the_to,"[and, of, the, to, in, our, or, we, for, year]",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,and - of - the - to - in - our - or - we - for...,1.0,True
6,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,3,3_and_of_our_the,"[and, of, our, the, to, in, or, we, for, on]",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,and - of - our - the - to - in - or - we - for...,1.0,True
7,ITEM 1A. RISK FACTORSIn evaluating NVIDIA and ...,3,3_and_of_our_the,"[and, of, our, the, to, in, or, we, for, on]",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,and - of - our - the - to - in - or - we - for...,1.0,True
8,"ITEM 1A. RISK FACTORSIn evaluating NVIDIA, the...",1,1_and_of_our_the,"[and, of, our, the, to, in, or, we, for, may]",[ITEM 1A. RISK FACTORSIn evaluating NVIDIA and...,and - of - our - the - to - in - or - we - for...,1.0,True
9,"ITEM 1A. RISK FACTORSIn evaluating NVIDIA, the...",4,4_and_our_to_of,"[and, our, to, of, or, the, in, we, may, for]","[ITEM 1A. RISK FACTORSIn evaluating NVIDIA, th...",and - our - to - of - or - the - in - we - may...,1.0,True
