In [22]:
# Import requests to retrieve web URLs (e.g. HTML or TXT documents)
import requests

# Import BeautifulSoup
from bs4 import BeautifulSoup

# import re module for REGEXes
import re

# import pandas
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize

In [27]:
def converttotext(url, output_path="../10K-TXT/10k.txt", timeout=30):
    """Download an EDGAR submission, extract selected 10-K items, and save them as text.

    The response body is split on ``<DOCUMENT>`` / ``</DOCUMENT>`` markers and the
    document whose ``<TYPE>`` line is ``10-K`` is kept. Headings for Items 1A, 1B,
    7, 7A and 8 are located with a regex; for each item the *last* heading
    occurrence is used as the section start. Each section is stripped of markup
    with BeautifulSoup, the sections are joined into one string, that string is
    written to ``output_path``, and a regex-based sentence split of it is printed.

    Parameters
    ----------
    url : str
        Address of the submission text file. The request pins the ``Host``
        header to ``www.sec.gov``, so the URL is expected to point at that host.
    output_path : str, optional
        Where the extracted text is written (UTF-8). Missing parent directories
        are created. Defaults to the path the function always used.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the response contains no ``10-K`` document, or that document
        contains none of the expected item headings.
    """
    import os  # local import: only needed to prepare the output directory

    headers = {'User-Agent':'Sample Company Name AdminContact@<sample company domain>.com','Accept-Encoding':'gzip, deflate','Host':'www.sec.gov'}

    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a filing.
    r.raise_for_status()

    raw_10k = r.text

    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')

    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

    # strip() drops a trailing '\r' (CRLF files) that would defeat the
    # equality test against '10-K' below.
    doc_types = [x[len('<TYPE>'):].strip() for x in type_pattern.findall(raw_10k)]

    # Keep only the 10-K document; if several are present the last one wins.
    document = {}
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            document[doc_type] = raw_10k[doc_start:doc_end]

    if '10-K' not in document:
        raise ValueError(f"No <TYPE>10-K document found in {url!r}")
    filing = document['10-K']

    # Regex to find relevant sections like "Item 1A", "Item 7", "Item 7A", etc.
    regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

    # Reduce a matched heading to a key such as 'item1a': drop the HTML
    # non-breaking-space entities, any whitespace, dots, and the leading '>'.
    heading_noise = re.compile(r'&#160;|&nbsp;|\s|\.|>')

    # finditer yields matches in ascending position, so overwriting keeps the
    # last occurrence of each item (earlier ones are typically not the section
    # body, e.g. a table of contents -- assumption, not verified here).
    last_start = {}
    for match in regex.finditer(filing):
        key = heading_noise.sub('', match.group().lower())
        last_start[key] = match.start()

    if not last_start:
        raise ValueError(f"No Item 1A/1B/7/7A/8 headings found in the 10-K at {url!r}")

    # Sections in document order: (item key, start offset).
    ordered = sorted(last_start.items(), key=lambda pair: pair[1])

    items_content = {}
    for position, (item, start_pos) in enumerate(ordered):
        is_last = position + 1 == len(ordered)
        # Item 8 always runs to the end of the document; any other item runs
        # up to the start of the next located item.
        if item == 'item8' or is_last:
            end_pos = len(filing)
        else:
            end_pos = ordered[position + 1][1]

        raw_content = filing[start_pos:end_pos]
        # The first regex alternative anchors on the '>' that closes the
        # preceding tag; drop it so it does not leak into the text.
        if raw_content.startswith('>'):
            raw_content = raw_content[1:]

        # Clean the raw HTML content using BeautifulSoup
        soup = BeautifulSoup(raw_content, 'lxml')
        clean_text = soup.get_text(separator="\n").strip()
        items_content[item] = clean_text

    # Label format kept as-is for existing consumers of the file
    # (keys already start with "item", hence "Item ITEM1A:").
    financial_data = "\n\n".join([f"Item {item.upper()}:\n{content}" for item, content in items_content.items()])
    sentence_splitter = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')

    # Split the financial data into sentences
    sentences = sentence_splitter.split(financial_data)

    print(sentences)

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Explicit UTF-8: the text contains non-ASCII characters (curly quotes,
    # non-breaking spaces) that a locale-default encoding may not handle.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(financial_data)

In [28]:
# Run the converter on a single full-submission text file from the EDGAR archive.
FILING_URL = "https://www.sec.gov/Archives/edgar/data/320193/000032019318000145/0000320193-18-000145.txt"
converttotext(FILING_URL)

['Item ITEM1A:\n>Item 1A.', 'Risk Factors\nThe following discussion of risk factors contains forward-looking statements.', 'These risk factors may be important to understanding other statements in this Form 10-K.', 'The following information should be read in conjunction with Part II, Item\xa07, “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and the consolidated financial statements and related notes in Part II, Item\xa08, “Financial Statements and Supplementary Data” of this Form 10-K.', 'The business, financial condition and operating results of the Company can be affected by a number of factors, whether currently known or unknown, including but not limited to those described below, any one or more of which could, directly or indirectly, cause the Company’s actual financial condition and operating results to vary materially from past, or from anticipated future, financial condition and operating results.', 'Any of these factors, in whole or in