In [3]:
# Import requests to retrieve web URLs (for example HTML or TXT files)
import requests

# Import BeautifulSoup
from bs4 import BeautifulSoup

# import re module for REGEXes
import re

# import pandas
import pandas as pd

In [4]:
def _extract_10k_document(raw_filing, url):
    """Return the body of the document typed '10-K' inside a raw filing.

    The filing text is scanned for <DOCUMENT> ... </DOCUMENT> pairs and for
    <TYPE> lines; the three lists are paired positionally.

    Args:
        raw_filing: Full text of the downloaded submission file.
        url: Source URL, used only to build the error message.

    Returns:
        The text between <DOCUMENT> and </DOCUMENT> of the 10-K document.
        If several documents are typed '10-K', the last one is returned.

    Raises:
        KeyError: If no document in the filing is typed '10-K'.
    """
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')

    doc_starts = [m.end() for m in doc_start_pattern.finditer(raw_filing)]
    doc_ends = [m.start() for m in doc_end_pattern.finditer(raw_filing)]
    # strip() removes trailing blanks and the '\r' left behind by CRLF line
    # endings, which would otherwise make the comparison with '10-K' fail.
    doc_types = [t[len('<TYPE>'):].strip() for t in type_pattern.findall(raw_filing)]

    body = None
    for doc_type, doc_start, doc_end in zip(doc_types, doc_starts, doc_ends):
        if doc_type == '10-K':
            body = raw_filing[doc_start:doc_end]

    if body is None:
        # Same exception type as the former dictionary lookup, clearer message.
        raise KeyError(f"no <TYPE>10-K document found in filing at {url}")
    return body


def _locate_item_headings(filing_10k):
    """Return (item, start) pairs for the 'Item N' headings of a 10-K body.

    Item labels are the lower-cased number plus optional letter (for example
    '1a', '7', '8'). When a label occurs more than once only its last
    occurrence is kept (presumably to skip table-of-contents entries).

    Args:
        filing_10k: Raw text of the 10-K document.

    Returns:
        A list of (item, start_offset) tuples ordered by start_offset.
    """
    heading_pattern = re.compile(r'(>Item(\s|&#160;|&nbsp;)(\d+[A-Za-z]?)[\.]?)')

    # finditer yields matches in ascending position, so a later occurrence of
    # the same label simply overwrites the earlier offset.
    last_start = {}
    for match in heading_pattern.finditer(filing_10k):
        last_start[match.group(3).lower()] = match.start()

    return sorted(last_start.items(), key=lambda pair: pair[1])


def converttotext(url, output_path="../10K-TXT/10k.txt", timeout=30):
    """Download an SEC filing, extract its 10-K items and save them as text.

    The 10-K document is cut into sections at each 'Item N' heading; every
    section runs up to the start of the next heading (the last one runs to the
    end of the document). Each section is converted from HTML to text with
    BeautifulSoup and the sections are written to ``output_path`` in the form
    ``Item <label>: <text>``, separated by blank lines.

    Args:
        url: URL of the full-submission text file to download.
        output_path: File the extracted text is written to (UTF-8).
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        None. The result is written to ``output_path``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the filing contains no document typed '10-K'.
        ValueError: If the 10-K document contains no item headings.
    """
    headers = {'User-Agent':'Sample Company Name AdminContact@<sample company domain>.com','Accept-Encoding':'gzip, deflate','Host':'www.sec.gov'}

    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail on HTTP errors instead of parsing an error page as if it were a filing.
    response.raise_for_status()

    filing_10k = _extract_10k_document(response.text, url)

    headings = _locate_item_headings(filing_10k)
    if not headings:
        raise ValueError(f"no 'Item N' headings found in the 10-K document at {url}")

    # Labels are bare ('8', never 'item8'), so no item is special-cased: every
    # section ends where the next heading begins.
    sections = []
    for index, (item, start_pos) in enumerate(headings):
        if index + 1 < len(headings):
            end_pos = headings[index + 1][1]
        else:
            end_pos = len(filing_10k)

        # Strip the HTML markup from the raw section.
        content = BeautifulSoup(filing_10k[start_pos:end_pos], 'lxml').get_text("\n\n")
        sections.append(f"Item {item.capitalize()}: {content}")

    financial_data = "\n\n".join(sections)

    # Explicit UTF-8 so non-ASCII characters do not depend on the locale default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(financial_data)

In [5]:
# Full-submission text file on EDGAR to convert into plain text.
filing_url = "https://www.sec.gov/Archives/edgar/data/320193/000032019318000145/0000320193-18-000145.txt"
converttotext(filing_url)