In [1]:
# Import requests to retrieve web URLs (e.g. HTML or TXT resources)
import requests

# Import BeautifulSoup
from bs4 import BeautifulSoup

# import re module for REGEXes
import re

# import pandas
import pandas as pd

In [2]:
def _extract_10k_document(raw_filing):
    """Return the text enclosed by the <DOCUMENT> tags of the ``10-K`` document.

    Every ``<DOCUMENT>`` ... ``</DOCUMENT>`` span in ``raw_filing`` is inspected
    and the first ``<TYPE>`` line found inside that span is compared with
    ``'10-K'``. The type is stripped of surrounding whitespace first, so a
    trailing carriage return does not defeat the comparison. If several spans
    qualify, the last one wins.

    Args:
        raw_filing: full text of the downloaded filing.

    Returns:
        The substring between ``<DOCUMENT>`` and ``</DOCUMENT>`` (tags excluded).

    Raises:
        KeyError: if no span is typed ``10-K``.
    """
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')

    doc_starts = [m.end() for m in doc_start_pattern.finditer(raw_filing)]
    doc_ends = [m.start() for m in doc_end_pattern.finditer(raw_filing)]

    body = None
    for doc_start, doc_end in zip(doc_starts, doc_ends):
        # Look for the type tag inside this span only, so a stray <TYPE>
        # elsewhere in the file cannot shift the pairing.
        type_match = type_pattern.search(raw_filing, doc_start, doc_end)
        if type_match is None:
            continue
        doc_type = type_match.group()[len('<TYPE>'):].strip()
        if doc_type == '10-K':
            body = raw_filing[doc_start:doc_end]

    if body is None:
        raise KeyError("no <DOCUMENT> with <TYPE> 10-K found in the filing")
    return body


def _locate_item_headings(filing_html):
    """Map normalised item keys to the offset of their last heading.

    Headings are matched with the pattern ``>Item<sep><id>.`` where ``<sep>`` is
    one whitespace character, ``&#160;`` or ``&nbsp;`` and ``<id>`` is one of
    1A, 1B, 7A, 7, 8. Keys are built from the captured id (``'item1a'``,
    ``'item7'``, ...), so the kind of separator used in the HTML is irrelevant.

    When a heading occurs more than once the last occurrence is kept,
    presumably so that table-of-contents entries are skipped in favour of the
    section headings themselves (verify against the filings you process).

    Args:
        filing_html: body of the 10-K document.

    Returns:
        dict of key -> offset of the ``I`` of ``Item`` (one past the ``>``).

    Raises:
        ValueError: if no heading matches at all.
    """
    regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})')

    positions = {}
    # finditer yields matches in ascending order, so later ones overwrite earlier ones.
    for match in regex.finditer(filing_html):
        key = 'item' + match.group(3).lower()
        # +1 skips the '>' that closes the preceding tag: sections then start
        # at the heading text and end right after a complete tag.
        positions[key] = match.start() + 1

    if not positions:
        raise ValueError("no 'Item 1A/1B/7/7A/8' headings found in the 10-K document")
    return positions


def converttotext(url, output_path="../10K-TXT/10k.txt"):
    """Download a filing, extract Items 1A, 7 and 7A, and write them as text.

    The 10-K document is cut out of the downloaded filing, the three sections
    are sliced between consecutive item headings (1A..1B, 7..7A, 7A..8), HTML
    is removed with BeautifulSoup, and the result is written to ``output_path``
    as ``"Item 1A: ..."``, ``"Item 7: ..."``, ``"Item 7A: ..."`` blocks, each
    followed by a blank line.

    Args:
        url: address of the filing text. The request is sent with a fixed
            ``Host: www.sec.gov`` header.
        output_path: destination file; missing parent directories are created.
            Defaults to the location the function has always written to.

    Returns:
        None. The result is the file written to ``output_path``.

    Raises:
        KeyError: if the filing has no 10-K document or a required item
            heading is missing.
        ValueError: if no item heading is found at all.
        requests.HTTPError: if the server answers with an error status.
    """
    import os

    # NOTE(review): the User-Agent is a placeholder contact string; replace it
    # with real contact details before running against the live service.
    headers = {'User-Agent':'Sample Company Name AdminContact@<sample company domain>.com','Accept-Encoding':'gzip, deflate','Host':'www.sec.gov'}

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status reports HTTP errors at the point they happen.
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()

    filing_10k = _extract_10k_document(r.text)
    heading_at = _locate_item_headings(filing_10k)

    # (label written to the file, key where the section starts, key where it ends)
    sections = (
        ("Item 1A", 'item1a', 'item1b'),
        ("Item 7", 'item7', 'item7a'),
        ("Item 7A", 'item7a', 'item8'),
    )

    required = {key for _, first, last in sections for key in (first, last)}
    missing = sorted(required - set(heading_at))
    if missing:
        raise KeyError("item heading(s) not found in the 10-K document: " + ", ".join(missing))

    parts = []
    for label, first, last in sections:
        section_html = filing_10k[heading_at[first]:heading_at[last]]
        # Use BeautifulSoup to strip the HTML tags from the raw section
        section_text = BeautifulSoup(section_html, 'lxml').get_text("\n\n")
        parts.append(label + ": " + section_text + "\n\n")
    financial_data = "".join(parts)

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit encoding: filings contain non-ASCII characters and the platform
    # default encoding may not be able to represent them.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(financial_data)

In [3]:
# Run the converter on a .txt filing hosted on www.sec.gov; the function
# writes the extracted sections to disk.
converttotext(
    url="https://www.sec.gov/Archives/edgar/data/320193/000032019318000145/0000320193-18-000145.txt"
)