In [None]:
# Libraries
from bs4 import BeautifulSoup
from sec_edgar_downloader import Downloader
import glob
import shutil
import re
import datefinder
import numpy as np

In [None]:
# Initialize SEC Downloader
# `dl` is the client the download cell calls `dl.get(...)` on. Later cells read
# from and delete a `sec-edgar-filings` directory relative to the working
# directory; presumably that is where this downloader saves filings — confirm
# for the installed sec-edgar-downloader version.
# NOTE(review): some releases of the package may require constructor arguments
# here — confirm against the pinned version.
dl = Downloader()

In [None]:
# Download every 10-K filed by `company` inside the date window below.
# Note: the `after` and `before` strings must be in the form "YYYY-MM-DD".
company = 'GOOG'
after_yr, before_yr = 2000, 2021

after_date = f'{after_yr}-01-01'
# NOTE(review): the upper bound is January 21st, not January 1st — confirm this is intended.
before_date = f'{before_yr}-01-21'
dl.get("10-K", company, after=after_date, before=before_date)

# Alternative calls kept for reference:
#   dl.get('10-K', company, amount=1)   # a single filing
#   dl.get("10-K", "AAPL")              # no date window

In [None]:
# Identify File Paths
# The ticker must match the one fetched by the download cell ('GOOG'); with any
# other ticker nothing on disk matches and `file` comes back empty.
company = 'GOOG'
# With recursive=False the `**` segment matches a single directory level, just
# like `*`. glob returns paths in arbitrary order, so sort them to make
# positional indexing into `file` reproducible across runs and machines.
file = sorted(
    glob.glob(f"sec-edgar-filings/{company}/10-K/**/*.html", recursive=False)
)

In [None]:
# Load 10-K HTML
def load_html(fpath, encoding='utf-8', errors='replace'):
    """Read a file from disk and return its full contents as a string.

    The file is opened in text mode with an explicit encoding so the result
    does not depend on the machine's locale default.

    Args:
        fpath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8.
        errors: Decoding error policy passed to ``open``. The default
            ``'replace'`` substitutes U+FFFD for undecodable bytes instead of
            raising ``UnicodeDecodeError``.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with open(fpath, 'r', encoding=encoding, errors=errors) as f:
        return f.read()
# NOTE(review): index 8 is hard-coded. This raises IndexError when fewer than
# nine paths were matched, and which filing it selects depends on the order of
# `file`.
filing = load_html(file[8])

In [None]:
# Parse the filing markup
soup = BeautifulSoup(filing, 'html.parser')

# Collect lower-cased, stripped text per <font> tag; when the document has no
# <font> tags, fall back to <span> tags
txt = (
    [tag.get_text().lower().strip() for tag in soup.find_all('font')]
    or [tag.get_text().lower().strip() for tag in soup.find_all('span')]
)
# Normalise each chunk: non-breaking spaces become plain spaces, the chunk is
# stripped again, then newlines are removed
txt = [re.sub('\n', '', chunk.replace('\xa0', ' ').strip()) for chunk in txt]

# Positions of chunks mentioning the Item 7 / Item 7A markers
idx_start = [pos for pos, chunk in enumerate(txt) if 'item 7.' in chunk]
idx_end = [pos for pos, chunk in enumerate(txt) if 'item 7a.' in chunk]
# Starts and ends are paired positionally; the pair with the widest span wins
span_widths = [end - start for start, end in zip(idx_start, idx_end)]
idx_item = np.argmax(span_widths)
items = txt[idx_start[idx_item]:idx_end[idx_item]]

# Discard empty chunks and chunks that mention '10-k'
items = [chunk for chunk in items if chunk and '10-k' not in chunk]

In [None]:
# Parse & Clean Sections for Item 7
def get_sections(items: list):
    """Group consecutive text chunks into sections.

    Chunks are accumulated into a buffer. A chunk is kept when it ends with a
    period, contains at least one alphabetic character, or is a four-digit
    number; anything else is skipped. The buffer is closed off as a section
    when the *next* chunk looks like a heading (starts with a letter and
    contains no period) and the last kept chunk ends with a period.

    Args:
        items: Text chunks in document order.

    Returns:
        A list of section strings with runs of whitespace collapsed to single
        spaces. A section whose text is exactly ``'item 7.'`` is dropped.
    """
    sections = []
    buffer = []
    last_idx = len(items) - 1

    for i, item in enumerate(items):
        keep = (
            item.endswith('.')
            or any(ch.isalpha() for ch in item)
            or (item.isnumeric() and len(item) == 4)
        )
        if keep:
            buffer.append(item)

        if i < last_idx:
            upcoming = items[i + 1]
            # Short-circuiting `and` keeps an empty upcoming chunk or an empty
            # buffer from raising IndexError.
            looks_like_heading = (
                bool(upcoming) and upcoming[0].isalpha() and '.' not in upcoming
            )
            if looks_like_heading and buffer and buffer[-1].endswith('.'):
                sections.append(' '.join(buffer))
                buffer = []

    # Flush whatever is left so the final section is not silently dropped.
    if buffer:
        sections.append(' '.join(buffer))

    sections = [s for s in sections if s != 'item 7.']
    return [' '.join(s.split()) for s in sections]

# Group the cleaned Item 7 chunks in `items` into section strings
sections = get_sections(items)

In [None]:
# Get Full Text Split By Paragraph Indicators
# Sections are joined into one string with a blank line ('\n\n') between them.
split_txt = '\n\n'.join(sections)

In [None]:
# Summarise Item 7: number of sections and mean whitespace-delimited token count
tokens_per_section = [len(section.split()) for section in sections]
print(f'Length of Item: {len(sections)} Sections')
avg_len = np.mean(tokens_per_section)
print(f'Average Section Length: {avg_len:.2f} Tokens')

In [None]:
# Get Date
def get_date(txt: str):
    """Extract the date that follows the phrase 'fiscal year ended'.

    The three whitespace-separated tokens after the first occurrence of the
    phrase are taken as the date. Non-digit characters are stripped from the
    third token, the tokens are re-joined with spaces, and the result is
    passed through ``str.capitalize``.

    The words of the phrase may be separated by any whitespace (spaces,
    newlines, non-breaking spaces), since text extracted from HTML often
    breaks the phrase across lines. Matching is case-sensitive and expects
    lower-case text.

    Args:
        txt: Lower-cased document text.

    Returns:
        The date string, e.g. ``'June 30, 2020'``.

    Raises:
        IndexError: If the phrase is absent or fewer than three tokens follow it.
    """
    match = re.search(r'fiscal\s+year\s+ended', txt)
    if match is None:
        raise IndexError("phrase 'fiscal year ended' not found in text")

    date = txt[match.end():].split()[0:3]
    if len(date) < 3:
        raise IndexError("fewer than three tokens follow 'fiscal year ended'")

    # Raw string: '\D' in a plain literal is an invalid escape sequence.
    date[2] = re.sub(r'\D', '', date[2])
    return ' '.join(date).capitalize()

# Lower-case the whole document text first: get_date looks for the lower-case
# phrase 'fiscal year ended'.
full_txt = soup.get_text().lower()
date = get_date(full_txt)
print(date)

In [None]:
# Delete Downloaded Directory
# Tolerate a missing directory so the cell can be re-run (or run when nothing
# was downloaded) without raising; any other failure still propagates.
try:
    shutil.rmtree('sec-edgar-filings')
except FileNotFoundError:
    pass