# Download all 10K Filings of SP500 Firms

In [1]:
import pandas as pd 
from pandas import DataFrame
from requests_html import HTMLSession
import os
import requests

In [6]:
# Fetch the Wikipedia page listing the S&P 500 constituents.
session = HTMLSession()
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
response = session.get(url)
# The first table on the page is the constituents table we need.
constituents_table = response.html.find('table')[0]
# Each 'tr' element is one table row; the first row is the header, so skip it.
data_rows = constituents_table.find('tr')[1:]
# The ticker is the text of the first 'td' cell of each row. The SEC URL does
# not accept periods and uses '-' in their place, so replicate that here
# (str.replace leaves tickers without a period unchanged).
tickers = [row.find('td')[0].text.replace('.', '-') for row in data_rows]

In [3]:
# Spot-check the first scraped ticker.
tickers[0]

'MMM'

In [4]:
# Count the scraped tickers to confirm the whole table was read.
len(tickers)

505

In [5]:
#yup. now let's get the URL that leads directly to each company's 10-K
# SEC website form:
#'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=TICKER_GOES_HERE'
import math
def get10K(ticker):
    """Return the URL of a company's most recent 10-K filed before 2020-03-01.

    Queries the SEC EDGAR company browser through the module-level
    ``session``, follows the newest matching filing to its index page, and
    returns the link found in the second row of that page's first table.

    Args:
        ticker: Value placed in the ``CIK`` query parameter of the EDGAR
            search URL (callers pass tickers with '.' replaced by '-').

    Returns:
        The document URL as a string, with any ``'ix?doc=/'`` viewer segment
        removed, or the string ``'-1'`` when no usable 10-K link could be
        located at any step.
    """
    no_filing = '-1'

    # Restrict the search to 10-K filings dated before 03/01/2020; only the
    # first result row is used, so ten rows per page is plenty.
    sec = session.get(
        'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK='
        + ticker
        + '&type=10-K&dateb=20200301&owner=include&count=10&search_text='
    )

    # The results table is the third table on the page. When the company has
    # no filings matching the URL constraints, that table is missing.
    tables = sec.html.find('table')
    if len(tables) <= 2:
        return no_filing
    sec_search_table = tables[2]

    # Sometimes the table is present in the HTML but completely empty.
    if sec_search_table.text == '':
        return no_filing

    # Row 0 is the header; row 1 is the most recent 10-K before March 1st.
    result_rows = sec_search_table.find('tr')
    if len(result_rows) < 2:
        return no_filing

    # The filing-index link is not always at the same position in the row,
    # so pick the first link that contains the keyword 'Archives'.
    filepage_link = next(
        (link for link in result_rows[1].absolute_links if 'Archives' in link),
        None,
    )
    if filepage_link is None:
        # Previously this case fell through with `filepage_link` unbound and
        # raised UnboundLocalError; treat it like any other missing filing.
        return no_filing

    form_10K_filepage = session.get(filepage_link)

    # The first table of the filing index lists the documents; its second
    # row contains the link to the web-based 10-K.
    index_tables = form_10K_filepage.html.find('table')
    if not index_tables:
        return no_filing
    index_rows = index_tables[0].find('tr')
    if len(index_rows) < 2:
        return no_filing
    document_links = list(index_rows[1].absolute_links)
    if not document_links:
        return no_filing
    link = document_links[0]

    # Some URLs carry this keyword, which switches to a different text viewer
    # and keeps the 10-K from being read properly, so strip it.
    return link.replace('ix?doc=/', '')

# Resolve a 10-K URL (or the '-1' placeholder) for every ticker. Results are
# appended in ticker order, so form10Kurls_rough stays index-aligned with
# tickers.
form10Kurls_rough = []
last_print = 0
total = len(tickers)
for i, symbol in enumerate(tickers):
    # Report progress at each new multiple of ten percent; the last_print
    # gap stops the same percentage from being printed more than once.
    prog = math.trunc(i / total * 100)
    reached_decile = 10 <= prog < 100 and prog % 10 == 0
    if reached_decile and i > last_print + 10:
        print('%d percent of urls fetched.' % prog)
        last_print = i
    form10Kurls_rough.append(get10K(symbol))
print('Done!')

10 percent of urls fetched.


KeyboardInterrupt: 

In [None]:
# Split the raw results: real URLs go into form10Kurls, while the positions of
# the '-1' placeholders (no usable 10-K found) are recorded in bad_indicies.
form10Kurls = []
bad_indicies = []
for position, candidate in enumerate(form10Kurls_rough):
    if candidate == '-1':
        bad_indicies.append(position)
    else:
        form10Kurls.append(candidate)
# Print every surviving link with its index for a visual check.
for number, url_10k in enumerate(form10Kurls):
    print(number, url_10k)

0 https://www.sec.gov/Archives/edgar/data/66740/000155837020000581/mmm-20191231x10k62bf35.htm
1 https://www.sec.gov/Archives/edgar/data/1800/000110465920023904/abt-20191231x10k59d41b.htm
2 https://www.sec.gov/Archives/edgar/data/1551152/000155115220000007/abbv-20191231x10k.htm
3 https://www.sec.gov/Archives/edgar/data/815094/000156459019020329/abmd-10k_20190331.htm
4 https://www.sec.gov/Archives/edgar/data/1467373/000146737319000339/acn831201910k.htm
5 https://www.sec.gov/Archives/edgar/data/718877/000071887720000003/atvi-12312019x10xk.htm
6 https://www.sec.gov/Archives/edgar/data/796343/000079634320000013/adbe10kfy19.htm
7 https://www.sec.gov/Archives/edgar/data/2488/000000248820000008/amdform10-kfy2019.htm
8 https://www.sec.gov/Archives/edgar/data/1158449/000115844920000035/aap10k12282019secreport.htm
9 https://www.sec.gov/Archives/edgar/data/874761/000087476120000012/a2019form10-k.htm
10 https://www.sec.gov/Archives/edgar/data/4977/000000497720000044/afl12311910k.htm
11 https://www.

In [None]:
# Number of tickers for which no 10-K URL was kept.
len(tickers) - len(form10Kurls)

6

In [None]:
# Show each ticker without a usable 10-K next to its index in `tickers`.
for missing_at in bad_indicies:
    print('%d\t%s' % (missing_at, tickers[missing_at]))

44	APA
78	BF-B
88	CARR
192	FRC
357	OTIS
474	VTRS


In [None]:
# Keep, in their original order, only the tickers whose index was not
# recorded in bad_indicies.
good_firms = [
    symbol
    for position, symbol in enumerate(tickers)
    if position not in bad_indicies
]
print(len(good_firms))

499


In [None]:
# Directory that will hold the downloaded 10-K files; created if it is missing.
folder = 'text_files'
os.makedirs(folder, exist_ok=True)

def download(url, path):
    """Fetch ``url`` and write the response body to ``path`` as bytes.

    Args:
        url: Address to download; redirects are followed.
        path: Destination file path, opened in binary write mode (an
            existing file is overwritten).

    NOTE(review): the HTTP status is not checked, so an error page would be
    saved like any other body. This matches the previous behaviour.
    """
    # Context managers release the connection and the file handle even if the
    # transfer fails part-way; the earlier version left both open.
    with requests.get(url, allow_redirects=True, stream=True) as req:
        with open(path, 'wb') as out_file:
            # Stream in chunks so the whole document is never held in memory.
            for chunk in req.iter_content(chunk_size=65536):
                out_file.write(chunk)

# Builds the on-disk location for a download so that every file gets a unique
# name and lands in the shared `folder` directory.
def makePath(filename):
    """Return the path of ``filename`` + '.html' inside the module-level ``folder``."""
    html_name = filename + '.html'
    return os.path.join(folder, html_name)

# Walk the *rough* list so that each index still lines up with `tickers`;
# indices recorded in bad_indicies have no 10-K and are skipped.
last_print = 0
for i, rough_url in enumerate(form10Kurls_rough):
    # Report progress at each new multiple of ten percent.
    prog = math.trunc(i / len(tickers) * 100)
    if prog in range(10, 100, 10) and i > last_print + 10:
        print('%d percent of files downloaded.' % prog)
        last_print = i
    # Nothing to download for a firm without a usable 10-K.
    if i in bad_indicies:
        continue
    # Name the file after the ticker so the 10-K and its firm stay paired.
    download(rough_url, makePath(tickers[i] + '_10K'))
print('Done!')

10 percent of files downloaded.
20 percent of files downloaded.
30 percent of files downloaded.
40 percent of files downloaded.
50 percent of files downloaded.
60 percent of files downloaded.
70 percent of files downloaded.
80 percent of files downloaded.
90 percent of files downloaded.
Done!


In [None]:
# Save the ticker / 10-K URL pairs to inputs/sp500_10K_with_url.csv.
os.makedirs('inputs', exist_ok=True)
# Use the cleaned form10Kurls list here: it and good_firms were both built by
# skipping the same bad indices, so they pair up row by row.
d = {'Symbol': good_firms, 'url': form10Kurls}
# to_csv returns None when it is given a path, so keep the DataFrame in `df`
# and write it out as a separate step instead of binding `df` to None.
df = pd.DataFrame(d)
df.to_csv('inputs/sp500_10K_with_url.csv', index=False)