## Preliminaries
Initial setup

In [1]:
import http_library
import csv # library to read/write/parse CSV files
from bs4 import BeautifulSoup # web-scraping library

# Path of the text file that holds the CIK codes, one per line
cikPath = 'cik.txt'
# Starts empty; receives the CIK codes read from cikPath
cikList = list()
# Passed as the second argument to http_library.httpGet()
# (presumably the Accept header value -- verify against http_library)
acceptMime = 'text/html'


Open the file containing the list of CIK codes, read it line by line, and build a list of the codes with surrounding whitespace stripped.

In [2]:
# Read the raw lines of the CIK file. The with-block guarantees the file is
# closed even if reading fails (the original left the handle open).
# The names cikFileObject and cikRows are kept in case other cells use them.
with open(cikPath, newline='') as cikFileObject:
    cikRows = cikFileObject.readlines()

# Rebuild cikList from scratch instead of appending, so re-running this cell
# does not duplicate entries. Blank lines (e.g. a trailing newline at the end
# of the file) are skipped so that no empty CIK ends up in a search URL.
cikList = [row.strip() for row in cikRows if row.strip()]

In [3]:
# Show the list of CIK codes that was built from the file
print(cikList)

['0001085917', '0000105598', '0000034088']


## Searching for 10-K forms
Create an empty list that will hold one dictionary per matching search result.

In [4]:
# Accumulates one dictionary per matching filing, with keys 'cik', 'year' and 'uri'.
# Re-running this cell empties the list again.
resultsList = []

Build the search URL, using a query string worked out by experimenting with the EDGAR company search online.

In [5]:
# CIK code of the company whose filings are searched for
cik = '0001085917'
# The query string selects 10-K forms, but it also retrieves forms whose
# type code merely starts with "10-K"; exact matching is left to later filtering.
baseUri = ''.join((
    'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=',
    cik,
    '&type=10-K&dateb=&owner=exclude&start=0&count=40&output=atom',
))
print(baseUri)


https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001085917&type=10-K&dateb=&owner=exclude&start=0&count=40&output=atom


Retrieve the XML document and turn it into a BeautifulSoup object that can be searched for matching filings.

In [None]:
# Fetch the search results; element [1] of httpGet()'s return value is what gets parsed
# (presumably the response body -- verify against http_library).
response = http_library.httpGet(baseUri, acceptMime)
soup = BeautifulSoup(response[1], features="html5lib")

# Filing years to keep
wantedYears = ("2016", "2014")

# The selector keeps only category elements whose term attribute is exactly "10-K".
# select() returns a list of soup objects, each of which can be searched in turn.
# NOTE(review): the find() calls below look inside each category element, which
# depends on how the html5lib parser nests the feed -- confirm before changing parsers.
for cat in soup.select('category[term="10-K"]'):
    # cat.filing-date is not usable: Python reads the hyphen in the tag name as a minus,
    # so find() is used instead. .strings did not work here, so the first child
    # (the string content of the tag) is taken.
    date = cat.find('filing-date').contents[0]
    year = date[:4]
    print(year)
    # dictionary describing this individual result
    searchResults = {
        'cik': cik,
        'year': year,
        'uri': cat.find('filing-href').contents[0],
    }
    # keep the result only when it was filed in one of the wanted years
    if year in wantedYears:
        resultsList.append(searchResults)


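The loop is done; now show the results. (Note: the saved output below lists CIK `0000034088`, not the `0001085917` assigned above, so it comes from a different run. Re-run the notebook from the top to refresh it.)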

In [20]:
# Show the accumulated search results (a list of dictionaries, one per matching filing)
print(resultsList)

[{'cik': '0000034088', 'year': '2016', 'uri': 'http://www.sec.gov/Archives/edgar/data/34088/000003408816000065/0000034088-16-000065-index.htm'}, {'cik': '0000034088', 'year': '2014', 'uri': 'http://www.sec.gov/Archives/edgar/data/34088/000003408814000012/0000034088-14-000012-index.htm'}]


In [21]:
# For every search hit, fetch the page its 'uri' points to and print the link
# found in each table row that is labelled "10-K".
for hitNumber, hit in enumerate(resultsList):
    print(hitNumber)
    # Element [1] of httpGet()'s return value is what gets parsed
    # (presumably the response body -- verify against http_library).
    soup = BeautifulSoup(http_library.httpGet(hit['uri'], acceptMime)[1], features="html5lib")
    for row in soup.select('tr'):
        # A row qualifies when the first child of any of its cells is exactly "10-K".
        # Checking that cell.contents is non-empty replaces the former bare `except:`,
        # which also hid unrelated errors (including KeyboardInterrupt).
        is10k = any(
            cell.contents and cell.contents[0] == "10-K"
            for cell in row.select('td')
        )
        if not is10k:
            continue
        # row.a is the first <a> element inside the row, or None when there is none;
        # rows without a usable link are skipped instead of raising.
        link = row.a
        href = link.get('href') if link is not None else None
        if href is None:
            continue
        print('http://www.sec.gov' + href)


0
http://www.sec.gov/Archives/edgar/data/34088/000003408816000065/xom10k2015.htm
1
http://www.sec.gov/Archives/edgar/data/34088/000003408814000012/xom10k2013.htm
