In [1]:
import requests
from itertools import islice
import time
from bs4 import BeautifulSoup

## The below cell:

- Takes a list of Central Index Keys ("CIK"), which are numeric identifiers that identify a particular SEC filing entity.
- Uses the CIK to construct a starting URL that identifies a company's "13F-HR" filings, which contain the data of interest.
     - The first part of the URL, "url_first", has the root of the URL and expects a CIK
     - The CIK identifies the company
     - url_second specifies the "type" of filing as "13F-HR", leaves the dateb and owner params null, and needs to be fed a "start" param.
     - the value "num" in the 2nd for loop supplies the "start=" param
     - url_third supplies the number of table rows to display.  It can be any of [10, 20, 40, 80, 100].  

## Some additional notes:
- I picked 40 as my "count" param rather arbitrarily.
- I set a limit on the "for num in range" for loop of 280 since I expect fewer than 200 total results for any given SEC filer and I didn't want to spam the SEC servers and get blocked.
- I imported the time module and added `time.sleep(1)` to force the requesting part of the loop to wait 1 second between requests for the same reason.

In [2]:
# Build the EDGAR "browse" URL for each filer and page through its filing list,
# collecting the href of every row whose filing type is exactly '13F-HR'.
url_first = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK='
CIK = ['0001564702', '0001067983']
url_second = '&type=13F-HR%25&dateb=&owner=exclude&start='
url_third = '&count=40'
# SEC asks automated clients to identify themselves via the User-Agent header.
# TODO: replace the placeholder with your own name and contact e-mail.
request_headers = {'User-Agent': 'Your Name your.email@example.com'}
doclink = []

for cik in CIK:
    for num in range(0, 280, 40):
        url = url_first + cik + url_second + str(num) + url_third
        # Wait between requests so we do not hammer the SEC servers.
        time.sleep(1)
        # Certificate verification is left at its (enabled) default, and a
        # timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, headers=request_headers, timeout=30)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        file_table = soup.find('table', {'class': 'tableFile2'})
        if file_table is None:
            # No results table on this page: nothing more to read for this CIK.
            break

        data_rows = 0
        for row in file_table.find_all('tr'):
            cells = row.find_all('td')
            if not cells:
                # Rows without <td> cells (e.g. a header row) carry no filing.
                continue
            data_rows += 1
            # Exact match only: other filing types returned by the query
            # are skipped.
            if len(cells) < 2 or cells[0].text != '13F-HR':
                continue
            anchor = cells[1].find(href=True)
            if anchor is not None:
                doclink.append(anchor.get('href'))

        if data_rows == 0:
            # An empty table means we have paged past the last filing.
            break



## Once the above cell is run, the output is a very long list of links to go through.  Within each link I will need to pull some values into a data structure.

- <span class="companyName"> contains the name of the filer.  While not strictly needed for the output of the project, this might be useful for later analytics.
- I also want to be able to grab the value known as "Period of Report".  This is available from scraping each of the links in "doclink," but it looks like it will be a pain to grab from there.
- The most important piece to grab from each link is the filing itself.  For filings before June, 2013, the data is only available in a text file.  For filings made June 2013 and after, the data I care about is available in both text and XML.
    - Since the text file is in the same format during both time periods, I will likely write a script that is only capable of parsing the text files.

In [8]:
# Work on just the first two filing links while prototyping the next step.
experiment = list(islice(doclink, 2))
experiment

['/Archives/edgar/data/1564702/000156470218000008/0001564702-18-000008-index.htm',
 '/Archives/edgar/data/1564702/000156470218000005/0001564702-18-000005-index.htm']

In [13]:
# Fetch each sample filing-index page and show a short preview of its HTML so
# the structure can be inspected. `soup` keeps the last parsed page afterwards.
xml_link = []  # not populated yet; TODO fill once the link extraction is written
for link in experiment:
    url = 'https://www.sec.gov' + link
    # Wait between requests so we do not hammer the SEC servers.
    time.sleep(1)
    # SEC asks automated clients to identify themselves via the User-Agent
    # header. TODO: replace the placeholder with your own name and e-mail.
    # Certificate verification is left at its (enabled) default.
    response = requests.get(
        url,
        headers={'User-Agent': 'Your Name your.email@example.com'},
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    # Print only the start of the document; the full page floods the output.
    print(str(soup)[:1500])




<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>EDGAR Filing Documents for 0001564702-18-000008</title>
<link href="/include/interactive.css" rel="stylesheet" type="text/css"/>
</head>
<body style="margin: 0">
<!-- SEC Web Analytics - For information please visit: https://www.sec.gov/privacy.htm#collectedinfo -->
<noscript><iframe height="0" src="//www.googletagmanager.com/ns.html?id=GTM-TD3BKV" style="display:none;visibility:hidden" width="0"></iframe></noscript>
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'//www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-TD3BKV');</scrip



In [7]:
# Thirteen independent, initially empty accumulator lists
# (presumably one per output column of the parsed filings -- TODO confirm).
(
    mgr_name,
    year,
    quarter,
    sh_name,
    sh_class,
    cusip,
    val1000,
    shares,
    sh_prin_type,
    discretion,
    othermgr,
    sole,
    shared,
) = ([] for _ in range(13))

In [115]:
# From the most recently parsed page (`soup`), collect the href found in the
# second cell of every table row whose first cell reads exactly '13F-HR'.
file_table = soup.find('table', {'class': 'tableFile2'})
# soup.find returns None when the page has no such table; treat that as
# "no rows" rather than failing with AttributeError.
rows = file_table.find_all('tr') if file_table is not None else []
link = []
for row in rows:
    cells = row.find_all('td')
    # Skip rows without at least two <td> cells, and any other filing type.
    if len(cells) < 2 or cells[0].text != '13F-HR':
        continue
    anchor = cells[1].find(href=True)
    if anchor is not None:
        link.append(anchor.get('href'))