In [1]:
import requests
import time
from bs4 import BeautifulSoup
import pandas as pd

## The below two code cells:

- Takes a list of Central Index Keys ("CIK"), which are numeric identifiers that identify a particular SEC filing entity.


- Uses the CIK to construct a  URL that identifies a company's "13F-HR" filings, which contain the data of interest.
     
     - The first part of the URL, "url_first" has the root of the URL and expects a CIK
     - The CIK identifies the company
     - url_second specifies the "type" of filing as "13F-HR' and supplies other necessary parameters, such as a count of the number of rows to show.
     

## Some additional notes:
- I originally had a three-part url designed to pull more filings, but it caused errors and ended up being unnecessary since all of the filings of interest for each company appeared on the first page.  This is because prior to 2013, the filings were made in plain .txt files.

    - If I wanted to expand upon this project, I would need to pull in _all_ filings, which would require cutomizing a function to read the text filings as well as some error checking on the first cell below that pulls in all of the "doclink" values.
    
    
- I imported the time module and added `time.sleep()` calls to force the requesting part of the loop to wait between requests so as not to spam the SEC.

In [2]:
url_first = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK='
url_second = '&type=13F-HR%25&dateb=&owner=exclude&start=0&count=40'
doclink = []
df = pd.DataFrame(columns=['manager', 'filing_date', 'report_date', 'sh_name', 'sh_class', 'cusip', 'val1000', 'share_count', 
                                 'share_or_prin', 'discretion','sole_vote_amt', 'shared_vote_amt'])

sh_name = []
sh_class = []
cusip = []
val1000 = []
share_count = []
share_or_prin = []
discretion = []
sole_vote_amt = []
shared_vote_amt = []
mgr = []
filing_date = []
report_date = []

CIK = [
    '0001350694', #Bridgewater Assoc's
    '0001167557', #AQR Capital Mgt.
    '0001037389', #Rennaissance Tech's
    '0001179392', #Two Sigma Investments
    '0001048445', #Elliot Mgt. Corp.
    '0001061768', #Baupost Group
    '0000937617', #Davidson Kempner Cap. Mgt.
    '0001165408', #Adage Capital Partners
    '0001067983', #Berkshire Hathaway
    '0001564702', #PDT Partners 
    '0000921669', #Carl Icahn
    '0001167483', #Tiger Global Management 
    '0001665241', #Schonfeld Strategic Advisors
    '0000038777', #Franklin Templeton (Franklin Resources, Inc.)
    '0001318757', #Marshall Wace
    '0001423053', #Citadel
    '0001218199', #King Street Capital
    '0001480532', #York Capital Mgt.
    '0001336528', #Pershing Square
    '0001040273', #Third Point
]
        

                

### verify=False only on  in the "requests" calls below for when I run this at work, since there are custom certificates that mess with SSL.

## The cell below does what I want it to, which is go through and grab the documents link for every filing labled "13F-HR".

- I'm aware that my "for row" loop is probably very inelegant and likely inefficient.

In [3]:
# CIK = ['0001564702', '0001067983', '0001167483']
for cik in CIK:
    url = url_first+cik+url_second
    time.sleep(.33)
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'lxml')
    mgr_name = str.strip(soup.find('span', {'class': 'companyName'}).text.split('CIK')[0])
    file_table = soup.find('table', {'class':'tableFile2'})
    rows = file_table.find_all('tr')

    for row in rows:
        cells = row.find_all('td')
        for cell in cells[0:1]:
            if cells[0].text == '13F-HR':
                doclink.append(cells[1].find(href=True).get('href'))
                    



## The next two cells below loop through every "doclink" grabbed in the cell above and do the following:

- Populates the filing date in a list ((i.e., if its a Q1 filing, "05-15-2018")
- Populates the "period of report" (i.e., if its a Q1 filing, "03-30-2018")
- Populates the name of the company making the filing
- Grabs all of the "infotable.xml" links, parse them, and store the data in lists
- Assigns the lists to columns in a dataframe

In [5]:
# info_table_link = []  Maybe store the links here and loop through them along with the below?
# primary_doc_link = [] Maybe store the xml link to the "primary doc xml" file and use it pull manager, filing date, and 
# period of report?

for link in doclink:
    url = 'https://www.sec.gov' + link
    time.sleep(.45)
    response = requests.get(url, verify=False)
    filing_soup = BeautifulSoup(response.text, 'lxml')

    mgr_name = str.strip(filing_soup.find('span', {'class': 'companyName'}).text.split('(')[0])
    
    form_grouping = filing_soup.find_all('div', {'class': 'info'})
    report_dt = form_grouping[3].text
    filing_dt = form_grouping[4].text

    cells = filing_soup.find_all('td')
    for cell in cells:

#             if cell.text == 'primary_doc.xml':
#                 response = requests.get('https://www.sec.gov' + cell.find(href=True).get('href'),verify=False)
#                 fsoup = BeautifulSoup(response.text, "lxml")
#                 report_dt= fsoup.signaturedate.text
#                 filing_dt= fsoup.periodofreport.text

        if (cell.text == 'form13fInfoTable.xml') | (cell.text == 'infotable.xml'):
            response = requests.get('https://www.sec.gov' + cell.find(href=True).get('href'), verify=False)
            soup = BeautifulSoup(response.text, "lxml")
            record = soup.find_all('infotable')
            for rec in record:
                mgr.append(mgr_name)
                report_date.append(report_dt)
                filing_date.append(filing_dt)
                sh_name.append(rec.find('nameofissuer').text)
                sh_class.append(rec.find('titleofclass').text)
                cusip.append(rec.find('cusip').text)
                val1000.append(rec.find('value').text)
                share_count.append(rec.find('sshprnamt').text)
                share_or_prin.append(rec.find('sshprnamttype').text)
                discretion.append(rec.find('investmentdiscretion').text)
                sole_vote_amt.append(rec.find('sole').text)
                shared_vote_amt.append(rec.find('shared').text)













































































In [94]:
#I accidentally ran this cell after opening the workbook a second time--obviously it works since I have a dataframe with 
#about 400K rows.
df['manager'] = mgr
df['filing_date'] = filing_date
df['report_date']= report_date
df['sh_name'] = sh_name
df['sh_class'] = sh_class
df['share_count'] = share_count
df['cusip'] = cusip
df['val1000'] = val1000
df['share_or_prin'] = share_or_prin
df['discretion'] = discretion
df['sole_vote_amt'] = sole_vote_amt
df['shared_vote_amt'] = shared_vote_amt


NameError: name 'mgr' is not defined

## Some exploratory data analysis 

In [16]:
df.manager.value_counts()

CITADEL ADVISORS LLC                            153725
FRANKLIN RESOURCES INC                           68589
AQR CAPITAL MANAGEMENT LLC                       60233
TWO SIGMA INVESTMENTS, LP                        29585
TWO SIGMA INVESTMENTS LLC                        23488
PDT Partners, LLC                                17281
ADAGE CAPITAL PARTNERS GP LLC                     7628
ADAGE CAPITAL PARTNERS GP, L.L.C.                 6969
Bridgewater Associates, LP                        6781
Schonfeld Strategic Advisors LLC                  5557
BERKSHIRE HATHAWAY INC                            3052
York Capital Management Global Advisors, LLC      1824
ELLIOTT MANAGEMENT CORP                           1683
TIGER GLOBAL MANAGEMENT LLC                        940
ICAHN CARL C                                       880
Third Point LLC                                    816
BAUPOST GROUP LLC/MA                               681
KING STREET CAPITAL MANAGEMENT, L.P.               275
Pershing S

In [8]:
orig_df = df #Just in case

  
## This data appears to make sense.  A few notes:

- It looks like two of the managers had name changes, I will clean the data below.
- It looks like the larger, more mutual fund-like managers had many more holdings.  This is expected as they likely have multiple submanagers with duplicate holdings.  I don't believe it will have a significant effect on the analysis, but on further study, it would be interesting to see what the strategy's performance would be like without them (or using them as a subgroup).

## It looks like a few managers are missing from the above.  I will quickly investigate to see if I can figure out why.  If not, I think I have enough data here to continue my analysis.

    

### The reason the managers were missing are as follows:

- 0001037389, Rennaissance Technologies:  They name each one of their infotables differently.  This shouldn't be allowed per the SEC's XML schema, but they seem to get away with it.  I won't bother trying to write a custom function to scrape their holdings.

- 0000937617, Davidson Kempner Cap. Mgt.:  All of their infotables are named "Form13FHRInformation.xml".  I re-ran the scraper , but there isn't much in the way of actual holdings.  It's possible I have the wrong CIK for them, but I will not investigate any further.

- 0001318757, Marshall Wace:  Similar to Ren. Tech's, but their naming scheme would be easier to decode.  Nonetheless, I'm not going to bother writing a custom function to scrape their data.  

In [17]:
df[df.manager.str.contains('SIGMA')]

Unnamed: 0,manager,filing_date,report_date,sh_name,sh_class,cusip,val1000,share_count,share_or_prin,discretion,sole_vote_amt,shared_vote_amt
67014,"TWO SIGMA INVESTMENTS, LP",2018-08-14,2018-06-30,21VIANET GROUP INC,SPONSORED ADR,90138a103,4713,485861,SH,SOLE,485861,0
67015,"TWO SIGMA INVESTMENTS, LP",2018-08-14,2018-06-30,2U INC,COM,90214j101,267,3200,SH,SOLE,3200,0
67016,"TWO SIGMA INVESTMENTS, LP",2018-08-14,2018-06-30,2U INC,COM,90214j101,7155,85625,SH,SOLE,85625,0
67017,"TWO SIGMA INVESTMENTS, LP",2018-08-14,2018-06-30,3-D SYS CORP DEL,COM NEW,88554d205,152,11000,SH,SOLE,11000,0
67018,"TWO SIGMA INVESTMENTS, LP",2018-08-14,2018-06-30,3-D SYS CORP DEL,COM NEW,88554d205,262,19000,SH,SOLE,19000,0
67019,"TWO SIGMA INVESTMENTS, LP",2018-08-14,2018-06-30,3M CO,COM,88579y101,47398,240943,SH,SOLE,240943,0
67020,"TWO SIGMA INVESTMENTS, LP",2018-08-14,2018-06-30,3M CO,COM,88579y101,8223,41800,SH,SOLE,41800,0
67021,"TWO SIGMA INVESTMENTS, LP",2018-08-14,2018-06-30,3M CO,COM,88579y101,9836,50000,SH,SOLE,50000,0
67022,"TWO SIGMA INVESTMENTS, LP",2018-08-14,2018-06-30,51JOB INC,SP ADR REP COM,316827104,2911,29814,SH,SOLE,29814,0
67023,"TWO SIGMA INVESTMENTS, LP",2018-08-14,2018-06-30,58 COM INC,SPON ADR REP A,31680q104,24345,351097,SH,SOLE,351097,0


In [18]:
# creating a df of all values that are from Two Sigma (two different manager names) and resetting the manager name
two_sigma = df[df.manager.str.contains('SIGMA')] 
two_sigma.drop('manager', axis=1)
two_sigma['manager'] = 'TWO SIGMA INVESTMENTS'

# dropping the Two Sigma vals from the original df
df = df.drop(df[df.manager.str.contains('SIGMA')].index)

#appending the corrected dataframe back onto the original one
df = df.append(two_sigma) 

#Same thing in a single block of code for Adage
adage = df[df.manager.str.contains('ADAGE CAPITAL')]
adage.drop('manager', axis=1)
adage['manager'] = 'ADAGE CAPITAL PARTNERS GP'
df = df.drop(df[df.manager.str.contains('ADAGE')].index)
df = df.append(adage)


#I'm not positive why the below two errors are thrown, but the code works as expected.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [19]:
df[df.manager.str.contains('SIGMA')].head()

# Success!

Unnamed: 0,manager,filing_date,report_date,sh_name,sh_class,cusip,val1000,share_count,share_or_prin,discretion,sole_vote_amt,shared_vote_amt
67014,TWO SIGMA INVESTMENTS,2018-08-14,2018-06-30,21VIANET GROUP INC,SPONSORED ADR,90138a103,4713,485861,SH,SOLE,485861,0
67015,TWO SIGMA INVESTMENTS,2018-08-14,2018-06-30,2U INC,COM,90214j101,267,3200,SH,SOLE,3200,0
67016,TWO SIGMA INVESTMENTS,2018-08-14,2018-06-30,2U INC,COM,90214j101,7155,85625,SH,SOLE,85625,0
67017,TWO SIGMA INVESTMENTS,2018-08-14,2018-06-30,3-D SYS CORP DEL,COM NEW,88554d205,152,11000,SH,SOLE,11000,0
67018,TWO SIGMA INVESTMENTS,2018-08-14,2018-06-30,3-D SYS CORP DEL,COM NEW,88554d205,262,19000,SH,SOLE,19000,0


In [20]:
df[df.manager.str.contains('ADAGE')].head()

# Success again!

Unnamed: 0,manager,filing_date,report_date,sh_name,sh_class,cusip,val1000,share_count,share_or_prin,discretion,sole_vote_amt,shared_vote_amt
122451,ADAGE CAPITAL PARTNERS GP,2018-08-14,2018-06-30,3M CO,COM,88579Y101,139996,711652,SH,SOLE,711652,0
122452,ADAGE CAPITAL PARTNERS GP,2018-08-14,2018-06-30,AAR CORP,COM,000361105,903,19423,SH,SOLE,19423,0
122453,ADAGE CAPITAL PARTNERS GP,2018-08-14,2018-06-30,ABBOTT LABS,COM,002824100,203502,3336647,SH,SOLE,3336647,0
122454,ADAGE CAPITAL PARTNERS GP,2018-08-14,2018-06-30,ABBVIE INC,COM,00287Y109,94194,1016662,SH,SOLE,1016662,0
122455,ADAGE CAPITAL PARTNERS GP,2018-08-14,2018-06-30,ABEONA THERAPEUTICS INC,COM,00289Y107,36000,2250000,SH,SOLE,2250000,0


In [21]:
df.reset_index(inplace=True)
df = df.drop('index', axis=1)

In [22]:
df.manager.value_counts()

CITADEL ADVISORS LLC                            153725
FRANKLIN RESOURCES INC                           68589
AQR CAPITAL MANAGEMENT LLC                       60233
TWO SIGMA INVESTMENTS                            53073
PDT Partners, LLC                                17281
ADAGE CAPITAL PARTNERS GP                        14597
Bridgewater Associates, LP                        6781
Schonfeld Strategic Advisors LLC                  5557
BERKSHIRE HATHAWAY INC                            3052
York Capital Management Global Advisors, LLC      1824
ELLIOTT MANAGEMENT CORP                           1683
TIGER GLOBAL MANAGEMENT LLC                        940
ICAHN CARL C                                       880
Third Point LLC                                    816
BAUPOST GROUP LLC/MA                               681
KING STREET CAPITAL MANAGEMENT, L.P.               275
Pershing Square Capital Management, L.P.           181
Name: manager, dtype: int64

# Since the webscraper takes a very long time to run, I'm going to output the resulting dataframe to CSV now and will perform all subsequent analysis in a separate notebook.

In [39]:
df.to_csv("AllHoldings.csv", index=False)