In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import os
import time

In [10]:
url = 'https://www.sec.gov/cgi-bin/current?'
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
            }
response = requests.get(url = url, headers = headers)
soup = BeautifulSoup(response.content)

In [11]:
# soup

In [12]:
raw_cik_list = []
for i in soup.findAll('a'):
    raw_cik_list.append(i.text)
raw_cik_list = raw_cik_list[:-1] # removes the last tag, which links to elsewhere on EDGAR

processed_cik_list = []
processed_file_list = []
for x in range(1, len(raw_cik_list), 2):
    processed_cik_list.append(raw_cik_list[x])
for x in range(0, len(raw_cik_list), 2):
    processed_file_list.append(raw_cik_list[x])

print(len(raw_cik_list))
print(len(processed_cik_list))
print(len(processed_file_list))

8824
4412
4412


In [13]:
file_df = pd.DataFrame()

file_df['cik'] = processed_cik_list
file_df['type'] = processed_file_list

file_df['cik'].value_counts()

898745     76
1114446    55
744822     42
831001     36
200245     36
           ..
1709833     1
1625087     1
1825437     1
1725927     1
926678      1
Name: cik, Length: 2587, dtype: int64

In [14]:
file_df['type'].value_counts()

4           1775
424B2        340
NPORT-P      301
8-K          272
D            164
            ... 
25             1
NT N-CEN       1
MA             1
DEL AM         1
15-12G         1
Name: type, Length: 130, dtype: int64

In [15]:
# functions to scrape and parse filings


# returns pandas dataframe with filing information by cik
def cik_pull(cik):
    endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

    # parameters dictionary
    param_dict = {'action':'getcompany',
                  'CIK':cik,
                  'type':'',
                  'dateb':'',
                  'owner':'exclude',
                  'start':'',
                  'output':'atom',
                  'count':'100'}


    response = requests.get(url = endpoint, params = param_dict)
    soup = BeautifulSoup(response.content, 'lxml')

    entries = soup.find_all('entry')

    file_type_list = []
    file_date_list = []
    url_list = []
    acc_num_list = []

    for entry in entries:
        file_type_list.append(entry.find('filing-type').text)
        file_date_list.append(entry.find('filing-date').text)
        acc_num_list.append(entry.find('accession-number').text)
        url_list.append(entry.find('filing-href').text)
    

    filings_df = pd.DataFrame()
    filings_df['file_type'] = file_type_list
    filings_df['file_date'] = file_date_list
    filings_df['acc_num'] = acc_num_list
    filings_df['url'] = url_list


    #this is for simplicity dealing with urls
    temp_list = []
    for i in acc_num_list:
        temp_list.append(i.replace('-', ''))
    filings_df['acc_num_url'] = temp_list
    
    return filings_df

# returns dataframe of an individual filing as 1 row, and however many parsed fields as columns
def idv_document_parse(acc_num):
    root_url = 'https://www.sec.gov/Archives/edgar/data/'
    cik_num = cik
    acc_num_url = acc_num
    xml_end = 'primary_doc.xml'
    target_url = root_url + cik_num + '/' + acc_num_url + '/' + xml_end
    
    file = requests.get(target_url)
    file_data = BeautifulSoup(file.content, 'lxml')
    
    tag_list = []
    data_list = []
    for i in file_data.find_all():
        tag_list.append(i.name)
        data_list.append(i.string)
    file_df = pd.DataFrame(columns = tag_list)
    data_series = pd.Series(data_list, index = file_df.columns)
    file_df = file_df.append(data_series, ignore_index=True)
    file_df['acc_num'] = acc_num_url
    return file_df

# given a cik, returns dictionary with key value pair of unique file type - dataframe of parsed documents of that type
def all_files_parse(cik):
    filings_df = cik_pull(cik)
    
    file_dict = {}
    for file in filings_df['file_type'].unique():
        target_df = pd.DataFrame()
        iter_df = filings_df[filings_df['file_type'] == f'{file}']
        #print(iter_df['file_type'].iloc[0])
    
        for i in range(len(iter_df)):
            sub = idv_document_parse(str(iter_df['acc_num_url'].iloc[i]))
            sub = sub.loc[:, ~sub.columns.duplicated()] #note that this will nuke the 50+ columns of jurisdiction offered.
            target_df = target_df.append(sub).fillna(np.nan)
        file_dict[f'{file}'] = target_df
    return file_dict

In [8]:
%%time

company_data_dict = {}
counter = 0
for cik in processed_cik_list:
    iter_dict = all_files_parse(cik)
    company_data_dict[cik] = iter_dict
    print('-'*64)
    print(f'{cik} - {counter}')
    counter += 1
    for file in iter_dict.keys():
           print(f'#   {file}')
    time.sleep(1)
print(len(company_data_dict))

----------------------------------------------------------------
1821595 - 0
#   8-K
#   SC 13G
#   424B4
#   CERT
#   EFFECT
#   8-A12B
#   S-1/A
#   S-1
#   DRS
----------------------------------------------------------------
1770787 - 1
#   10-Q
#   8-K
#   S-3ASR
#   424B4
#   EFFECT
#   S-1MEF
#   CORRESP
#   S-1
#   UPLOAD
#   DRS
#   SC 13G/A
#   DEFA14A
#   DEF 14A
#   10-K
#   SC 13G
#   SC 13D
#   424B3
#   S-8
#   FWP
#   S-1/A
#   CERT
#   8-A12B
#   SEC STAFF LETTER
#   DRSLTR
#   DRS/A
#   D
----------------------------------------------------------------
1831518 - 2
#   D
----------------------------------------------------------------
1404123 - 3
#   10-Q
#   8-K
#   SC 13G/A
#   SC 13G
#   424B4
#   EFFECT
#   S-1MEF
#   CORRESP
#   S-1
#   UPLOAD
#   DRS
#   10-K
#   S-8
#   CERT
#   8-A12B
#   S-1/A
#   SEC STAFF LETTER
#   DRSLTR
#   DRS/A
#   D
#   REGDEX
----------------------------------------------------------------
1748680 - 4
#   N-23C3A
#   NPORT-P
#   APP OR

SSLError: HTTPSConnectionPool(host='www.sec.gov', port=443): Max retries exceeded with url: /Archives/edgar/data/66740/000155837018005773/primary_doc.xml (Caused by SSLError(SSLError("bad handshake: SysCallError(10060, 'WSAETIMEDOUT')")))