In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

In [2]:
# CIKs (kept as zero-padded strings) of the companies whose filings are pulled below.
cik_list = [
    '0001786874',
]

In [3]:
# functions to scrape and parse filings


# returns pandas dataframe with filing information by cik
def cik_pull(cik):
    """Fetch the EDGAR filing index for one company as a DataFrame.

    A single request is made to the ``browse-edgar`` endpoint with
    ``output=atom`` and ``count=100``; no pagination is performed, so at most
    one page of entries is returned.

    Parameters
    ----------
    cik : str
        Central Index Key, sent as the ``CIK`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per ``<entry>`` of the feed with columns ``file_type``,
        ``file_date``, ``acc_num``, ``url`` and ``acc_num_url`` (the accession
        number with dashes stripped, for building archive URLs). The frame is
        empty, but still has these columns, when the feed has no entries.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.Timeout
        If the server does not answer within the timeout.
    AttributeError
        If an entry lacks one of the expected child tags.
    """
    endpoint = "https://www.sec.gov/cgi-bin/browse-edgar"

    # parameters dictionary
    param_dict = {'action': 'getcompany',
                  'CIK': cik,
                  'type': '',
                  'dateb': '',
                  'owner': 'exclude',
                  'start': '',
                  'output': 'atom',
                  'count': '100'}

    # NOTE(review): SEC's fair-access guidance asks automated clients to send a
    # descriptive User-Agent header; none is set here -- confirm whether the
    # default requests User-Agent is still accepted.
    response = requests.get(url=endpoint, params=param_dict, timeout=30)
    # Fail loudly instead of parsing an error page into an empty frame.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # Build all rows first and create the frame once, rather than filling it
    # column by column from parallel lists.
    records = []
    for entry in soup.find_all('entry'):
        acc_num = entry.find('accession-number').text
        records.append({
            'file_type': entry.find('filing-type').text,
            'file_date': entry.find('filing-date').text,
            'acc_num': acc_num,
            'url': entry.find('filing-href').text,
            # this is for simplicity dealing with urls
            'acc_num_url': acc_num.replace('-', ''),
        })

    # Passing columns explicitly keeps the schema stable even with zero entries.
    columns = ['file_type', 'file_date', 'acc_num', 'url', 'acc_num_url']
    return pd.DataFrame(records, columns=columns)

# returns dataframe of an individual filing as 1 row, and however many parsed fields as columns
def idv_document_parse(acc_num, cik_num=None):
    """Download one filing's ``primary_doc.xml`` and flatten it to a one-row frame.

    Parameters
    ----------
    acc_num : str
        Accession number without dashes, as used in the archive URL path.
    cik_num : str, optional
        CIK of the filer. When omitted, the notebook-level global ``cik`` is
        used so that existing one-argument calls keep working; prefer passing
        it explicitly to avoid depending on hidden kernel state.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are the tag names found in the parsed
        document (duplicates are kept, in document order) and whose values are
        each tag's ``.string`` as a plain ``str``, or ``None`` when the tag has
        no single string child.

    Raises
    ------
    requests.exceptions.HTTPError
        If the document cannot be fetched (4xx/5xx status).
    NameError
        If ``cik_num`` is omitted and no global ``cik`` exists.
    """
    if cik_num is None:
        # Legacy behaviour: fall back to the global set by the driver loop.
        cik_num = cik

    root_url = 'https://www.sec.gov/Archives/edgar/data/'
    xml_end = 'primary_doc.xml'
    target_url = f'{root_url}{cik_num}/{acc_num}/{xml_end}'

    file = requests.get(target_url, timeout=30)
    # Without this check an HTTP error page would be parsed as if it were the filing.
    file.raise_for_status()
    file_data = BeautifulSoup(file.content, 'lxml')

    tag_list = []
    data_list = []
    for node in file_data.find_all():
        tag_list.append(node.name)
        # Convert NavigableString to str so the frame does not keep the whole
        # parse tree alive through the string's back-references.
        data_list.append(None if node.string is None else str(node.string))

    # Build the row directly; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame([data_list], columns=tag_list)

# given a cik, returns dictionary with key value pair of unique file type - dataframe of parsed documents of that type
def all_files_parse(cik):
    """Parse every listed filing of a company, grouped by filing type.

    Parameters
    ----------
    cik : str
        Central Index Key of the company.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Maps each distinct filing type to a frame with one row per filing
        whose ``primary_doc.xml`` was fetched successfully. Filings that fail
        with an HTTP error are reported on stdout and skipped; a type with no
        successful filings maps to an empty frame.
    """
    filings_df = cik_pull(cik)

    file_dict = {}
    for file_type in filings_df['file_type'].unique():
        print(file_type)
        same_type = filings_df['file_type'] == file_type

        # Collect the per-filing frames and concatenate once; growing a frame
        # inside the loop is quadratic and relied on the removed .append().
        frames = []
        for acc_num_url in filings_df.loc[same_type, 'acc_num_url']:
            try:
                sub = idv_document_parse(str(acc_num_url), cik)
            except requests.exceptions.HTTPError as err:
                print(f'    skipped {acc_num_url}: {err}')
                continue
            # note that this will nuke the 50+ columns of jurisdiction offered.
            frames.append(sub.loc[:, ~sub.columns.duplicated()])

        if frames:
            # fillna(np.nan) normalises None values to NaN, as before.
            target_df = pd.concat(frames).fillna(np.nan)
        else:
            target_df = pd.DataFrame()
        file_dict[f'{file_type}'] = target_df
    return file_dict

In [None]:
%%time

# Parse all filings for every CIK and print the filing types found per company.
company_data_dict = {}

for cik in cik_list:
    iter_dict = all_files_parse(cik)
    company_data_dict[cik] = iter_dict
    print('-' * 64)
    # The closing quote was missing here, which made the whole cell a SyntaxError.
    print(f'{cik}')
    for file_type in iter_dict:
        print(f'#   {file_type}')
    