In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import os

In [2]:
# CIK values (kept as strings) of the companies whose EDGAR filings are pulled below
cik_list = [
    '1824937', '1736388', '1811231', '1446275', '1796036',
    '1824627', '1421744', '1588504', '1627282', '1621653',
    '1826050', '1742055', '1818808',
]

In [3]:
# functions to scrape and parse filings


def cik_pull(cik):
    """Fetch the EDGAR filing feed for one CIK and tabulate its entries.

    Sends a ``getcompany`` request to the EDGAR browse endpoint asking for
    Atom output with ``count`` set to 100, ``owner`` set to ``exclude`` and
    no type/date filter, then reads four fields from every ``<entry>``.

    Parameters
    ----------
    cik
        Value sent as the ``CIK`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per feed entry, with columns ``file_type``, ``file_date``,
        ``acc_num``, ``url`` and ``acc_num_url`` (``acc_num`` with the dashes
        stripped, which is simpler to splice into archive URLs).
    """
    query = {
        'action': 'getcompany',
        'CIK': cik,
        'type': '',
        'dateb': '',
        'owner': 'exclude',
        'start': '',
        'output': 'atom',
        'count': '100',
    }

    # NOTE(review): the request is sent without a timeout or an explicit
    # User-Agent header -- confirm this still satisfies SEC's access rules.
    feed = requests.get(url=r"https://www.sec.gov/cgi-bin/browse-edgar", params=query)
    feed_entries = BeautifulSoup(feed.content, 'lxml').find_all('entry')

    def field_values(tag_name):
        # text of the named child element, one value per feed entry
        return [feed_entry.find(tag_name).text for feed_entry in feed_entries]

    accession_numbers = field_values('accession-number')
    column_values = {
        'file_type': field_values('filing-type'),
        'file_date': field_values('filing-date'),
        'acc_num': accession_numbers,
        'url': field_values('filing-href'),
        # this is for simplicity dealing with urls
        'acc_num_url': [number.replace('-', '') for number in accession_numbers],
    }

    # columns are attached one at a time to an initially empty frame
    filings_df = pd.DataFrame()
    for column_name, values in column_values.items():
        filings_df[column_name] = values
    return filings_df

def idv_document_parse(acc_num, cik=None):
    """Download one filing's ``primary_doc.xml`` and flatten it into one row.

    Parameters
    ----------
    acc_num : str
        Accession number with the dashes already removed, as it appears in
        the EDGAR archive path.
    cik : str or int, optional
        CIK used as the directory name in the archive path. When omitted,
        the module-level ``cik`` variable is used so that existing
        one-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per element found in the parsed
        document (column name = tag name, value = the element's ``.string``)
        plus an ``acc_num`` column. A tag that occurs more than once produces
        duplicate column names.

    Raises
    ------
    NameError
        If ``cik`` is not passed and no module-level ``cik`` exists.
    """
    if cik is None:
        # Backward-compatible fallback: the function used to read the
        # caller's global loop variable instead of taking the CIK as input.
        try:
            cik = globals()['cik']
        except KeyError:
            raise NameError("name 'cik' is not defined") from None

    root_url = 'https://www.sec.gov/Archives/edgar/data/'
    xml_end = 'primary_doc.xml'
    target_url = root_url + str(cik) + '/' + acc_num + '/' + xml_end

    # The HTTP status is deliberately not checked: whatever body comes back
    # is parsed, exactly as before.
    response = requests.get(target_url)
    file_data = BeautifulSoup(response.content, 'lxml')

    elements = file_data.find_all()
    tag_list = [element.name for element in elements]
    data_list = [element.string for element in elements]

    # Build the one-row frame directly; DataFrame.append was removed in pandas 2.0.
    file_df = pd.DataFrame([data_list], columns=tag_list)
    file_df['acc_num'] = acc_num
    return file_df


def all_files_parse(cik):
    """Parse every filing listed for one CIK, grouped by filing type.

    Parameters
    ----------
    cik : str
        CIK passed to ``cik_pull`` and to ``idv_document_parse``.

    Returns
    -------
    dict
        Maps each unique ``file_type`` returned by ``cik_pull`` to a
        DataFrame holding one parsed row per filing of that type.
    """
    filings_df = cik_pull(cik)

    file_dict = {}
    for file_type in filings_df['file_type'].unique():
        filings_of_type = filings_df[filings_df['file_type'] == file_type]

        parsed_rows = []
        for acc_num_url in filings_of_type['acc_num_url']:
            # pass the CIK explicitly rather than relying on a global of the same name
            sub = idv_document_parse(str(acc_num_url), cik=cik)
            # Keep only the first occurrence of each tag so the rows can be
            # stacked; note that this will nuke the 50+ columns of
            # jurisdiction offered.
            parsed_rows.append(sub.loc[:, ~sub.columns.duplicated()])

        # One concat per filing type instead of re-copying the accumulated
        # frame on every append.
        file_dict[f'{file_type}'] = pd.concat(parsed_rows, sort=False).fillna(np.nan)
    return file_dict

In [4]:
%%time

company_data_dict = {}

for cik in cik_list:
    iter_dict = all_files_parse(cik)
    company_data_dict[cik] = iter_dict
    print('-'*64)
    print(f'{cik}')
    for file in iter_dict.keys():
          print(f'#   {file}')
    

----------------------------------------------------------------
1824937
#   C
----------------------------------------------------------------
1736388
#   C/A
#   C-AR
#   C
#   D
#   C-U
#   C-AR/A
----------------------------------------------------------------
1811231
#   D
#   C/A
#   C
----------------------------------------------------------------
1446275
#   1-SA
#   253G2
#   QUALIF
#   CORRESP
#   UPLOAD
#   1-A/A
#   C-AR/A
#   1-A
#   C-AR
#   C-U
#   D
#   C
#   D/A
#   C/A
#   REGDEX
----------------------------------------------------------------
1796036
#   C/A
#   C-AR
#   C
----------------------------------------------------------------
1824627
#   C
----------------------------------------------------------------
1421744
#   253G2
#   QUALIF
#   CORRESP
#   1-A/A
#   UPLOAD
#   1-A
#   C-AR
#   D
#   C-U
#   C
#   D/A
#   REGDEX
----------------------------------------------------------------
1588504
#   1-A-W
#   1-SA
#   253G2
#   QUALIF
#   CORRESP
#   1-A/A
#  

In [5]:
# Show which CIKs ended up as keys of company_data_dict
company_data_dict.keys()

dict_keys(['1824937', '1736388', '1811231', '1446275', '1796036', '1824627', '1421744', '1588504', '1627282', '1621653', '1826050', '1742055', '1818808'])

In [6]:
# Regroup the parsed filings by filing type instead of by company: filing_dict
# maps each filing type to one DataFrame holding that type's rows from every
# CIK. Same data as company_data_dict, but easier to slice into tables.

# all unique file types across all ciks pulled
all_filing_types = list({
    file_type
    for company_files in company_data_dict.values()
    for file_type in company_files
})

# Gather the per-company frames first and concatenate once per type:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every call.
frames_by_type = {file_type: [] for file_type in all_filing_types}
for company_files in company_data_dict.values():
    for file_type, type_df in company_files.items():
        frames_by_type[file_type].append(type_df)

filing_dict = {
    file_type: pd.concat(frames, sort=False)
    for file_type, frames in frames_by_type.items()
}

filing_dict['C'].head()

Unnamed: 0,html,body,edgarsubmission,headerdata,submissiontype,filerinfo,filer,filercredentials,filercik,filerccc,...,issuer,issuertitle,signaturepersons,signatureperson,personsignature,persontitle,signaturedate,acc_num,crdnumber,com:street2
0,,,,,C,,,,1824937,XXXXXXXX,...,"Shacksbury Holdings, Inc.","President, CEO, Principal Financial Officer, C...",,,Colin Davis,"President, CEO, Principal Financial Officer, C...",09-29-2020,166516020001215,,
0,,,,,C,,,,1736388,XXXXXXXX,...,"R3 Printing, Inc.",Director; Treasurer; Head of Product; Chief Ex...,,,Petra Wood,Director; Vice President; Head of Growth; Chie...,03-17-2020,166516020000248,,
0,,,,,C,,,,1736388,XXXXXXXX,...,"R3 Printing, Inc.",President,,,Daniel Downs,Director,05-20-2019,173638819000005,283874.0,
0,,,,,C,,,,1736388,XXXXXXXX,...,"R3 Printing, Inc.",President,,,Daniel Downs,Director,04-11-2018,173638818000001,283874.0,
0,,,,,C,,,,1811231,XXXXXXXX,...,"Solectrac, Inc.","CEO, Principal Executive Officer and Director,...",,,Stephen Heckeroth,"CEO, Principal Executive Officer and Director,...",05-28-2020,166516020000670,,


In [7]:
# these next couple cells are temporary. Just wanted quick access to the index
# enumerate() supplies each column's position, so no manual counter is needed
for position, column_name in enumerate(filing_dict['C'].columns):
    print(f'index {position} - {column_name}')

index 0 - html
index 1 - body
index 2 - edgarsubmission
index 3 - headerdata
index 4 - submissiontype
index 5 - filerinfo
index 6 - filer
index 7 - filercredentials
index 8 - filercik
index 9 - filerccc
index 10 - livetestflag
index 11 - flags
index 12 - confirmingcopyflag
index 13 - returncopyflag
index 14 - overrideinternetflag
index 15 - formdata
index 16 - issuerinformation
index 17 - issuerinfo
index 18 - nameofissuer
index 19 - legalstatus
index 20 - legalstatusform
index 21 - legalstatusotherdesc
index 22 - jurisdictionorganization
index 23 - dateincorporation
index 24 - issueraddress
index 25 - com:street1
index 26 - com:city
index 27 - com:stateorcountry
index 28 - com:zipcode
index 29 - issuerwebsite
index 30 - companyname
index 31 - commissioncik
index 32 - commissionfilenumber
index 33 - offeringinformation
index 34 - compensationamount
index 35 - financialinterest
index 36 - securityofferedtype
index 37 - securityofferedotherdesc
index 38 - noofsecurityoffered
index 39 - p

In [8]:
# Temporary: list each column of the 'C/A' table next to its position.
# enumerate() supplies the position, so no manual counter is needed.
for position, column_name in enumerate(filing_dict['C/A'].columns):
    print(f'index {position} - {column_name}')

index 0 - html
index 1 - body
index 2 - edgarsubmission
index 3 - headerdata
index 4 - submissiontype
index 5 - filerinfo
index 6 - filer
index 7 - filercredentials
index 8 - filercik
index 9 - filerccc
index 10 - filenumber
index 11 - livetestflag
index 12 - flags
index 13 - confirmingcopyflag
index 14 - returncopyflag
index 15 - overrideinternetflag
index 16 - formdata
index 17 - issuerinformation
index 18 - isamendment
index 19 - natureofamendment
index 20 - issuerinfo
index 21 - nameofissuer
index 22 - legalstatus
index 23 - legalstatusform
index 24 - legalstatusotherdesc
index 25 - jurisdictionorganization
index 26 - dateincorporation
index 27 - issueraddress
index 28 - com:street1
index 29 - com:city
index 30 - com:stateorcountry
index 31 - com:zipcode
index 32 - issuerwebsite
index 33 - companyname
index 34 - commissioncik
index 35 - commissionfilenumber
index 36 - offeringinformation
index 37 - compensationamount
index 38 - financialinterest
index 39 - securityofferedtype
index

In [9]:
# Temporary: list each column of the '1-A' table next to its position.
# enumerate() supplies the position, so no manual counter is needed.
for position, column_name in enumerate(filing_dict['1-A'].columns):
    print(f'index {position} - {column_name}')

index 0 - html
index 1 - body
index 2 - edgarsubmission
index 3 - headerdata
index 4 - submissiontype
index 5 - filerinfo
index 6 - livetestflag
index 7 - filer
index 8 - issuercredentials
index 9 - cik
index 10 - ccc
index 11 - formdata
index 12 - employeesinfo
index 13 - issuername
index 14 - jurisdictionorganization
index 15 - yearincorporation
index 16 - siccode
index 17 - irsnum
index 18 - fulltimeemployees
index 19 - parttimeemployees
index 20 - issuerinfo
index 21 - street1
index 22 - city
index 23 - stateorcountry
index 24 - zipcode
index 25 - phonenumber
index 26 - connectionname
index 27 - industrygroup
index 28 - cashequivalents
index 29 - investmentsecurities
index 30 - accountsreceivable
index 31 - propertyplantequipment
index 32 - totalassets
index 33 - accountspayable
index 34 - longtermdebt
index 35 - totalliabilities
index 36 - totalstockholderequity
index 37 - totalliabilitiesandequity
index 38 - totalrevenues
index 39 - costandexpensesappltorevenues
index 40 - deprec

In [10]:
# Temporary: list each column of the '1-A/A' table next to its position.
# enumerate() supplies the position, so no manual counter is needed.
for position, column_name in enumerate(filing_dict['1-A/A'].columns):
    print(f'index {position} - {column_name}')

index 0 - html
index 1 - body
index 2 - edgarsubmission
index 3 - headerdata
index 4 - submissiontype
index 5 - filerinfo
index 6 - livetestflag
index 7 - filer
index 8 - issuercredentials
index 9 - cik
index 10 - ccc
index 11 - offeringfilenumber
index 12 - flags
index 13 - sincelastfiling
index 14 - formdata
index 15 - employeesinfo
index 16 - issuername
index 17 - jurisdictionorganization
index 18 - yearincorporation
index 19 - siccode
index 20 - irsnum
index 21 - fulltimeemployees
index 22 - parttimeemployees
index 23 - issuerinfo
index 24 - street1
index 25 - city
index 26 - stateorcountry
index 27 - zipcode
index 28 - phonenumber
index 29 - connectionname
index 30 - industrygroup
index 31 - cashequivalents
index 32 - investmentsecurities
index 33 - accountsreceivable
index 34 - propertyplantequipment
index 35 - totalassets
index 36 - accountspayable
index 37 - longtermdebt
index 38 - totalliabilities
index 39 - totalstockholderequity
index 40 - totalliabilitiesandequity
index 41 

In [11]:
# dependencies for transforming data into tables for export
import json
import sqlalchemy as SQL
import pymongo
from pymongo import MongoClient

In [23]:
# tables -- empty placeholders for the export schema

# PK acc_num, FK CIK num, file_type, date_filed
filings_table = pd.DataFrame()

# PK CIK - name, address, phone number, contact information
# (maybe split contact information from customer)
company_table = pd.DataFrame()

# PK acc_number - name, address, legal status
issuer_table = pd.DataFrame()

# PK acc_number for each of the financial tables
financial_table_C = pd.DataFrame()
financial_table_D = pd.DataFrame()
financial_table_1_A = pd.DataFrame()

# success_dfs is the set of filing types left after removing the excluded
# ones from everything in filing_dict. Per the original note, the remaining
# types are the ones whose filings have an xml file on EDGAR.
all_dfs = set(filing_dict)
# QUALIF is odd and needs attention
error_dfs = {
    'DOS', '1-A-W', 'CORRESP', '1-U', 'UPLOAD', 'DOSLTR',
    'DOS/A', 'REGDEX', '253G2', '1-SA', 'QUALIF',
}
success_dfs = all_dfs - error_dfs
success_dfs

{'1-A',
 '1-A POS',
 '1-A/A',
 '1-K',
 '1-K/A',
 'C',
 'C-AR',
 'C-AR/A',
 'C-U',
 'C/A',
 'D',
 'D/A'}

In [24]:
# filings table
#
# This is going to need more cleanup. Due to the other types of documents
# within the data ingest, per-type handling still needs to be implemented.
# For the moment, this is proof of concept for how the tables will fit
# together in the schema, and how it flows from EDGAR into these tables then
# into mongo or SQL.

# The table is rebuilt from scratch on every run, so re-executing this cell
# does not stack duplicate rows onto an earlier result.
filing_frames = []
for filing_type in success_dfs:
    source_df = filing_dict[filing_type]

    # Some forms carry the filer id in a 'cik' column, others in 'filercik';
    # pick whichever exists instead of swallowing every exception.
    cik_column = 'cik' if 'cik' in source_df.columns else 'filercik'

    # to_numpy() copies values positionally: the source frames repeat the
    # same index label on every row, so index alignment is best avoided.
    filing_frames.append(pd.DataFrame({
        'acc_num': source_df['acc_num'].to_numpy(),
        'cik': source_df[cik_column].to_numpy(),
        'filing_type': source_df['submissiontype'].to_numpy(),
    }))

if filing_frames:
    filings_table = pd.concat(filing_frames, ignore_index=True)
else:
    filings_table = pd.DataFrame(columns=['acc_num', 'cik', 'filing_type'])

print(filings_table)

#filing table to json for mongo, creates list of dictionaries, one for each row.
filings_json = json.loads(filings_table.to_json(orient='records'))

print(filings_json[0:5])

                acc_num         cik filing_type
0    000166516020001202  0001736388         C/A
1    000166516020001037  0001736388         C/A
2    000166516020000792  0001736388         C/A
3    000166516020000548  0001736388         C/A
4    000173638819000008  0001736388         C/A
..                  ...         ...         ...
156  000167025418000202  0001446275        C-AR
157  000166516020000556  0001796036        C-AR
158  000142174420000003  0001421744        C-AR
159  000174205520000002  0001742055        C-AR
160  000174205519000001  0001742055        C-AR

[161 rows x 3 columns]
[{'acc_num': '000166516020001202', 'cik': '0001736388', 'filing_type': 'C/A'}, {'acc_num': '000166516020001037', 'cik': '0001736388', 'filing_type': 'C/A'}, {'acc_num': '000166516020000792', 'cik': '0001736388', 'filing_type': 'C/A'}, {'acc_num': '000166516020000548', 'cik': '0001736388', 'filing_type': 'C/A'}, {'acc_num': '000173638819000008', 'cik': '0001736388', 'filing_type': 'C/A'}]


In [25]:
# Testing insertion into mongo

# The connection string embeds the database user and password, so it is read
# from the environment rather than being written into the notebook.
# NOTE(review): the credential that used to be hardcoded in this cell has been
# exposed in the notebook's history and should be rotated.
api_string = os.environ.get('MONGODB_URI')
if not api_string:
    raise RuntimeError(
        'Set the MONGODB_URI environment variable to the MongoDB connection '
        'string before running this cell.'
    )
cluster = MongoClient(api_string)

db = cluster['ssa-data'] #personal cluster, will change to company when ready for production
collection = db['filing-data']


In [26]:
# Bulk-insert the filing records (one dict per filings_table row) into the collection.
# NOTE(review): nothing here guards against running the cell twice -- confirm
# whether repeated inserts are acceptable.
collection.insert_many(filings_json)

<pymongo.results.InsertManyResult at 0x284cd86a3c8>

In [27]:
# Read back the documents whose filing_type is "C" and print each one
c_filings_query = {"filing_type" : "C"}
result = collection.find(c_filings_query)
for document in result:
    print(document)

{'_id': ObjectId('5fc7f47d824725b891b4dcd0'), 'acc_num': '000166516020001215', 'cik': '0001824937', 'filing_type': 'C'}
{'_id': ObjectId('5fc7f47d824725b891b4dcd1'), 'acc_num': '000166516020000248', 'cik': '0001736388', 'filing_type': 'C'}
{'_id': ObjectId('5fc7f47d824725b891b4dcd2'), 'acc_num': '000173638819000005', 'cik': '0001736388', 'filing_type': 'C'}
{'_id': ObjectId('5fc7f47d824725b891b4dcd3'), 'acc_num': '000173638818000001', 'cik': '0001736388', 'filing_type': 'C'}
{'_id': ObjectId('5fc7f47d824725b891b4dcd4'), 'acc_num': '000166516020000670', 'cik': '0001811231', 'filing_type': 'C'}
{'_id': ObjectId('5fc7f47d824725b891b4dcd5'), 'acc_num': '000166516020000007', 'cik': '0001446275', 'filing_type': 'C'}
{'_id': ObjectId('5fc7f47d824725b891b4dcd6'), 'acc_num': '000167025418000335', 'cik': '0001446275', 'filing_type': 'C'}
{'_id': ObjectId('5fc7f47d824725b891b4dcd7'), 'acc_num': '000167025416000158', 'cik': '0001446275', 'filing_type': 'C'}
{'_id': ObjectId('5fc7f47d824725b891b4dc

In [29]:
# company_table
