In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import os

In [2]:
# CIK identifiers (as strings) of the companies whose EDGAR filings are pulled below.
cik_list = ['1824937', '1736388', '1811231', '1446275', '1796036', '1824627', '1421744', '1588504', '1627282', '1621653', '1826050', '1742055', '1818808']

In [3]:
# functions to scrape and parse filings


def cik_pull(cik):
    """Fetch the EDGAR filing index for one company as a DataFrame.

    Queries the EDGAR ``browse-edgar`` endpoint (atom output, up to 100
    entries, ``owner=exclude``) and extracts one row per ``<entry>`` element.

    Parameters
    ----------
    cik : str
        Company identifier sent as the ``CIK`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per filing with the columns ``file_type``, ``file_date``,
        ``acc_num``, ``url`` and ``acc_num_url``. ``acc_num_url`` is the
        accession number with the dashes removed, which is the form used
        when building archive URLs. The frame has these columns even when
        the feed contains no entries.

    Raises
    ------
    requests.HTTPError
        If EDGAR answers with an error status, instead of silently parsing
        the error page into an empty frame.
    requests.Timeout
        If EDGAR does not answer within 30 seconds.
    """
    endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

    # parameters dictionary
    param_dict = {'action': 'getcompany',
                  'CIK': cik,
                  'type': '',
                  'dateb': '',
                  'owner': 'exclude',
                  'start': '',
                  'output': 'atom',
                  'count': '100'}

    # NOTE(review): no User-Agent header is sent; SEC's access guidance may
    # require a declared one - confirm before running this at volume.
    response = requests.get(url=endpoint, params=param_dict, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # Build every row in one pass rather than keeping parallel lists in sync.
    records = []
    for entry in soup.find_all('entry'):
        acc_num = entry.find('accession-number').text
        records.append({
            'file_type': entry.find('filing-type').text,
            'file_date': entry.find('filing-date').text,
            'acc_num': acc_num,
            'url': entry.find('filing-href').text,
            # this is for simplicity dealing with urls
            'acc_num_url': acc_num.replace('-', ''),
        })

    # Passing the column list keeps the schema stable for an empty feed.
    columns = ['file_type', 'file_date', 'acc_num', 'url', 'acc_num_url']
    return pd.DataFrame(records, columns=columns)

def idv_document_parse(acc_num, cik=None):
    """Download one filing's ``primary_doc.xml`` and flatten it into a one-row DataFrame.

    Every element found in the parsed document becomes a column named after
    the element; the cell holds the element's ``.string`` (``None`` when the
    element has no single string child). Repeated element names produce
    repeated column names. An ``acc_num`` column carrying the accession
    number is added last.

    Parameters
    ----------
    acc_num : str
        Accession number without dashes, as used in EDGAR archive URLs.
    cik : str, optional
        Company identifier used in the archive URL. When omitted, a
        module-level variable named ``cik`` is used, which is how this
        function originally obtained it. Prefer passing it explicitly.

    Returns
    -------
    pandas.DataFrame
        A single-row frame as described above.

    Raises
    ------
    ValueError
        If ``cik`` is not given and no module-level ``cik`` exists.

    Notes
    -----
    The HTTP status is intentionally not checked: whatever the server
    returns is parsed, so filings without a ``primary_doc.xml`` still yield
    a frame (built from the returned page) rather than an exception.
    """
    if cik is None:
        # Backward compatibility with callers that rely on the global loop variable.
        cik = globals().get('cik')
        if cik is None:
            raise ValueError("cik must be passed explicitly or defined at module level")

    target_url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{acc_num}/primary_doc.xml'

    file = requests.get(target_url, timeout=30)
    file_data = BeautifulSoup(file.content, 'lxml')

    tag_list = []
    data_list = []
    for element in file_data.find_all():
        tag_list.append(element.name)
        text = element.string
        # Plain str drops the NavigableString's reference to the whole parse tree.
        data_list.append(None if text is None else str(text))

    # Build the single row directly; DataFrame.append no longer exists in pandas 2.x.
    file_df = pd.DataFrame([data_list], columns=tag_list)
    file_df['acc_num'] = acc_num
    return file_df


def all_files_parse(cik):
    """Parse every filing listed for a company, grouped by filing type.

    Parameters
    ----------
    cik : str
        Company identifier, forwarded to ``cik_pull`` and ``idv_document_parse``.

    Returns
    -------
    dict
        Maps each distinct ``file_type`` returned by ``cik_pull`` to a
        DataFrame holding one row per parsed filing of that type. Each
        row keeps the index of its one-row source frame.
    """
    filings_df = cik_pull(cik)

    file_dict = {}
    for file_type in filings_df['file_type'].unique():
        accession_numbers = filings_df.loc[filings_df['file_type'] == file_type, 'acc_num_url']

        parsed_frames = []
        for acc_num_url in accession_numbers:
            sub = idv_document_parse(str(acc_num_url), cik=cik)
            # Keep only the first column for each repeated tag name.
            # note that this will nuke the 50+ columns of jurisdiction offered.
            parsed_frames.append(sub.loc[:, ~sub.columns.duplicated()])

        # One concat per type instead of growing a frame row by row.
        combined = pd.concat(parsed_frames) if parsed_frames else pd.DataFrame()
        file_dict[f'{file_type}'] = combined.fillna(np.nan)
    return file_dict

In [4]:
%%time

company_data_dict = {}

# The loop variable is deliberately kept as `cik`: idv_document_parse can read a
# module-level `cik` when it builds filing URLs, so renaming it here is not safe.
for cik in cik_list:
    company_data_dict[cik] = all_files_parse(cik)
    # Progress report: a separator, the CIK, then each filing type found for it.
    print('-'*64)
    print(f'{cik}')
    for file_type in company_data_dict[cik]:
        print(f'#   {file_type}')
    

----------------------------------------------------------------
1824937
#   C
----------------------------------------------------------------
1736388
#   C/A
#   C-AR
#   C
#   D
#   C-U
#   C-AR/A
----------------------------------------------------------------
1811231
#   D
#   C/A
#   C
----------------------------------------------------------------
1446275
#   1-SA
#   253G2
#   QUALIF
#   CORRESP
#   UPLOAD
#   1-A/A
#   C-AR/A
#   1-A
#   C-AR
#   C-U
#   D
#   C
#   D/A
#   C/A
#   REGDEX
----------------------------------------------------------------
1796036
#   C/A
#   C-AR
#   C
----------------------------------------------------------------
1824627
#   C
----------------------------------------------------------------
1421744
#   253G2
#   QUALIF
#   CORRESP
#   1-A/A
#   UPLOAD
#   1-A
#   C-AR
#   D
#   C-U
#   C
#   D/A
#   REGDEX
----------------------------------------------------------------
1588504
#   1-A-W
#   1-SA
#   253G2
#   QUALIF
#   CORRESP
#   1-A/A
#  

In [5]:
# Sanity check: the pull loop stores one entry per CIK, so the keys should match cik_list.
company_data_dict.keys()

dict_keys(['1824937', '1736388', '1811231', '1446275', '1796036', '1824627', '1421744', '1588504', '1627282', '1621653', '1826050', '1742055', '1818808'])

In [6]:
# Regroup the parsed filings by filing type instead of by company. The data is
# the same as in company_data_dict, but one frame per filing type is easier to
# slice into tables.

# Collect the per-company frames of each type first, then concatenate once per
# type. DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
frames_by_type = {}
for parsed_by_type in company_data_dict.values():
    for file_type, type_df in parsed_by_type.items():
        frames_by_type.setdefault(file_type, []).append(type_df)

# All unique file types across all CIKs pulled, in first-seen order.
all_filing_types = list(frames_by_type)

filing_dict = {
    file_type: pd.concat(frames)
    for file_type, frames in frames_by_type.items()
}

filing_dict['C'].head()

Unnamed: 0,html,body,edgarsubmission,headerdata,submissiontype,filerinfo,filer,filercredentials,filercik,filerccc,...,issuer,issuertitle,signaturepersons,signatureperson,personsignature,persontitle,signaturedate,acc_num,crdnumber,com:street2
0,,,,,C,,,,1824937,XXXXXXXX,...,"Shacksbury Holdings, Inc.","President, CEO, Principal Financial Officer, C...",,,Colin Davis,"President, CEO, Principal Financial Officer, C...",09-29-2020,166516020001215,,
0,,,,,C,,,,1736388,XXXXXXXX,...,"R3 Printing, Inc.",Director; Treasurer; Head of Product; Chief Ex...,,,Petra Wood,Director; Vice President; Head of Growth; Chie...,03-17-2020,166516020000248,,
0,,,,,C,,,,1736388,XXXXXXXX,...,"R3 Printing, Inc.",President,,,Daniel Downs,Director,05-20-2019,173638819000005,283874.0,
0,,,,,C,,,,1736388,XXXXXXXX,...,"R3 Printing, Inc.",President,,,Daniel Downs,Director,04-11-2018,173638818000001,283874.0,
0,,,,,C,,,,1811231,XXXXXXXX,...,"Solectrac, Inc.","CEO, Principal Executive Officer and Director,...",,,Stephen Heckeroth,"CEO, Principal Executive Officer and Director,...",05-28-2020,166516020000670,,


In [7]:
# dependencies for transforming data into tables for export
import json
import sqlalchemy as SQL
import pymongo
from pymongo import MongoClient

In [8]:
# tables

# Empty placeholders for the planned export tables; the trailing comments
# sketch the intended keys and columns of each one.
filings_table = pd.DataFrame()        # PK acc_num, FK CIK num, file_type, date_filed
company_table = pd.DataFrame()        # PK CIK - name, address, phone number, contact information
                                      # maybe split contact information from customer
issuer_table = pd.DataFrame()         # PK acc_number FK CIK - name, address, legal status
financial_table_C = pd.DataFrame()    # PK acc_number
financial_table_D = pd.DataFrame()    # PK acc_number
financial_table_1_A = pd.DataFrame()  # PK acc_number

# Filing types whose frames hold usable data, i.e. those that have an xml file
# on EDGAR: every known type minus the ones listed as failing.
all_dfs = set(filing_dict)
error_dfs = {
    'DOS', '1-A-W', 'CORRESP', '1-U', 'UPLOAD', 'DOSLTR',
    'DOS/A', 'REGDEX', '253G2', '1-SA', 'QUALIF',  # QUALIF is odd and needs attention
}
success_dfs = list(all_dfs - error_dfs)
success_dfs

['C-AR',
 '1-A POS',
 'C/A',
 '1-A',
 'D/A',
 '1-K/A',
 'C-AR/A',
 '1-A/A',
 'C-U',
 '1-K',
 'D',
 'C']

In [None]:
# Reference listing of each column's positional index per filing type, for use
# when slicing. This cell will get deleted when it isn't needed anymore.
for filing_type in success_dfs:
    print('')
    print(f'--- {filing_type}---')
    for position, column_name in enumerate(filing_dict[filing_type].columns):
        print(f'index {position} - {column_name}')

In [None]:
# filings table
#
# This is going to need more cleanup. Because of the other document types in
# the data ingest, each one will need its own error handling. For the moment
# this is a proof of concept for how the tables fit together in the schema, and
# how data flows from EDGAR into these tables and then into mongo or SQL.
# The other filings need closer attention to decide whether parsing some of the
# .txt documents is worth it, or whether they should just be ignored.

# The table is rebuilt from scratch on every run, so re-executing this cell
# does not duplicate rows.
filing_frames = []
for filing_type in success_dfs:
    source_df = filing_dict[filing_type]
    # Use `cik` when the frame has that column, otherwise fall back to `filercik`.
    cik_column = 'cik' if 'cik' in source_df.columns else 'filercik'
    filing_frames.append(
        source_df[['acc_num', cik_column, 'submissiontype']].rename(
            columns={cik_column: 'cik', 'submissiontype': 'filing_type'}
        )
    )

if filing_frames:
    filings_table = pd.concat(filing_frames, ignore_index=True)
else:
    filings_table = pd.DataFrame(columns=['acc_num', 'cik', 'filing_type'])

print(filings_table)

#filing table to json for mongo, creates list of dictionaries, one for each row.
filings_json = json.loads(filings_table.to_json(orient='records'))

print(filings_json[0:5])

In [None]:
# Testing insertion into mongo

# The connection string, credentials included, is read from the MONGODB_URI
# environment variable so that no secret is stored in the notebook.
# Expected form:
#   mongodb+srv://<user>:<password>@<cluster-host>/<dbname>?retryWrites=true&w=majority
# NOTE(review): the password previously hardcoded in this cell has been exposed
# in the notebook and should be rotated.
api_string = os.environ['MONGODB_URI']
cluster = MongoClient(api_string)

db = cluster['ssa-data'] #personal cluster, will change to company when ready for production
collection = db['filing-data']


In [None]:
# Write the filing records (filings_json, one dict per row of filings_table) to the collection.
collection.insert_many(filings_json)

In [None]:
# Read back every stored record whose filing type is "C" and print each one.
for document in collection.find({"filing_type" : "C"}):
    print(document)

In [None]:
# company_table
company_files = ['C', 'D', '1-A']

# Every CIK that was pulled, in the order it was stored.
company_ciks = list(company_data_dict.keys())

# Print the type of the object stored for each (company, filing type) pair.
for parsed_by_type in company_data_dict.values():
    for type_df in parsed_by_type.values():
        print(type(type_df))


In [None]:
# Display the collected CIKs; the list built in the company_table cell is named `company_ciks`.
company_ciks