In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import pprint

In [2]:
# EDGAR company identifier used throughout this notebook: cik_pull sends it as
# the `CIK` query parameter and idv_document_parse reads it as a global when
# building filing URLs. Kept as a string so the leading zeros survive.
cik = '0001786874'

In [3]:
# returns pandas dataframe with filing information by cik
def cik_pull(cik, headers=None, timeout=30):
    """Fetch the EDGAR filing listing for one company as a DataFrame.

    Queries the ``browse-edgar`` endpoint with ``output=atom`` and turns every
    ``<entry>`` of the returned feed into one row.

    Parameters
    ----------
    cik : str
        Company identifier, sent as the ``CIK`` query parameter.
    headers : dict, optional
        Extra HTTP headers for the request. SEC's fair-access guidance asks
        automated clients to identify themselves with a descriptive
        ``User-Agent``; pass it here. ``None`` sends the requests defaults.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns ``file_type``, ``file_date``, ``acc_num``, ``url`` and
        ``acc_num_url`` (the accession number with dashes removed). The frame
        is empty, but still has these columns, when the feed has no entries.
        The request asks for ``count=100``, so presumably at most 100 filings
        come back per call.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

    # define our parameters dictionary
    param_dict = {'action':'getcompany',
                  'CIK':cik,
                  'type':'',
                  'dateb':'',
                  'owner':'exclude',
                  'start':'',
                  'output':'atom',
                  'count':'100'}

    # request the url, and then parse the response.
    response = requests.get(url=endpoint, params=param_dict,
                            headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing the error page into an
    # empty frame that looks like "this company has no filings".
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # output column -> feed tag that supplies its text
    tag_by_column = {'file_type': 'filing-type',
                     'file_date': 'filing-date',
                     'acc_num': 'accession-number',
                     'url': 'filing-href'}

    records = []
    for entry in soup.find_all('entry'):
        record = {column: entry.find(tag).text
                  for column, tag in tag_by_column.items()}
        # this is for simplicity dealing with urls
        record['acc_num_url'] = record['acc_num'].replace('-', '')
        records.append(record)

    # Passing `columns` keeps the schema stable even when there are no records.
    return pd.DataFrame(records, columns=list(tag_by_column) + ['acc_num_url'])

In [4]:
# returns dataframe of an individual filing as 1 row, and however many parsed fields as columns
def idv_document_parse(acc_num, cik_num=None, headers=None, timeout=30):
    """Download one filing's ``primary_doc.xml`` and flatten it into a 1-row frame.

    Every element found in the parsed document becomes a column named after
    the element; the cell holds the element's ``.string`` (``None`` when the
    element has no single string child). Repeated tags therefore produce
    duplicate column names. The document is parsed with the ``'lxml'`` parser,
    which is why the notebook outputs show lower-cased names and leading
    ``html``/``body`` columns.

    No HTTP status check is made on purpose: when the filing has no
    ``primary_doc.xml`` the error document returned by the server is parsed
    the same way, which is the behaviour the notebook demonstrates.

    Parameters
    ----------
    acc_num : str
        Accession number without dashes (the ``acc_num_url`` column of
        ``cik_pull``).
    cik_num : str, optional
        Company identifier used in the URL. Defaults to the notebook-level
        ``cik`` variable so existing one-argument calls keep working.
    headers : dict, optional
        Extra HTTP headers (e.g. a descriptive ``User-Agent``).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        A single row with one column per parsed element.
    """
    if cik_num is None:
        # Backward-compatible fallback to the global the original relied on.
        cik_num = cik

    root_url = 'https://www.sec.gov/Archives/edgar/data/'
    xml_end = 'primary_doc.xml'
    target_url = f'{root_url}{cik_num}/{acc_num}/{xml_end}'

    response = requests.get(target_url, headers=headers, timeout=timeout)
    file_data = BeautifulSoup(response.content, 'lxml')

    tag_list = []
    data_list = []
    for element in file_data.find_all():
        tag_list.append(element.name)
        text = element.string
        # Store plain str so the cells do not keep the whole parse tree alive.
        data_list.append(None if text is None else str(text))

    # Build the row in one step; DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame([data_list], columns=tag_list)

In [5]:
# given a cik, returns dictionary with key value pair of unique file type - dataframe of parsed documents of that type
def all_files_parse(cik):
    """Parse every filing listed for ``cik`` and group the results by file type.

    Calls ``cik_pull`` for the listing, then ``idv_document_parse`` once per
    filing (one HTTP request each). Each file type is printed as its group
    starts, which serves as progress output.

    NOTE(review): ``idv_document_parse`` is called with the accession number
    only, so it builds URLs from the notebook-level ``cik`` variable; the
    ``cik`` passed here should match that global.

    Parameters
    ----------
    cik : str
        Company identifier forwarded to ``cik_pull``.

    Returns
    -------
    dict[str, pandas.DataFrame]
        File type -> frame with one row per filing of that type, in listing
        order. Duplicate column names within a filing are reduced to their
        first occurrence before stacking, and row labels are not reset (every
        row keeps the label 0 from its one-row source frame).
    """
    filings_df = cik_pull(cik)

    file_dict = {}
    # sort=False keeps the file types in order of first appearance.
    for file_type, type_rows in filings_df.groupby('file_type', sort=False):
        print(file_type)

        parsed = []
        for acc_num_url in type_rows['acc_num_url']:
            sub = idv_document_parse(str(acc_num_url))
            # concat cannot align frames with repeated column names.
            sub = sub.loc[:, ~sub.columns.duplicated()]
            parsed.append(sub)

        # One concat per type instead of DataFrame.append in a loop (removed
        # in pandas 2.0, and quadratic while it existed).
        file_dict[f'{file_type}'] = pd.concat(parsed).fillna(np.nan)
    return file_dict

In [6]:
# All filings for a given cik (one row per filing, as returned by cik_pull)
file_df = cik_pull(cik)
file_df

Unnamed: 0,file_type,file_date,acc_num,url,acc_num_url
0,1-SA,2020-11-24,0001829126-20-000176,https://www.sec.gov/Archives/edgar/data/178687...,182912620000176
1,253G2,2020-08-28,0001213900-20-024150,https://www.sec.gov/Archives/edgar/data/178687...,121390020024150
2,253G2,2020-08-25,0001213900-20-023574,https://www.sec.gov/Archives/edgar/data/178687...,121390020023574
3,QUALIF,2020-08-20,9999999994-20-000171,https://www.sec.gov/Archives/edgar/data/178687...,999999999420000171
4,CORRESP,2020-08-18,0001213900-20-022712,https://www.sec.gov/Archives/edgar/data/178687...,121390020022712
5,CORRESP,2020-08-07,0001213900-20-020956,https://www.sec.gov/Archives/edgar/data/178687...,121390020020956
6,1-A/A,2020-08-07,0001213900-20-020951,https://www.sec.gov/Archives/edgar/data/178687...,121390020020951
7,UPLOAD,2020-08-03,0000000000-20-007078,https://www.sec.gov/Archives/edgar/data/178687...,20007078
8,CORRESP,2020-07-21,0001213900-20-018131,https://www.sec.gov/Archives/edgar/data/178687...,121390020018131
9,1-A/A,2020-07-21,0001213900-20-018128,https://www.sec.gov/Archives/edgar/data/178687...,121390020018128


In [7]:
# This parses the C filing as an example (iloc[-1] picks the last row of the listing).
# Passing any acc_num without dashes will parse that filing's primary_doc.xml.
idv_document_parse(str(file_df['acc_num_url'].iloc[-1]))

Unnamed: 0,html,body,edgarsubmission,headerdata,submissiontype,filerinfo,filer,filercredentials,filercik,filerccc,...,signatureinfo,issuersignature,issuer,issuersignature.1,issuertitle,signaturepersons,signatureperson,personsignature,persontitle,signaturedate
0,,,,,C,,,,1786874,XXXXXXXX,...,,,"Called Higher Studios, Inc.",Jason Brown,"CEO, Principal Executive Officer and Director,...",,,Jason Brown,"CEO, Principal Executive Officer and Director,...",10-29-2019


In [8]:
# This is the result if there is no XML file for that acc_num: the error
# document returned by the server is parsed instead, giving columns such as
# code / message / key.
idv_document_parse(str(file_df['acc_num_url'].iloc[0]))

Unnamed: 0,html,body,error,code,message,key,requestid,hostid
0,,,,NoSuchKey,The specified key does not exist.,edgar/data/1786874/000182912620000176/primary_...,4B0DB6BA9BC23EFC,H849z2+LMGlRCJ1qNrp7pNuzeWF12xgz5UCHFQOWXoT4zm...


In [9]:
# Parse every listed filing for this cik (one request per filing; each file
# type is printed as its group starts), then show the parsed 1-A/A filings.
all_files = all_files_parse(cik)
all_files['1-A/A']

1-SA
253G2
QUALIF
CORRESP
1-A/A
UPLOAD
C-AR
1-A
C-U
C/A
C


Unnamed: 0,html,body,edgarsubmission,headerdata,submissiontype,filerinfo,livetestflag,filer,issuercredentials,cik,...,issuejuridicationsecuritiesoffering,dealersjuridicationsecuritiesoffering,securitiesissued,securitiesissuername,securitiesissuertitle,securitiesissuedtotalamount,securitiesprincipalholderamount,securitiesissuedaggregateamount,unregisteredsecuritiesact,securitiesactexcemption
0,,,,,1-A/A,,LIVE,,,1786874,...,AL,AL,,"Called Higher Studios, Inc.",Class A Voting Common Stock,1074993,0,"$1,006,248 at $1.00 per share.",,Regulation Crowdfunding
0,,,,,1-A/A,,LIVE,,,1786874,...,AL,AL,,"Called Higher Studios, Inc.",Class A Voting Common Stock,1074993,0,"$1,006,248 at $1.00 per share.",,Regulation Crowdfunding
0,,,,,1-A/A,,LIVE,,,1786874,...,AL,AL,,"Called Higher Studios, Inc.",Class A Voting Common Stock,1074993,0,"$1,006,248 at $1.00 per share.",,Regulation Crowdfunding


In [10]:
# Parsed C/A filings
all_files['C/A']


Unnamed: 0,html,body,edgarsubmission,headerdata,submissiontype,filerinfo,filer,filercredentials,filercik,filerccc,...,issuejurisdictionsecuritiesoffering,signatureinfo,issuersignature,issuer,issuertitle,signaturepersons,signatureperson,personsignature,persontitle,signaturedate
0,,,,,C/A,,,,1786874,XXXXXXXX,...,AL,,,"Called Higher Studios, Inc.","CEO, Principal Executive Officer and Director,...",,,Jason Brown,"CEO, Principal Executive Officer and Director,...",01-29-2020


In [11]:
# Turn the dictionary of dataframes into a dictionary of JSON text, one entry per file type.

import json

# NOTE(review): to_json returns a str, so every value here is already-encoded
# JSON; passing json_dict to json.dump will encode those strings a second time.
json_dict = {
    file_type: frame.to_json(orient= 'split')
    for file_type, frame in all_files.items()
}


In [12]:
# Write json_dict to higher_studios.json as JSON text
with open("higher_studios.json", "w") as outfile:
    outfile.write(json.dumps(json_dict))