In [1]:
import requests as req
import urllib
import pandas as pd
from bs4 import BeautifulSoup as bs

In [2]:
# function to form urls easily
def url_construct(base_url, comp):
    url = base_url
    for r in comp:
        url = '{}/{}'.format(url, r)
    return url

# EXAMPLE
base_url = r"https://www.sec.gov/Archives/edgar/data"
components = ['886982','000156459019011378', '0001564590-19-011378-index-headers.html']
url_construct(base_url, components)

'https://www.sec.gov/Archives/edgar/data/886982/000156459019011378/0001564590-19-011378-index-headers.html'

In [3]:
base_url = r"https://www.sec.gov/Archives/edgar/daily-index"
year_url = url_construct(base_url, ['2020', 'index.json'])

content_raw = req.get(year_url)
content_decode = content_raw.json()

url_list = []
for item in content_decode['directory']['item']:
    qtr_url = url_construct(base_url, ['2020', item['name'], 'index.json'])
    file_content = req.get(qtr_url)
    decode = file_content.json()
    
    for file in decode['directory']['item']:
        file_url = url_construct(base_url, ['2020', item['name'], file['name']])
        url_list.append(file_url)
len(url_list)

1125

In [4]:
url_list

['https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200102.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200103.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200106.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200107.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200108.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200109.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200110.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200113.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200114.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200115.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200116.idx',
 'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200117.idx',
 'https://www.se

In [12]:
# Test with one file
file_url = r"https://www.sec.gov/Archives/edgar/daily-index/2019/QTR2/master.20190401.idx"


content = req.get(file_url).content

# write to a file so we don't have to pull it again
with open(r'data/master_20190102.txt', 'wb') as f:
     f.write(content)


In [13]:
with open(r'data/master_20190102.txt', 'rb') as f:
     byte_data = f.read()

# decode the byte data
data = byte_data.decode('utf-8').split('  ')

#finding the starting index
for index, item in enumerate(data):
    if 'ftp://ftp.sec.gov/edgar/\n' in item:
        start_ind = index

# create clean list
data_format = data[start_ind + 1:]
master_data = []

for index, item in enumerate(data_format):
    if index == 0:
        clean_item_data = item.replace('\n', '|').split('|')
        clean_item_data = clean_item_data[8:]
    else:
        clean_item_data = item.replace('\n', '|').split('|')
    
    for index, row in enumerate(clean_item_data):
        if '.txt' in row:
            mini_list = clean_item_data[(index-4) : (index + 1)] # where you find the .txt file you know the cik is at index - 4
            if len(mini_list) != 0:
                mini_list[4] = 'https://www.sec.gov/Archives/' + mini_list[4]
                master_data.append(mini_list)


In [14]:
for index, document in enumerate(master_data):
    doc_dict = {}
    doc_dict['cik'] = document[0]
    doc_dict['company_name'] = document[1]
    doc_dict['form_id'] = document[2]
    doc_dict['date'] = document[3]
    doc_dict['file_url'] = document[4]
    master_data[index] = doc_dict

In [15]:
for doc_dict in master_data:
    if doc_dict['form_id'] == 'C':
        print(f"{doc_dict['company_name']} - {doc_dict['file_url']}")

Arygin Corp - https://www.sec.gov/Archives/edgar/data/1769013/0001665160-19-000317.txt
Shira Productions, LLC - https://www.sec.gov/Archives/edgar/data/1771389/0001668287-19-000009.txt


In [16]:
master_data_df = pd.DataFrame(master_data)

In [17]:
master_data_df

Unnamed: 0,cik,company_name,form_id,date,file_url
0,1236397,BRADBURY DANIEL,4,20190401,https://www.sec.gov/Archives/edgar/data/123639...
1,1236458,WILLIAMS PAUL S,4,20190401,https://www.sec.gov/Archives/edgar/data/123645...
2,1237789,BLAIR DONALD W,4,20190401,https://www.sec.gov/Archives/edgar/data/123778...
3,1237831,GLOBUS MEDICAL INC,4,20190401,https://www.sec.gov/Archives/edgar/data/123783...
4,1238028,TALOR EYAL,4,20190401,https://www.sec.gov/Archives/edgar/data/123802...
...,...,...,...,...,...
4244,9631,BANK OF NOVA SCOTIA,FWP,20190401,https://www.sec.gov/Archives/edgar/data/9631/0...
4245,97517,TEXAS PACIFIC LAND TRUST,4,20190401,https://www.sec.gov/Archives/edgar/data/97517/...
4246,98362,TIMKEN CO,8-K,20190401,https://www.sec.gov/Archives/edgar/data/98362/...
4247,99106,TRANS LUX Corp,NT 10-K,20190401,https://www.sec.gov/Archives/edgar/data/99106/...


In [18]:
master_data_df[master_data_df['form_id']== 'D']

Unnamed: 0,cik,company_name,form_id,date,file_url
616,1408146,SCANDIUM INTERNATIONAL MINING CORP.,D,20190401,https://www.sec.gov/Archives/edgar/data/140814...
644,1411912,Riverside Resources Inc,D,20190401,https://www.sec.gov/Archives/edgar/data/141191...
720,1420031,"EMMAUS LIFE SCIENCES, INC.",D,20190401,https://www.sec.gov/Archives/edgar/data/142003...
848,1448597,Bullfrog Gold Corp.,D,20190401,https://www.sec.gov/Archives/edgar/data/144859...
926,1472601,RYU APPAREL INC.,D,20190401,https://www.sec.gov/Archives/edgar/data/147260...
...,...,...,...,...,...
2942,1772412,"DotCom Therapy, Inc.",D,20190401,https://www.sec.gov/Archives/edgar/data/177241...
2943,1772445,"Tioga Medical, Inc.",D,20190401,https://www.sec.gov/Archives/edgar/data/177244...
2944,1772448,"jCyte, Inc.",D,20190401,https://www.sec.gov/Archives/edgar/data/177244...
2945,1772452,Conversocial Ltd,D,20190401,https://www.sec.gov/Archives/edgar/data/177245...
