In [9]:
import requests
import io
import gzip
import pandas as pd
import csv
from bs4 import BeautifulSoup
import lxml
from collections import OrderedDict

# Root of the EDGAR full-index tree; the year directories are listed from here.
# NOTE(review): SEC may reject requests that do not declare a User-Agent header — confirm
# whether these requests need one.
parent_dir_link = 'https://www.sec.gov/Archives/edgar/full-index/'
# Prefix joined with the relative 'Filename' paths taken from the index files.
archive_link = 'https://www.sec.gov/Archives/'
# Earliest year directory (inclusive) that is kept.
begYear = 2020
# File name of a directory's JSON listing, appended to directory URLs.
# Note: this is a plain string, not the standard-library `json` module.
json = 'index.json'
# File name of the gzip-compressed XBRL index fetched for each quarter.
xbrl = 'xbrl.gz'

In [10]:
# Build the URL of every year directory listed under the full-index root,
# keeping only 'dir' entries whose name, read as an integer, is >= begYear.
parent_dir = requests.get(parent_dir_link+json)
year_dirs = parent_dir.json()['directory']['item']
year_dir_links = [
    parent_dir_link+entry['name']
    for entry in year_dirs
    # `and` short-circuits, so int() is only applied to directory entries.
    if entry['type']=='dir' and int(entry['name'])>=begYear
]

In [11]:
#Getting links for each quarter directory for each year director
quarterly_dir_links = []
for year in year_dir_links:
    year_dir = requests.get(year +'/'+json).json()['directory']['item']
    for quarter in year_dir:
        quarterly_dir_links.append(year+'/'+quarter['name'])

In [12]:
# Download each quarter's gzip-compressed XBRL index and keep only the data
# rows (the text that follows the header block) as one string per quarter.
xbrl_dir_links = []
for link in quarterly_dir_links:
    response = requests.get(link +'/'+xbrl)
    # Fail on an HTTP error instead of trying to gunzip an error page.
    response.raise_for_status()
    # Decode the bytes into text. The earlier str(bytes) conversion produced
    # the b'...' repr, which left a trailing quote after the last row and kept
    # backslash escapes (\', \\, \xNN) inside the data. latin-1 maps every
    # byte to a character, so decoding cannot fail.
    file = gzip.decompress(response.content).decode('latin-1').replace('"','')
    # Keep what follows the last "--" in the file: skip the two dashes and the
    # single character after them, so the string starts at the first data row.
    xbrl_dir_links.append(file[file.rfind("--")+3:])

In [13]:
# Parse each quarter's pipe-delimited index text and keep only the rows whose
# form type contains '10-K' or '10-Q'.
index_columns = ['CIK','Company Name','Form Type','Date Filed','Filename']

quarterly_frames = []
for link in xbrl_dir_links:
    df = pd.read_csv(io.StringIO(link),sep="|",names=index_columns)
    # na=False: a row with a missing form type counts as "no match" rather
    # than putting NaN into the boolean mask.
    is_periodic_report = df['Form Type'].str.contains('10-K|10-Q',na=False)
    df2 = df[is_periodic_report].reset_index(drop=True)
    quarterly_frames.append(df2)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration. Each quarter keeps its
# own 0..n index, as before. pd.concat rejects an empty list, so fall back to
# an empty frame with the expected columns.
if quarterly_frames:
    df0 = pd.concat(quarterly_frames)
else:
    df0 = pd.DataFrame(columns=index_columns)
df4 = df0
df0

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,Filename
0,1000045,NICHOLAS FINANCIAL INC,10-Q,2020-02-14,edgar/data/1000045/0001564590-20-004703.txt
1,1000209,MEDALLION FINANCIAL CORP,10-K,2020-03-30,edgar/data/1000209/0001564590-20-014310.txt
2,1000228,HENRY SCHEIN INC,10-K,2020-02-20,edgar/data/1000228/0001000228-20-000018.txt
3,1000229,CORE LABORATORIES N V,10-K,2020-02-10,edgar/data/1000229/0001564590-20-004075.txt
4,1000230,OPTICAL CABLE CORP,10-K,2020-01-27,edgar/data/1000230/0001437749-20-001224.txt
...,...,...,...,...,...
501,96223,Jefferies Financial Group Inc.,10-Q,2020-10-09,edgar/data/96223/0000096223-20-000050.txt
502,97476,TEXAS INSTRUMENTS INC,10-Q,2020-10-21,edgar/data/97476/0001628280-20-014630.txt
503,98338,TSR INC,10-Q,2020-10-13,edgar/data/98338/0001213900-20-031097.txt
504,99780,TRINITY INDUSTRIES INC,10-Q,2020-10-26,edgar/data/99780/0000099780-20-000136.txt


In [17]:
# Prototype run on the first few filings before looping over everything.
# For each filing in df0:
#   1. list the filing folder and locate FilingSummary.xml,
#   2. pick the reports whose short name is one of the balance-sheet titles,
#   3. scrape the first such report's HTML table into a DataFrame (df3),
#   4. tag it with the filing's identifying columns and outer-merge into df4.
# Filings that cannot be processed are skipped and do not count toward the
# MAX_FILINGS limit (same as the original ValueError path).
MAX_FILINGS = 10

# Report short names (lower-cased) that are treated as a balance sheet.
financial_statement_names = ['consolidated balance sheets','balance sheet','condensed consolidated balance sheets','consolidated statements of financial condition']

i = 0
for index,row in df0.iterrows():
    CIK = row['CIK']
    CompanyName = row['Company Name']
    FormType = row['Form Type']
    DateFiled = row['Date Filed']
    Filename = row['Filename']
    # Filing folder URL: the index path with dashes and the '.txt' suffix removed.
    filing_dir = archive_link+Filename.replace('-','').replace('.txt','')+'/'
    sandisk_filing = filing_dir+json

    # Reset per filing: otherwise a folder without FilingSummary.xml would
    # reuse the previous filing's URL (or raise NameError on the first row).
    xml_summary = None
    for file in requests.get(sandisk_filing).json()['directory']['item']:
        if file['name']== 'FilingSummary.xml':
            xml_summary = filing_dir+file['name']
    if xml_summary is None:
        continue

    content = requests.get(xml_summary).content
    soup = BeautifulSoup(content,'lxml')
    report_list = soup.find('myreports')
    if report_list is None:
        # No report list in the summary: nothing to scrape for this filing.
        continue
    reports = report_list.find_all('report')

    financial_statement_links = []
    for report in reports:
        if report.shortname.text.lower() in financial_statement_names:
            financial_statement_links.append(filing_dir+report.htmlfilename.text)
    if financial_statement_links:
        # Only the first matching report is used.
        bs_link = financial_statement_links[0]

        content = requests.get(bs_link).content
        # Name the parser explicitly, matching the FilingSummary parse above.
        soup = BeautifulSoup(content,'lxml')
        data = OrderedDict()
        found_header = False
        years = []
        filing_labels = []
        gaap_label = ''
        table = soup.find('table')
        if table is None:
            continue
        # `table_row` avoids shadowing the outer `row` from df0.iterrows().
        for table_row in table.find_all('tr'):
            if not found_header:
                # First <tr>: the first <th> is the title, the rest are the
                # column headings collected into `years`.
                found_title = False
                for header in table_row.find_all('th'):
                    if not found_title:
                        title = header.text.replace('\n','')
                        found_title = True
                        continue
                    years.append(header.text.replace('\n',''))
                found_header = True
                continue
            data['Date'] = years
            values = []
            found_filing_label = False
            for col in table_row.find_all('td'):
                # The first <td> of every row is skipped.
                if not found_filing_label:
                    found_filing_label = True
                    continue
                value = ''
                if col.find('a'):
                    gaap_label = col.find('span').text.replace('us-gaap_','')
                    value = col.find('a').text
                values.append(value)
                # NOTE(review): gaap_label carries over from earlier cells and
                # rows, so a row whose cells contain no <a> stores its ''
                # values under the last label seen — confirm this is intended.
                data[gaap_label] = values
        try:
            df3 = pd.DataFrame.from_dict(data)
        except ValueError:
            # Columns of unequal length: skip this filing.
            continue
        df3['CIK'] = CIK
        df3['Company Name'] = CompanyName
        df3['Form Type'] = FormType
        df3['Date Filed'] = DateFiled
        df4 = pd.merge(df4,df3,how='outer')
    i+=1
    if i >= MAX_FILINGS:
        break