In [31]:
import requests as req
import pandas as pd
from bs4 import BeautifulSoup
import csv
import re

In [32]:
def download_files(start_year=1993, end_year=2018, headers=None, timeout=60):
    """
    Downloads the quarterly EDGAR company index files into the current directory.

    One file is written per quarter, named "Index-<year>-<quarter>.txt".

    Parameters:
    start_year (int): the first year to download, inclusive
    end_year (int): the last year to download, inclusive
    headers (dict): optional HTTP headers sent with every request (for example a
        User-Agent identifying the requester)
    timeout (float): seconds to wait for the server before giving up on a request

    Raises:
    requests.HTTPError: if the server answers any request with an error status

    """

    for year in range(start_year, end_year + 1):
        for quarter in range(1, 5):
            url = 'https://www.sec.gov/Archives/edgar/full-index/{}/QTR{}/company.idx'.format(year, quarter)
            r = req.get(url, allow_redirects=True, headers=headers, timeout=timeout)
            # Fail loudly instead of saving an error page as if it were an index file.
            r.raise_for_status()
            with open('Index-{}-{}.txt'.format(year, quarter), "wb") as f:
                f.write(r.content)

In [33]:
import os

def get_reports(company_names, startyear, startquarter, endyear, endquarter):
    """
    Reads the downloaded index files and yields the URLs of all 10-Q reports of the given company.

    Only files in the current directory named "Index-<year>-<quarter>.txt" are read; any other
    file is ignored. Files are processed in chronological order.

    Parameters:
    company_names (list of str): names for the company as listed in the index files. A line
        matches when it starts with one of the names, ignoring case. A single str is treated
        as a one-element list.
    startyear (int): the year to start looking for reports, inclusive
    startquarter (int): the quarter in startyear to start looking for reports, inclusive
    endyear (int): the year to stop looking for reports, inclusive
    endquarter(int): the quarter in endyear to stop looking for reports, inclusive

    Returns:
    Generator: each element is a tuple with the form (year of report, quarter of report, url of report)

    """

    if isinstance(company_names, str):
        company_names = [company_names]
    # Lower-case the names once instead of once per index line.
    wanted_prefixes = [name.lower() for name in company_names]

    index_name = re.compile(r'Index-(\d{4})-([1-4])\.txt$')
    first_period = (startyear, startquarter)
    last_period = (endyear, endquarter)

    # Zero-padded years make alphabetical order chronological.
    for file in sorted(os.listdir('.')):
        match = index_name.match(file)
        if match is None:
            continue  # not an index file; leave unrelated files alone
        year = int(match.group(1))
        quarter = int(match.group(2))
        if not first_period <= (year, quarter) <= last_period:
            continue
        # The with-block closes the file even if the consumer abandons the generator early.
        with open(file, 'r') as txt:
            for line in txt:
                if '10-Q ' not in line:
                    continue
                lowered = line.lower()
                if any(lowered.startswith(prefix) for prefix in wanted_prefixes):
                    url_index = line.index('edgar/')
                    url = line[url_index:].strip()
                    yield (year, quarter, 'https://www.sec.gov/Archives/' + url)

In [38]:
def make_tables(text, limit, searchterms, year, quarter):
    """
    Parses HTML text and combines the matching rows of its tables into a single DataFrame

    Parameters:
    text (str): the text of the company report
    limit (int): the max number of tables that contribute rows. Scanning stops once this many
        tables have each added at least one row.
    searchterms (list of str): the terms to filter each table for. A row is added when one of
        the terms occurs (case-sensitively) in the text of its first cell. The row label is
        that text with everything but letters and spaces removed; a row whose label was
        already added is skipped, so only the first row per label is kept.
    year (int): the year the document corresponds to
    quarter(int): the quarter the document corresponds to

    Returns:
    DataFrame: the result of reformat_table applied to the collected rows, i.e. a table combining
        all rows in all examined tables in the report that contained one of the search terms in
        its row label

    """
    non_letters = re.compile('[^a-z A-Z]')
    # NOTE(review): no parser is named here, so BeautifulSoup uses whichever one is installed
    # and results can differ between environments. Consider pinning one after checking that the
    # extracted tables are unchanged.
    soup = BeautifulSoup(text)
    row_list = []
    indices = []
    titles_added = set()  # Labels of rows already added. Rows with duplicate labels are not added.
    table_num = 0  # Number of tables that contributed at least one row so far.
    for table in soup.find_all("table"):
        added_rows = False
        for row in table.find_all('tr'):
            data_row = [column.get_text().replace('\n', '') for column in row.find_all('td')]
            if not data_row:
                continue
            if not any(term in data_row[0] for term in searchterms):  # TODO: re-add .lower()?
                continue
            row_label = non_letters.sub('', data_row[0].strip()).strip()
            if row_label in titles_added:
                continue
            titles_added.add(row_label)
            added_rows = True
            indices.append(row_label)
            # Everything after the label cell is kept; reformat_table picks the number out.
            row_list.append(data_row[1:])
        if added_rows:
            table_num += 1
        if table_num >= limit:
            break
    return reformat_table(pd.DataFrame(row_list, index=indices), year, quarter)

In [35]:
def _parse_number(cell):
    """Returns the number written in cell as a float, or None if cell does not hold a number."""
    # Ragged rows are padded with None/NaN by pandas, so non-string cells are simply skipped.
    if not isinstance(cell, str) or not any(char.isdigit() for char in cell):
        return None
    cleaned = cell
    for punctuation in [',', '(', ')', '$']:
        cleaned = cleaned.replace(punctuation, '')
    try:
        value = float(cleaned)
    except ValueError:
        # Contains digits but is not a number (e.g. a footnote reference).
        return None
    # An opening parenthesis marks a negative amount in financial statements.
    return -abs(value) if '(' in cell else value


def reformat_table(df, year, quarter):
    """
    Trims tables down to only the desired numerical information

    For every row the first cell that parses as a number is kept; thousands separators,
    dollar signs and parentheses are removed, and a value written with an opening parenthesis
    is returned as negative. A row without any numeric cell gets NaN, so the result always has
    one value per row label. A parenthesis stored in a neighbouring cell is not detected.

    Parameters:
    df (DataFrame): the table to be trimmed; cells are expected to be strings
    year (int): the year the table corresponds to
    quarter(int): the quarter the table corresponds to

    Returns:
    DataFrame: a single column (named 0) indexed by 'Year', 'Quarter' and then the row labels of
        df, holding the year, the quarter and the number found for each row

    """

    data_column = [year, quarter]
    for cells in df.values:
        numbers = (_parse_number(cell) for cell in cells)
        first_number = next((number for number in numbers if number is not None), float('nan'))
        data_column.append(first_number)

    new_index = ['Year', 'Quarter'] + df.index.tolist()
    return pd.DataFrame(data=data_column, index=new_index, columns=[0])

In [36]:
def get_html_tables(company_names, startyear, startquarter=1, endyear=2018, endquarter=4, limit=3, searchterms=['revenue','gross profit','net income','earning','cash'], headers=None):
    """
    Forms tables of the desired search terms of the given company in the given timeframe

    Each report found by get_reports is downloaded; reports whose text contains no '<html' or
    '<HTML' tag are skipped, the others are passed to make_tables.

    Parameters:
    See documentation of get_reports and make_tables. Note that make_tables matches the search
    terms case-sensitively, so the lowercase defaults only match lowercase row labels.
    headers (dict): optional HTTP headers sent with every report request

    Returns:
    Generator: each element is a tuple with the form (year, quarter, amalgamated table)

    Raises:
    requests.HTTPError: if a report request is answered with an error status

    """

    for year, quarter, url in get_reports(company_names, startyear, startquarter, endyear, endquarter):
        response = req.get(url, headers=headers, timeout=60)
        # An error page is HTML too; stop here rather than parse it as if it were a filing.
        response.raise_for_status()
        text = response.text
        if '<html' in text or '<HTML' in text:
            yield (year, quarter, make_tables(text, limit, searchterms, year, quarter))

In [40]:
# Row labels to pull out of Apple's quarterly reports (matched case-sensitively by make_tables).
searchterms_apple = ['Net sales', 'Cost of sales', 'Total operating expenses', 'Net income', 
                     'Diluted', 'Total current assets', 'Total assets', 'Total current liabilities', 
                     'Total liabilities', 'Total shareholders equity', 'Cash and cash equivalents, beginning', 
                     'Depreciation', 'Cash generated by operating activities', 'Cash and cash equivalents, end']
reports_apple = get_html_tables(['apple inc'], startyear=2008, startquarter=1, endyear=2018, endquarter=3, searchterms=searchterms_apple)


def _first_label_position(labels, keyword):
    """Returns the position of the first label containing keyword (ignoring case), or None."""
    return next((pos for pos, label in enumerate(labels) if keyword in label.lower()), None)


# One single-row frame per report, concatenated once at the end instead of appending in the loop.
quarterly_frames = []
for year, quarter, report_df in reports_apple:
    labels = report_df.index.tolist()
    # Both positions are looked up on the original labels, before any renaming.
    diluted_pos = _first_label_position(labels, 'diluted')
    depreciation_pos = _first_label_position(labels, 'depreciation')

    # Rename only rows that were actually found, so a report lacking one of them
    # is not mislabelled and does not raise.
    if diluted_pos is not None:
        labels[diluted_pos] = 'Diluted earnings/common share ($)'
        # Assumes the row right after the first "diluted" row is the diluted share count -- TODO confirm
        if diluted_pos + 1 < len(labels):
            labels[diluted_pos + 1] = 'Diluted number of shares (thousands)'
    if depreciation_pos is not None:
        labels[depreciation_pos] = 'Depreciation and Amortization'

    values = report_df.iloc[:, 0].tolist()
    quarterly_frames.append(pd.DataFrame([values], index=[str(year) + '-' + str(quarter)], columns=labels))

# pd.concat rejects an empty list, so fall back to an empty frame when no report was found.
combined_df = pd.concat(quarterly_frames) if quarterly_frames else pd.DataFrame()

display(combined_df)

Unnamed: 0,Year,Quarter,Net sales,Cost of sales,Total operating expenses,Net income,Diluted earnings/common share ($),Diluted number of shares (thousands),Total assets,Total current liabilities,Total liabilities,Total liabilities and shareholders equity,Cash and cash equivalents beginning of the period,Depreciation and Amortization,Cash generated by operating activities,Cash and cash equivalents end of the period
2008-1,2008.0,1.0,9608.0,6276.0,1206.0,1581.0,1.76,26189.0,30039.0,10535.0,13235.0,30039.0,9352.0,106.0,2787.0,9162.0
2008-2,2008.0,2.0,7512.0,5038.0,1159.0,1045.0,1.16,26736.0,30471.0,9634.0,12418.0,30471.0,9352.0,222.0,3980.0,9070.0
2008-3,2008.0,3.0,7464.0,4864.0,1208.0,1072.0,1.19,27998.0,31709.0,9218.0,12087.0,31709.0,9352.0,339.0,5301.0,9373.0
2009-1,2009.0,1.0,10167.0,6635.0,1406.0,1605.0,1.78,35163.0,42787.0,14757.0,19878.0,42787.0,11875.0,158.0,3938.0,7236.0
2009-2,2009.0,2.0,8163.0,5192.0,1304.0,1205.0,1.33,33853.0,43237.0,13751.0,18926.0,43237.0,11875.0,330.0,4779.0,4466.0
2009-3,2009.0,3.0,8337.0,5314.0,1351.0,1229.0,1.35,35170.0,48140.0,16661.0,22252.0,48140.0,11875.0,506.0,7049.0,5605.0
2010-1,2010.0,1.0,15683.0,9272.0,1686.0,3378.0,3.67,33332.0,53926.0,13097.0,18158.0,53926.0,5263.0,209.0,5781.0,7609.0
2010-2,2010.0,2.0,13499.0,7874.0,1646.0,3074.0,3.33,32336.0,57057.0,12229.0,17709.0,57057.0,5263.0,425.0,8111.0,10018.0
2010-3,2010.0,3.0,15700.0,9564.0,1902.0,3253.0,3.51,36033.0,64725.0,15612.0,21614.0,64725.0,5263.0,698.0,12912.0,9705.0
2011-1,2011.0,1.0,26741.0,16443.0,2471.0,6004.0,6.43,43927.0,86742.0,23795.0,32076.0,86742.0,11261.0,356.0,9773.0,10734.0
