In [91]:
import requests as req

In [92]:
def download_files(startyear=1993, endyear=2018, headers=None, timeout=30):
    """
    Downloads the quarterly EDGAR company index files into the current directory.

    Each index is saved as 'Index-<year>-<quarter>.txt'.

    Parameters:
    startyear (int): the first year to download, inclusive
    endyear (int): the last year to download, inclusive
    headers (dict): optional HTTP headers sent with every request, e.g. a User-Agent
        (NOTE(review): SEC fair-access guidance reportedly expects a declared
        User-Agent -- confirm and pass one here if requests are rejected)
    timeout (float): seconds to wait for the server before a request is abandoned

    Raises:
    requests.HTTPError: if the server answers with an error status. Nothing is written
        for that quarter, so an error page never ends up on disk as an index file.

    """

    for year in range(startyear, endyear + 1):
        for quarter in range(1, 5):
            url = 'https://www.sec.gov/Archives/edgar/full-index/{}/QTR{}/company.idx'.format(year, quarter)
            r = req.get(url, allow_redirects=True, headers=headers, timeout=timeout)
            # Fail loudly instead of saving the body of a 4xx/5xx response as index data.
            r.raise_for_status()
            with open('Index-{}-{}.txt'.format(year, quarter), "wb") as f:
                f.write(r.content)

In [93]:
import os

def get_reports(company, startyear, startquarter, endyear, endquarter):
    """
    Reads the downloaded index files and yields the URLs of all 10-K/10-Qs of the given company.

    Only files in the current directory named 'Index-<year>-<quarter>.txt' are read; any
    other file (including unrelated .txt files) is ignored. Index files are processed in
    chronological order, so the reports come out ordered by (year, quarter).

    Parameters:
    company (str): company name exactly as listed in the index file
    startyear (int): the year to start looking for reports, inclusive
    startquarter (int): the quarter in startyear to start looking for reports, inclusive
    endyear (int): the year to stop looking for reports, inclusive
    endquarter(int): the quarter in endyear to stop looking for reports, inclusive

    Returns:
    Generator: each element is a tuple with the form (year of report, quarter of report, url of report)

    """

    # Collect (year, quarter, filename) for every well-formed index file name.
    index_files = []
    for name in os.listdir('.'):
        if not name.endswith('.txt'):
            continue
        parts = name[:-4].split('-')
        if len(parts) != 3 or parts[0] != 'Index':
            continue
        try:
            index_files.append((int(parts[1]), int(parts[2]), name))
        except ValueError:
            # Looks like an index name but year/quarter are not numbers: not ours.
            continue

    # os.listdir gives no ordering guarantee; sort so output is chronological.
    for year, quarter, name in sorted(index_files):
        if year < startyear or year > endyear:
            continue
        if (year == startyear and quarter < startquarter) or (year == endyear and quarter > endquarter):
            continue
        # latin-1 maps every byte to a character, so a stray non-UTF-8 byte in the
        # file cannot abort the scan with UnicodeDecodeError. The `with` block closes
        # the file even if the consumer stops iterating early.
        with open(name, 'r', encoding='latin-1') as index_file:
            for line in index_file:
                if line.startswith(company) and ('10-K' in line or '10-Q' in line):
                    # The file path is the last column, so search from the right.
                    url_index = line.rfind('edgar/')
                    if url_index == -1:
                        continue  # malformed line without a file path
                    url = line[url_index:].strip()
                    yield (year, quarter, 'https://www.sec.gov/Archives/' + url)

In [94]:
from bs4 import BeautifulSoup
import csv
import pandas as pd

def make_tables(text, limit, searchterms):
    """
    Parses HTML text and combines the matching rows of its tables into one DataFrame

    Parameters:
    text (str): the text of the company report
    limit (int): the max number of tables to search through in the report; tables past
        the first `limit` are ignored, and a limit of 0 or less examines no tables
    searchterms (list of str): the terms to filter each table for. Only rows with one of the terms
        somewhere in the row label will be added to the amalgamated table. Matching is a
        case-sensitive substring test against the text of the row's first cell.

    Returns:
    DataFrame: a table combining all rows in all examined tables in the report that contained one
        of the search terms in its row label. The row label becomes the index and the
        remaining cells become the columns.

    """

    # Name the parser explicitly so the result does not depend on which optional
    # parser libraries happen to be installed (and to avoid bs4's guessed-parser warning).
    soup = BeautifulSoup(text, 'html.parser')
    row_list = []
    indices = []
    for table_num, table in enumerate(soup.find_all("table")):
        if table_num >= limit:
            break
        # NOTE(review): find_all is recursive, so a row inside a nested table is visited
        # once for each enclosing table -- possibly the source of duplicated rows; confirm
        # whether that is intended.
        for row in table.find_all('tr'):
            data_row = [column.get_text().replace('\n', '') for column in row.find_all('td')]
            if data_row and any(term in data_row[0] for term in searchterms):
                indices.append(data_row[0])
                row_list.append(data_row[1:])
    return pd.DataFrame(row_list, index=indices)

In [95]:
def get_html_tables(company, startyear, startquarter=1, endyear=2018, endquarter=4, limit=12, searchterms=None, timeout=30):
    """
    Forms tables of the desired search terms of the given company in the given timeframe

    Reports that cannot be fetched (HTTP error status) are skipped with a warning, and
    reports whose text contains no '<html' tag (in any letter case) are skipped silently.

    Parameters:
    See documentation of get_reports and make_tables.
    searchterms (list of str): defaults to ['revenue', 'profit', 'income'] when omitted
    timeout (float): seconds to wait for the server before a report request is abandoned

    Returns:
    Generator: each element is a tuple with the form (year, quarter, amalgamated table)

    """
    import warnings

    # Resolve the default here rather than in the signature so no list object is
    # shared between calls.
    if searchterms is None:
        searchterms = ['revenue', 'profit', 'income']

    reports = get_reports(company, startyear, startquarter, endyear, endquarter)
    for year, quarter, url in reports:
        response = req.get(url, timeout=timeout)
        if not response.ok:
            # An error page is not a report; parsing it would yield a misleading empty table.
            warnings.warn('Skipping {} (HTTP status {})'.format(url, response.status_code))
            continue
        text = response.text
        if '<html' in text.lower():
            data = make_tables(text, limit, searchterms)
            yield (year, quarter, data)

In [96]:
# Show the amalgamated table of every 2018 IBM report, one per (year, quarter).
tables = get_html_tables("INTERNATIONAL BUSINESS MACHINE", startyear=2018)
for item in tables:
    year, quarter, frame = item
    print('Year ', year, ' - Quarter ', quarter)
    display(frame)

Year  2018  - Quarter  1


Year  2018  - Quarter  2


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Total revenue,,19072,,18155,,,,,,,,,,,,,
Gross profit,,8247,,7944,*,,,,,,,,,,,,
Expense and other (income):,,,,,,,,,,,,,,,,,
Intellectual property and custom development income,,(317,),(445,),,,,,,,,,,,,
Other (income) and expense,,413,,319,*,,,,,,,,,,,,
Total expense and other (income),,7111,,6521,*,,,,,,,,,,,,
Income from continuing operations before income taxes,,1136,,1424,,,,,,,,,,,,,
Provision for/(benefit from) income taxes,,(540,),(329,),,,,,,,,,,,,
Net income,,$,1679,,$,1750.0,,,,,,,,,,,
Net income,,$,1679,,$,1750.0,,,,,,,,,,,


Year  2018  - Quarter  3


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Total revenue,,20003,,19289,,39075,,37443,,,,,,,,,
Gross profit,,9199,,8968,*,17445,,16912,*,,,,,,,,
Expense and other (income):,,,,,,,,,,,,,,,,,
Intellectual property and custom development income,,(250,),(365,),(567,),(810,),,,,,,,,
Other (income) and expense,,280,,273,*,692,,592,*,,,,,,,,
Total expense and other (income),,6423,,6525,*,13534,,13046,*,,,,,,,,
Income from continuing operations before income taxes,,2776,,2443,,3911,,3867,,,,,,,,,
Provision for/(benefit from) income taxes,,373,,111,,(166,),(218,),,,,,,,,
Net income,,$,2404,,$,2331,,$,4083,,$,4082.0,,,,,
Net income,,$,2404,,$,2331,,$,4083,,$,4082.0,,,,,


Year  2018  - Quarter  4


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Total revenue,,18756,,19153,,57830,,56597,,,,,,,,,
Gross profit,,8803,,8981,*,26249,,25894,*,,,,,,,,
Expense and other (income):,,,,,,,,,,,,,,,,,
Intellectual property and custom development income,,(275,),(308,),(842,),"(1,118",),,,,,,,,
Other (income) and expense,,275,,159,*,968,,751,*,,,,,,,,
Total expense and other (income),,5807,,5917,*,19341,,18962,*,,,,,,,,
Income from continuing operations before income taxes,,2996,,3065,,6908,,6931,,,,,,,,,
Provision for income taxes,,304,,339,,138,,120,,,,,,,,,
Net income,,$,2694,,$,2726,,$,6777,,$,6807.0,,,,,
Net income,,$,2694,,$,2726,,$,6777,,$,6807.0,,,,,
