In [1]:
import requests as req

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import csv

In [3]:
def download_files(startyear=1993, endyear=2018):
    """
    Downloads the quarterly EDGAR company index files into the current directory.

    One file is written per (year, quarter) and named 'Index-<year>-<quarter>.txt'.

    Parameters:
    startyear (int): the first year to download, inclusive (default 1993)
    endyear (int): the last year to download, inclusive (default 2018)

    Raises:
    requests.HTTPError: if the server answers with an error status. Nothing is written
        to disk for that quarter.

    """

    for year in range(startyear, endyear + 1):
        for quarter in range(1, 5):
            url = ('https://www.sec.gov/Archives/edgar/full-index/'
                   + str(year) + '/QTR' + str(quarter) + '/company.idx')
            r = req.get(url, allow_redirects=True)
            # Fail loudly rather than saving an error page as if it were an index file
            r.raise_for_status()
            with open('Index-' + str(year) + '-' + str(quarter) + '.txt', "wb") as f:
                f.write(r.content)

In [4]:
import os

def get_reports(company_names, startyear, startquarter, endyear, endquarter):
    """
    Reads the index files in the current directory and yields the URL of every 10-Q
    filing of the given company inside the requested period.

    Only files named exactly 'Index-<4-digit year>-<1-digit quarter>.txt' are read; any
    other file in the directory is ignored. Files are visited in sorted name order, so
    results are produced chronologically by (year, quarter).

    Parameters:
    company_names (list of str, or a single str): names for the company exactly as listed
        in the index files. A line matches when it starts with any of the names.
    startyear (int): the year to start looking for reports, inclusive
    startquarter (int): the quarter in startyear to start looking for reports, inclusive
    endyear (int): the year to stop looking for reports, inclusive
    endquarter(int): the quarter in endyear to stop looking for reports, inclusive

    Returns:
    Generator: each element is a tuple with the form (year of report, quarter of report, url of report)

    """

    # str.startswith accepts a tuple of prefixes. A bare string is wrapped first so
    # that it is not treated as a sequence of one-character names.
    if isinstance(company_names, str):
        prefixes = (company_names,)
    else:
        prefixes = tuple(company_names)

    for file in sorted(os.listdir('.')):
        # Accept only 'Index-YYYY-Q.txt' (16 characters); skip unrelated .txt files
        if len(file) != 16 or not file.startswith('Index-') or not file.endswith('.txt'):
            continue
        if not (file[6:10].isdecimal() and file[10] == '-' and file[11].isdecimal()):
            continue
        year = int(file[6:10])
        if year < startyear or year > endyear:
            continue
        quarter = int(file[11])
        if (year == startyear and quarter < startquarter) or (year == endyear and quarter > endquarter):
            continue
        # NOTE(review): latin-1 is used because it can decode any byte sequence, so a
        # stray non-UTF-8 byte in an index file cannot abort the scan - confirm this
        # matches the encoding of the downloaded files.
        with open(file, 'r', encoding='latin-1') as txt:
            for line in txt:
                if line.startswith(prefixes) and '10-Q' in line: # TODO: Include 10-Ks?
                    url_index = line.find('edgar/')
                    if url_index == -1:
                        # No file path on this line, so there is nothing to build a URL from
                        continue
                    url = line[url_index:].strip()
                    yield (year, quarter, 'https://www.sec.gov/Archives/' + url)

In [5]:
def make_tables(text, limit, searchterms):
    """
    Parses HTML text and combines the matching rows of its tables into a single DataFrame

    A row matches when its first cell (the row label), lowercased, contains one of the
    search terms. Matching is case-insensitive. A label that was already collected is
    not collected again, even if it reappears in a later table.

    Scanning stops after a table once either the number of collected rows has reached the
    number of search terms, or `limit` tables have each contributed at least one row.

    Parameters:
    text (str): the text of the company report
    limit (int): the max number of row-contributing tables to collect from
    searchterms (list of str): the terms to filter each table for. Only rows with one of the terms
        somewhere in the row label will be added to the amalgamated table.

    Returns:
    DataFrame: a table combining all rows in all examined tables in the report that contained one
        of the search terms in its row label. The index holds the row labels and the columns
        hold the remaining cells of each row.

    """

    # Name the parser explicitly so the result does not depend on which optional parsers
    # are installed, and so bs4 does not warn about guessing one.
    soup = BeautifulSoup(text, 'html.parser')
    # Labels are lowercased before comparison, so the terms must be too
    terms = [term.lower() for term in searchterms]
    row_list = []
    indices = []
    seen_titles = set() # Titles of rows that were added. Rows with duplicate titles will not be added.
    table_num = 0
    for table in soup.find_all("table"):
        added_rows = False
        for row in table.find_all('tr'):
            data_row = [cell.get_text().replace('\n', '') for cell in row.find_all('td')]
            if not data_row:
                continue
            title = data_row[0]
            if title in seen_titles:
                continue
            lowered_title = title.lower()
            if any(term in lowered_title for term in terms):
                seen_titles.add(title)
                added_rows = True
                indices.append(title)
                row_list.append(data_row[1:])
        if added_rows:
            table_num += 1
        if len(indices) >= len(terms) or table_num >= limit: # TODO: remove in final version
            break
    return pd.DataFrame(row_list, index=indices)

In [6]:
def get_html_tables(company_names, startyear, startquarter=1, endyear=2018, endquarter=4, limit=12, searchterms=['revenue','profit','income','earning','cash']):
    """
    Forms tables of the desired search terms of the given company in the given timeframe

    Each report URL produced by get_reports is downloaded. Reports whose text contains
    no '<html' tag (in any letter case) are skipped; the others are passed to make_tables.

    Parameters:
    See documentation of get_reports and make_tables.

    Returns:
    Generator: each element is a tuple with the form (year, quarter, amalgamated table)

    Raises:
    requests.HTTPError: if a report download answers with an error status

    """

    # The default searchterms list is shared between calls; it is only read here.
    for year, quarter, url in get_reports(company_names, startyear, startquarter, endyear, endquarter):
        response = req.get(url)
        # An error page is HTML too; without this check it would be parsed like a filing
        response.raise_for_status()
        text = response.text
        if '<html' in text.lower():
            yield (year, quarter, make_tables(text, limit, searchterms))

In [7]:
import numpy as np
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

# starting: ibm = 2003, verizon = 2002, comcast = 2004, intel = 2001, amazon = 2001
# testtable.csv has no header row; its columns are, in order:
#   'year', 'quarter', 'total revenue', 'total cost', 'net income', 'cash dividends/share', 'total assets', 'total liabilities'
# Column 5 ('cash dividends/share') is the prediction target; the other seven are features.
feature_names = ['year', 'quarter', 'total revenue', 'total cost', 'net income', 'total assets', 'total liabilities']

data = pd.read_csv('testtable.csv', index_col=False, header=None)

# Features: every column except the target, with the last row left out
training = data.drop(columns=5).iloc[:-1].copy()
training.columns = list(feature_names)

# Labels: the target column as a flat array, with the last row left out as well
labels = np.ravel(data[[5]])[:-1]

# The single sample the fitted models are asked to predict
testing = pd.DataFrame([[2018,3,18756,9953,2694,121990,102071]], columns=list(feature_names))

display(labels)
print("Training data:")
display(training)

array([0.16, 0.16, 0.18, 0.18, 0.18, 0.2 , 0.2 , 0.2 , 0.3 , 0.3 , 0.3 ,
       0.4 , 0.4 , 0.4 , 0.5 , 0.5 , 0.5 , 0.55, 0.55, 0.55, 0.65, 0.65,
       0.65, 0.75, 0.75, 0.75, 0.85, 0.85, 0.85, 0.95, 0.95, 0.95, 1.1 ,
       1.1 , 1.3 , 1.3 , 1.3 , 1.4 , 1.4 , 1.4 , 1.5 , 1.5 , 1.5 , 1.57])

Training data:


Unnamed: 0,year,quarter,total revenue,total cost,net income,total assets,total liabilities
0,2003,3,21522,13710,1785,97190,69864
1,2004,1,22250,14241,1602,101825,73640
2,2004,2,23153,14628,1988,99582,70752
3,2004,3,23429,14783,1800,100676,70974
4,2005,1,22908,14654,1402,104899,74979
5,2005,2,22270,13495,1829,103388,73263
6,2005,3,21529,12791,1516,101009,70774
7,2006,1,20659,12571,1708,102468,69737
8,2006,2,21890,12876,2022,103377,69828
9,2006,3,22617,13126,2222,104155,69907


In [8]:
print("Testing data")
display(testing)
print("Target label (cash dividends/share in $): 1.57")

# Each entry pairs the message to print with the estimator to fit.
# All estimators are fitted on the same training data and asked for the same single prediction.
candidates = [
    ('Ridge regressor prediction: ', Ridge(random_state=0, alpha=0.5)),
    ('Lasso regressor prediction: ', Lasso(random_state=0, alpha=0.5)),
    ('Decision tree regressor prediction: ', DecisionTreeRegressor(random_state=0, max_depth=5)),
    ('Random forest regressor prediction: ', RandomForestRegressor(random_state=0, n_estimators=50)),
    ('Bagging regressor prediction: ', BaggingRegressor(random_state=0, n_estimators=50)),
]

for message, regr in candidates:
    regr.fit(training, labels)
    prediction = regr.predict(testing)
    print(message, round(prediction[0], 4))

Testing data


Unnamed: 0,year,quarter,total revenue,total cost,net income,total assets,total liabilities
0,2018,3,18756,9953,2694,121990,102071


Target label (cash dividends/share in $): 1.57
Ridge regressor prediction:  1.6246
Lasso regressor prediction:  1.5281
Decision tree regressor prediction:  1.5
Random forest regressor prediction:  1.471
Bagging regressor prediction:  1.4704
