In [1]:
import requests as req
import pandas as pd
from bs4 import BeautifulSoup
import csv
import re
import itertools as iter
import copy
import time
from alpha_vantage.timeseries import TimeSeries

In [2]:
def download_files():
    """
    Fetches every quarterly EDGAR company index from 1993 through 2018 and stores it in the current directory.
    
    Each index is written, exactly as received, to a file named 'Index-<year>-<quarter>.txt'.
    The HTTP status of the responses is not checked.
    
    """
    
    base_url = 'https://www.sec.gov/Archives/edgar/full-index/'
    periods = ((y, q) for y in range(1993, 2019) for q in range(1, 5))
    for year, quarter in periods:
        index_url = f'{base_url}{year}/QTR{quarter}/company.idx'
        response = req.get(index_url, allow_redirects=True)
        with open(f'Index-{year}-{quarter}.txt', "wb") as index_file:
            index_file.write(response.content)

In [3]:
import os

def get_reports(company_name, startyear, startquarter, endyear, endquarter):
    """
    Reads the downloaded index files and returns a generator of URLs of the 10-Qs of the given company.
    
    Index files are the '.txt' files in the current directory named like 'Index-<year>-<quarter>.txt' 
    (the names written by download_files). Other '.txt' files are ignored. The index files are visited 
    in chronological order and at most one report, the first matching line, is yielded per index file.
    
    Parameters:
    company_name (str): name of the company; matched case-insensitively against the start of each index line
    startyear (int): the year to start looking for reports, inclusive
    startquarter (int): the quarter in startyear to start looking for reports, inclusive
    endyear (int): the year to stop looking for reports, inclusive
    endquarter(int): the quarter in endyear to stop looking for reports, inclusive
    
    Returns:
    Generator: each element is a tuple with the form (year of report, quarter of report, url of report)
    
    """
    
    wanted_prefix = company_name.lower()
    
    index_files = []
    for file in os.listdir('.'):
        if file[-4:] != '.txt':
            continue
        # Unrelated .txt files may share the directory: skip any name that does not 
        # carry a year and a quarter at the positions used by download_files.
        try:
            year = int(file[6:10])
            quarter = int(file[11])
        except (ValueError, IndexError):
            continue
        index_files.append((year, quarter, file))
    
    # os.listdir returns names in arbitrary order; sorting makes the reports come out chronologically.
    for year, quarter, file in sorted(index_files):
        if year < startyear or year > endyear:
            continue
        if (year == startyear and quarter < startquarter) or (year == endyear and quarter > endquarter):
            continue
        # The with-block closes the file even when the consumer abandons the generator early.
        with open(file, 'r') as txt:
            for line in txt:
                if line.lower().startswith(wanted_prefix) and '10-Q ' in line: 
                    url_index = line.index('edgar/')
                    url = line[url_index:].rstrip('\n').strip()
                    yield (year, quarter, 'https://www.sec.gov/Archives/' + url)
                    break

In [4]:
def make_tables(company_name, text, limit, search_terms, year, quarter, mapping, default_side, max_tables):
    """
    Parses HTML text and returns a single-row DataFrame containing the desired data
    
    Every <table> of the document is scanned row by row. The label of a row is the text of its first 
    non-empty cell, reduced to letters and spaces. A row is used when one of the search terms is a 
    case-sensitive substring of that label; only the first such term, in search_terms order, is 
    considered for the row.
    
    Parameters:
    company_name (str): the name of the company
    text (str): the text of the company report
    limit (int): the scan stops once this many tables have contributed at least one value
    search_terms (list of str): the terms to filter each table for. Write them without punctuation or digits, 
        because those characters are removed from the row labels before matching.
    year (int): the year the document corresponds to
    quarter(int): the quarter the document corresponds to
    mapping (dict): a dict with items of the form (term: [locations, amount])
        term (str): one of the search terms for which detailed location/amount information is required
        locations (list of int): index(es) of the targeted value(s) among the numeric cells of the row
        amount (int): the number of times to read in values of this term. When amount is greater than 1 
            the resulting columns are numbered in the order found (term1, term2, ...)
    default_side (str): 'left' keeps the first numeric cell of a row whose term is not in mapping; 
        any other value keeps the last numeric cell
    max_tables (int): cap on the number of tables examined. NOTE(review): the counter is decremented 
        before it is compared with 0, so a value of n examines n - 1 tables -- confirm this is intended
    
    Returns:
    DataFrame: one row, indexed '<company_name> <year>-<quarter>', holding 'Year' and 'Quarter' followed by 
        one column per value found
    
    """
    # Removes every character that is not an ASCII letter or a space
    regex = re.compile('[^a-z A-Z]')
    # NOTE(review): no parser is named here, so the choice is left to BeautifulSoup -- confirm which one is expected
    soup = BeautifulSoup(text)
    tables = soup.find_all("table")
    column_list = ['Year', 'Quarter']
    data_values = [year, quarter]
    found_terms = [] # By default, only the first occurence of a term in a row label will be used
    table_num = 0 # Number of tables that contributed at least one value so far
    diluted_found = False
    equity_found = False
    # Private copy: the remaining amount of each mapped term is counted down below, 
    # while the caller's mapping keeps the starting amounts
    term_map = copy.deepcopy(mapping)
    
    for table in tables:
        max_tables -= 1
        if max_tables == 0:
            break
        added_rows = False
        for row in table.find_all('tr'):
            data_row = []
            columns = row.find_all('td')
            if(len(columns) == 0):
                continue
            # The row label is taken from the first cell that has any text
            first_column = ''
            for col in columns:
                if len(col.get_text().strip()) > 0:
                    first_column = regex.sub('', col.get_text()) # TODO: Does using regex break anything?
                    break
            # Collapse runs of whitespace in the label to single spaces
            row_label_tokens = first_column.split()
            row_label = ' '.join(row_label_tokens)
            row_label = regex.sub('', row_label).strip()
            # Every path through the matching branch below ends in a break, 
            # so a row is handled by at most one search term
            for term in search_terms:
                if term in row_label:
                    # An unmapped term is only read once
                    if term in found_terms and term not in term_map:
                        break
                    # A 'Net loss' row is ignored once a 'Net income' column exists
                    if term == 'Net loss' and 'Net income' in column_list:
                        break
                    # Collect the cells after the first one that contain a digit and no letters
                    row_values = []
                    for i in range(1, len(columns)):
                        cell_text = columns[i].get_text()
                        if any(char.isdigit() for char in cell_text) and not re.search('[a-zA-Z]', cell_text): # TODO: does the second part break anything?
                            # NOTE(review): the sign is prepended to the unstripped text, so a cell with 
                            # whitespace before '(' would end up as '- 123' -- confirm this cannot occur
                            if cell_text.strip()[0] == '(': # TODO: Keep making numbers in parentheses negative?
                                cell_text = '-' + cell_text
                            for punctuation in [',', '(', ')', '\n', '$']:
                                cell_text = cell_text.replace(punctuation, '')
                            row_values.append(cell_text)
                    if len(row_values) == 0:
                        break
                    # Only the first usable row whose term mentions 'diluted' (any case) is kept; 
                    # the same applies to 'equity'
                    if 'diluted' in term.lower():
                        if diluted_found:
                            break
                        diluted_found = True
                    if 'equity' in term.lower():
                        if equity_found:
                            break
                        equity_found = True
                    found_terms.append(term)
                    if term in term_map:
                        # Remaining number of rows to read for this term
                        term_amount = term_map[term][1]
                        if term_amount == 0:
                            break
                        column_label = term
                        term_start_amount = mapping[term][1]
                        # Terms read more than once get a running number: 1 for the first row, 2 for the second, ...
                        if term_start_amount > 1:
                            column_label += str(term_start_amount - term_amount + 1)
                        positions = term_map[term][0]
                        for position in positions:
                            if len(positions) == 1:
                                column_list.append(column_label)
                            else:
                                column_list.append(column_label + ' [' + str(position) + ']')
                            data_values.append(float(row_values[position].strip()))
                        term_map[term][1] -= 1
                    else:
                        column_list.append(term)
                        if default_side == 'left':
                            data_values.append(float(row_values[0].strip()))
                        else:
                            data_values.append(float(row_values[-1].strip()))
                    added_rows = True
                    break
        
        if added_rows:
            table_num += 1
        if table_num >= limit:
            break
    return pd.DataFrame([data_values], index=[company_name + ' ' + str(year) + '-' + str(quarter)], columns=column_list)

In [5]:
def get_html_tables(company_name, search_terms, startyear, startquarter=1, endyear=2018, endquarter=4, 
                    limit=3, mapping=dict(), default_side='left', max_tables=17):
    """
    Downloads every matching report of the given company and builds a table of the search terms from it
    
    Reports whose text contains neither '<html' nor '<HTML' are skipped.
    
    Parameters:
    See documentation of get_reports and make_tables.
    
    Returns:
    Generator: each element is a tuple with the form (year, quarter, table returned by make_tables)
    
    """
    
    html_markers = ('<html', '<HTML')
    for year, quarter, url in get_reports(company_name, startyear, startquarter, endyear, endquarter):
        text = req.get(url).text
        if not any(marker in text for marker in html_markers):
            continue
        table = make_tables(company_name, text, limit, search_terms, year, quarter, mapping, default_side, max_tables)
        yield (year, quarter, table)

In [14]:
def get_nvidia_data():
    """
    Collects the 2008-2018 10-Q figures of Nvidia into one DataFrame, one row per report.
    
    'Net loss' is stored under 'Net income' and 'Total shareholders equity' under 'Total stockholders equity'. 
    For reports before 2015 every column except 'Year', 'Quarter' and 'Diluted' is divided by 1000 and rounded.
    
    Returns:
    DataFrame: the combined table, restricted to the columns of the last report processed
    
    """
    searchterms_nvidia = ['Revenue', 'Cost of revenue', 'Net income', 'Net loss', 'Diluted', 'Total current assets', 'Total assets', 
                          'Total current liabilities', 'Total stockholders equity', 'Total shareholders equity', 'Depreciation', 
                          'operating activities', 'cash equivalents at beginning', 'cash equivalents at end']
    reports_nvidia = get_html_tables('Nvidia', startyear=2008, startquarter=1, endyear=2018, endquarter=4, 
                                                max_tables=20, limit=5, search_terms=searchterms_nvidia)
    renames = {'Net loss': 'Net income', 'Total shareholders equity': 'Total stockholders equity'}
    unscaled_columns = ('Diluted', 'Year', 'Quarter')

    combined_df = pd.DataFrame()

    for year, _quarter, report_df in reports_nvidia:
        df = report_df.rename(columns=renames)
        if year < 2015: # After 2014, Nvidia changed from counting in thousands to counting in millions
            for col in df.columns:
                if col not in unscaled_columns:
                    df[col] = round(df[col] / 1000)
        # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.0+. 
        # Re-selecting df's columns keeps the report's column order.
        combined_df = pd.concat([combined_df, df])[df.columns.tolist()]

    return combined_df

In [34]:
def get_ti_data():
    """
    Collects the 2008-2018 10-Q figures of Texas Instruments into one DataFrame, one row per report.
    
    'Net revenue' is stored under 'Revenue' and the 'at beginning'/'at end' cash labels under their 
    shorter 'beginning'/'end' forms. A few values of the first rows are then overwritten by hand.
    
    Returns:
    DataFrame: the combined table, restricted to the columns of the last report processed; 
        an empty DataFrame when no report was found
    
    """
    searchterms_ti = ['Net revenue', 'Cost of revenue', 'Revenue', 'Net income', 'Diluted', 'Total current assets', 'Total assets', 
                      'Total current liabilities', 'Total liabilities', 'Total stockholders equity', 'Depreciation', 
                      'operating activities', 'Cash and cash equivalents at beginning', 'Cash and cash equivalents beginning', 
                      'Cash and cash equivalents at end', 'Cash and cash equivalents end']
    reports_ti = get_html_tables('Texas Instruments', startyear=2008, startquarter=1, endyear=2018, endquarter=4, 
                                            max_tables=20, limit=5, search_terms=searchterms_ti)
    renames = {'Net revenue': 'Revenue', 
               'Cash and cash equivalents at beginning': 'Cash and cash equivalents beginning', 
               'Cash and cash equivalents at end': 'Cash and cash equivalents end'}

    combined_df = pd.DataFrame()

    for _year, _quarter, report_df in reports_ti:
        df = report_df.rename(columns=renames)
        # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.0+. 
        # Re-selecting df's columns keeps the report's column order.
        combined_df = pd.concat([combined_df, df])[df.columns.tolist()]

    # Nothing to patch when no report was found (the column lookups below would raise KeyError)
    if combined_df.empty:
        return combined_df

    # The overrides below address rows by position, so they assume the earliest reports come first
    diluted_col = combined_df['Diluted'].tolist()
    # Manually inputting these values as it was just the 1st quarter with different formatting 
    diluted_col[0:3] = [0.49, 0.44, 0.43] 
    combined_df['Diluted'] = diluted_col
    cash_begin_col = combined_df['Cash and cash equivalents beginning'].tolist()
    cash_begin_col[0] = 1328
    cash_end_col = combined_df['Cash and cash equivalents end'].tolist()
    cash_end_col[0] = 1450
    combined_df['Cash and cash equivalents beginning'] = cash_begin_col
    combined_df['Cash and cash equivalents end'] = cash_end_col
    return combined_df

In [152]:
def get_amd_data():
    """
    Collects the 2008-2018 10-Q figures of Advanced Micro Devices into one DataFrame, one row per report.
    
    The differently labelled diluted-earnings, net loss and restricted-cash columns are folded into 
    'Diluted', 'Net income', 'cash equivalents at beginning' and 'cash equivalents at end'.
    
    Returns:
    DataFrame: the combined table, restricted to the columns of the last report processed
    
    """
    searchterms_amd = ['Net revenue', 'Cost of sales', 'Net income loss per common share', 'Net income', 'Net loss', 'Diluted',  
                       'diluted', 'Total current assets', 'Total assets', 'Total current liabilities', 
                       'Total stockholders equity', 'Total liability and stockholders equity', 
                       'Depreciation', 'operating activities', 'cash equivalents at beginning', 
                       'Cash cash equivalents and restricted cash at beginning', 'cash equivalents at end', 
                       'Cash cash equivalents and restricted cash at end']
    reports_amd = get_html_tables('Advanced Micro Devices', startyear=2008, startquarter=1, endyear=2018, endquarter=4, 
                                        max_tables=20, limit=5, search_terms=searchterms_amd)
    per_share_label = 'Net income loss per common share'

    combined_df = pd.DataFrame()

    for _year, _quarter, report_df in reports_amd:
        df = report_df.copy()
        new_columns = df.columns.tolist()
        if per_share_label in df.columns:
            # The per-share row supersedes a separately captured lower-case 'diluted' row
            if 'diluted' in df.columns:
                df = df.drop('diluted', axis=1)
                new_columns = df.columns.tolist()
            # The column to rename is the per-share column itself. The former lookup of 
            # 'per common share' could never succeed and raised ValueError.
            # NOTE(review): if a 'Diluted' column was captured as well, this yields two 
            # 'Diluted' columns -- decide which one should win.
            new_columns[new_columns.index(per_share_label)] = 'Diluted'
        if 'diluted' in df.columns and 'Diluted' not in df.columns:
            new_columns[new_columns.index('diluted')] = 'Diluted'
        if 'Net loss' in df.columns and 'Net income' not in df.columns:
            new_columns[new_columns.index('Net loss')] = 'Net income'
        if 'Cash cash equivalents and restricted cash at beginning' in df.columns:
            new_columns[new_columns.index('Cash cash equivalents and restricted cash at beginning')] = 'cash equivalents at beginning'
        if 'Cash cash equivalents and restricted cash at end' in df.columns:
            new_columns[new_columns.index('Cash cash equivalents and restricted cash at end')] = 'cash equivalents at end'        
        df.columns = new_columns
        
        # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.0+. 
        # Re-selecting df's columns keeps the report's column order.
        combined_df = pd.concat([combined_df, df])[df.columns.tolist()]

    return combined_df

In [256]:
def get_intel_data():
    """
    Collects the 2008-2018 10-Q figures of Intel into one DataFrame, one row per report.
    
    The 2018 reports are read with a larger max_tables than the earlier ones. Whichever of 
    'Earnings per share Diluted' and 'Diluted earnings' was found is stored under 'Diluted earnings per share'.
    
    Returns:
    DataFrame: the combined table
    
    """
    searchterms_intel = ['Net revenue', 'Cost of sales', 'Operating expenses', 'Net income', 'Diluted earnings', 
                         'Total current assets', 'Total assets', 'Total current liabilities', 'Total stockholders equity', 
                         'Cash and cash equivalents beginning', 'Net cash provided by operating activities', 'Depreciation', 
                         'Cash and cash equivalents end', 'Earnings per share Diluted']
    reports_intel1 = get_html_tables('Intel Corp', startyear=2008, startquarter=1, endyear=2017, endquarter=4, 
                                    search_terms=searchterms_intel)
    reports_intel2 = get_html_tables('Intel Corp', startyear=2018, startquarter=1, endyear=2018, endquarter=4, 
                                    search_terms=searchterms_intel, max_tables=40)
    reports_intel = iter.chain(reports_intel1, reports_intel2)
    
    frames = []

    for _year, _quarter, report_df in reports_intel:
        df = report_df
        if 'Earnings per share Diluted' in df.columns:
            df = df.rename(columns={'Earnings per share Diluted': 'Diluted earnings per share'})
        elif 'Diluted earnings' in df.columns:
            # Guarded like the renames of the other companies: a report with neither label 
            # used to abort the whole run with ValueError
            df = df.rename(columns={'Diluted earnings': 'Diluted earnings per share'})
        frames.append(df)

    # A single pd.concat replaces the repeated DataFrame.append calls (removed in pandas 2.0)
    return pd.concat(frames) if frames else pd.DataFrame()

In [111]:
def get_cisco_data():
    """
    Collects the 2008-2018 10-Q figures of Cisco Systems into one DataFrame, one row per report.
    
    Alternative labels for equity, diluted earnings, revenue and cash are folded into 
    'Total shareholders equity', 'Net income per share diluted', 'Total net sales', 
    'Cash and cash equivalents beginning' and 'Cash and cash equivalents end'.
    
    Returns:
    DataFrame: the combined table, restricted to the columns of the last report processed
    
    """
    searchterms_cisco = ['Total net sales', 'Total revenue', 'Total cost of sales', 'Total operating expenses', 'NET INCOME', 
                         'Net income per share diluted', 'Diluted', 'Total current assets', 'TOTAL ASSETS', 'Total current liabilities', 
                         'Total shareholders equity', 'Total equity', 'Total Cisco shareholders equity', 'Depreciation', 
                         'Net cash provided by operating activities', 'Cash and cash equivalents beginning', 
                         'Cash and cash equivalents end', 'Cash cash equivalents and restricted cash beginning', 
                         'Cash cash equivalents and restricted cash end']
    reports_cisco = get_html_tables('Cisco Systems', startyear=2008, startquarter=1, endyear=2018, endquarter=4, 
                                         search_terms=searchterms_cisco)
    renames = {'Total Cisco shareholders equity': 'Total shareholders equity', 
               'Total equity': 'Total shareholders equity', 
               'Diluted': 'Net income per share diluted', 
               'Total revenue': 'Total net sales', 
               'Cash cash equivalents and restricted cash beginning': 'Cash and cash equivalents beginning', 
               'Cash cash equivalents and restricted cash end': 'Cash and cash equivalents end'}

    combined_df = pd.DataFrame()

    for _year, _quarter, report_df in reports_cisco:
        df = report_df.rename(columns=renames)

        # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.0+. 
        # The re-selection is needed because older pandas versions sort the columns when the 
        # combined frames do not share the same columns; it restores the report's own order.
        combined_df = pd.concat([combined_df, df])[df.columns.tolist()]

    return combined_df

In [123]:
def get_amazon_data():
    """
    Collects the 2008-2018 10-Q figures of Amazon into one DataFrame, one row per report.
    
    Reports from 2017 on are read with an explicit mapping (value at index 1, read once) for every 
    search term. 'Total net sales' is stored under 'Net sales' and the restricted-cash labels under 
    'CASH AND CASH EQUIVALENTS BEGINNING' / 'CASH AND CASH EQUIVALENTS END'.
    
    Returns:
    DataFrame: the combined table
    
    """
    searchterms_amazon = ['CASH AND CASH EQUIVALENTS BEGINNING', 'CASH CASH EQUIVALENTS AND RESTRICTED CASH BEGINNING', 
                          'Net income', 'Depreciation', 'Net cash provided by' ,'CASH AND CASH EQUIVALENTS END', 
                          'CASH CASH EQUIVALENTS AND RESTRICTED CASH END', 'Net sales', 'Total net sales', 'Cost of sales', 
                          'Total operating expenses', 'Diluted earnings per share', 'Total current assets', 'Total assets', 
                          'Total current liabilities', 'Total stockholders equity']
    mappings_amazon = {term: [[1], 1] for term in searchterms_amazon}
    reports_amazon1 = get_html_tables('Amazon com Inc', startyear=2008, startquarter=1, endyear=2016, endquarter=4, 
                                     search_terms=searchterms_amazon, limit=4)
    reports_amazon2 = get_html_tables('Amazon com Inc', startyear=2017, startquarter=1, endyear=2018, endquarter=4, 
                                     search_terms=searchterms_amazon, limit=4, mapping=mappings_amazon)
    combined_reports = iter.chain(reports_amazon1, reports_amazon2)
    renames = {'Total net sales': 'Net sales', 
               'CASH CASH EQUIVALENTS AND RESTRICTED CASH BEGINNING': 'CASH AND CASH EQUIVALENTS BEGINNING', 
               'CASH CASH EQUIVALENTS AND RESTRICTED CASH END': 'CASH AND CASH EQUIVALENTS END'}

    frames = [report_df.rename(columns=renames) for _year, _quarter, report_df in combined_reports]

    # A single pd.concat replaces the repeated DataFrame.append calls (removed in pandas 2.0)
    return pd.concat(frames) if frames else pd.DataFrame()

In [7]:
def get_ibm_data():    
    """
    Collects the 2008-2018 10-Q figures of IBM into one DataFrame, one row per report.
    
    The two cash rows of each report are read in order of appearance and end up as 
    'Cash and cash equivalents beginning' and 'Cash and cash equivalents end'; 
    'Assuming dilution' ends up as 'Earnings/diluted share'.
    
    Returns:
    DataFrame: the combined table
    
    """
    searchterms_ibm = ['Total revenue', 'Total cost', 'Total expense and other income', 'Net income', 'Assuming dilution', 
                       'Total current assets', 'Total assets', 'Total current liabilities', 'Total liabilities', 
                       'Total stockholders equity', 'Total IBM stockholders equity', 'Depreciation', 
                       'Net cash provided by operating activities', 'Cash and cash equivalents at', 
                       'Cash cash equivalents and restricted cash at']
    mappings_ibm = {'Cash and cash equivalents at': [[0], 2], 'Cash cash equivalents and restricted cash at': [[0], 2]}
    reports_ibm = get_html_tables('International Business Machines', startyear=2008, startquarter=1, endyear=2018, endquarter=4, 
                                  search_terms=searchterms_ibm, limit=5, mapping=mappings_ibm)
    cash_renames = {'Cash cash equivalents and restricted cash at1': 'Cash and cash equivalents at1', 
                    'Cash cash equivalents and restricted cash at2': 'Cash and cash equivalents at2'}
    final_renames = {'Assuming dilution': 'Earnings/diluted share', 
                     'Cash and cash equivalents at1': 'Cash and cash equivalents beginning', 
                     'Cash and cash equivalents at2': 'Cash and cash equivalents end'}

    frames = []

    for year, _quarter, report_df in reports_ibm:
        df = report_df.copy()
        if year == 2008:
            # NOTE(review): presumably the 2008 'Assuming dilution' row holds a share count 
            # rather than a per-share amount, hence the division -- confirm against the filings
            df['Assuming dilution'] = round(df['Net income'] / df['Assuming dilution'], 2)
        if 'Total IBM stockholders equity' in df.columns:
            if 'Total stockholders equity' in df.columns:
                df = df.drop('Total IBM stockholders equity', axis=1)
            else:
                df = df.rename(columns={'Total IBM stockholders equity': 'Total stockholders equity'})
        frames.append(df.rename(columns=cash_renames))

    # A single pd.concat replaces the repeated DataFrame.append calls (removed in pandas 2.0)
    combined_df = pd.concat(frames) if frames else pd.DataFrame()
    return combined_df.rename(columns=final_renames)

In [225]:
def get_google_data():
    """
    Collects the 10-Q figures of Google (2008-2015) and Alphabet (2016-2018) into one DataFrame, one row per report.
    
    'Google advertising and other' is stored under 'Revenues'; 'Diluted', 'stock diluted' and 'share diluted' 
    all end up as 'Earnings/diluted share'. For reports before 2010 every column except the diluted one, 
    'Year' and 'Quarter' is divided by 1000 and rounded.
    
    Returns:
    DataFrame: the combined table
    
    """
    searchterms_google = ['Revenues', 'Google advertising and other', 'Cost of revenues', 'Total costs and expenses', 
                           'stock diluted', 'share diluted', 'Diluted', 'Net income', 'Total current assets', 
                          'Total assets', 'Total current liabilities', 'Total stockholders equity', 'Depreciation', 
                           'Net cash provided by operating activities', 'Cash and cash equivalents at beginning', 
                          'Cash and cash equivalents at end']
    mappings_google = {'Revenues': [[1], 1], 'Google advertising and other': [[1], 1], 'stock diluted': [[1], 1],
                       'share diluted': [[1], 1], 'Cost of revenues': [[1], 1], 'Total costs and expenses': [[1], 1], 
                       'Net income': [[1], 1], 'Diluted': [[1], 1]}
    reports_google = get_html_tables('Google Inc', startyear=2008, startquarter=1, endyear=2015, endquarter=4, 
                                      mapping=mappings_google, search_terms=searchterms_google, default_side='right', 
                                     max_tables=50)
    reports_alphabet = get_html_tables('Alphabet Inc', startyear=2016, startquarter=1, endyear=2018, endquarter=4, 
                                      mapping=mappings_google, search_terms=searchterms_google, default_side='right', 
                                       max_tables=90)
    reports_google_combined = iter.chain(reports_google, reports_alphabet)
    renames = {'Google advertising and other': 'Revenues', 
               'Diluted': 'share diluted', 
               'stock diluted': 'share diluted'}
    unscaled_columns = ('share diluted', 'year', 'quarter')
    
    frames = []
        
    for year, _quarter, report_df in reports_google_combined:
        df = report_df.rename(columns=renames)
        if year < 2010: # From 2010 on, Google counts in millions instead of thousands
            for col in df.columns:
                if col.lower() not in unscaled_columns:
                    df[col] = round(df[col] / 1000)
        frames.append(df)
    
    # A single pd.concat replaces the repeated DataFrame.append calls (removed in pandas 2.0)
    combined_df = pd.concat(frames) if frames else pd.DataFrame()
    return combined_df.rename(columns={'share diluted': 'Earnings/diluted share'})

In [139]:
def get_apple_data():    
    """
    Collects the 2008-2018 10-Q figures of Apple into one DataFrame, one row per report.
    
    Two 'Diluted' rows are read per report: the first ends up as 'Earnings/diluted share', 
    the second as 'Number of diluted shares (thousands)'.
    
    Returns:
    DataFrame: the combined table
    
    """
    searchterms_apple = ['Net sales', 'Cost of sales', 'Total operating expenses', 'Net income', 'Diluted', 
                         'Total current assets', 'Total assets', 'Total current liabilities', 'Total liabilities', 
                         'Total shareholders equity', 'Cash and cash equivalents beginning of the period', 
                         'Depreciation', 'Cash generated by operating activities', 
                         'Cash and cash equivalents end of the period']
    mappings_apple = {'Diluted':[[0], 2]} # TODO: Get rid of number of diluted shares
    reports_apple = get_html_tables('Apple Inc', startyear=2008, startquarter=1, endyear=2018, endquarter=4, 
                                    mapping=mappings_apple, search_terms=searchterms_apple)

    frames = [report_df for _year, _quarter, report_df in reports_apple]

    # A single pd.concat replaces the repeated DataFrame.append calls (removed in pandas 2.0)
    combined_df = pd.concat(frames) if frames else pd.DataFrame()
    return combined_df.rename(columns={'Diluted1': 'Earnings/diluted share', 
                                       'Diluted2': 'Number of diluted shares (thousands)'})

In [30]:
def transform_df(df, company_name, symbol, companies, late):
    """
    Creates one-hot-encoding for the company name and labels each quarter with the corresponding adjusted stock value 
    at that time using the Alpha Vantage API: https://www.alphavantage.co/
    
    The price of a row is the adjusted close of the first day (1st to 28th) with data in the first month 
    of the row's quarter, or in the month after it when late is true. Rows without such a day get NaN. 
    The DataFrame is modified in place and also returned. Every call waits 15 seconds after the download.
    
    Parameters:
    df (DataFrame): the DataFrame of a company; needs the columns 'Year' and 'Quarter'
    company_name (str): the name of the company
    symbol (str): the NASDAQ symbol of the stock
    companies (list of str): the list of all companies
    late (boolean): whether or not the reports for this company are filed roughly 1 month later than usual
    
    Returns:
    DataFrame: the modified DataFrame
    
    """
    
    # A key supplied through the environment takes precedence. 
    # NOTE(review): the literal fallback only keeps existing runs working; credentials do not 
    # belong in the notebook, so rotate this key and drop the fallback.
    key = os.environ.get('ALPHAVANTAGE_API_KEY', 'BWT9OO9T59N87MUI')
    ts = TimeSeries(key)
    adjusted_results, _meta = ts.get_daily_adjusted(symbol=symbol, outputsize='full')
    time.sleep(15)
    
    prices = []
    for i in range(len(df)):
        year = int(df.iloc[i]['Year'])
        quarter = df.iloc[i]['Quarter']
        start_month = 1 if quarter == 1 else 4 if quarter == 2 else 7 if quarter == 3 else 10
        if late:
            start_month += 1
        for day in range(1, 29):
            start_date = f'{year}-{start_month:02}-{day:02}'
            if start_date in adjusted_results:
                prices.append(float(adjusted_results[start_date]['5. adjusted close']))
                break
        else:
            # No price in that month: keep one entry per row so the column assignment 
            # below cannot fail on a length mismatch
            prices.append(float('nan'))
                
    df['Stock price'] = prices
    
    # Plain column assignment works for any index; the former df.loc[0:, name] sliced 
    # a string-labelled index with an integer
    for name in companies:
        df[name] = 1 if name == company_name else 0
    return df

In [50]:
def combine_dataframes():
    """
    Combines all DataFrames after modifying them with transform_df. SEE DOCUMENTATION FOR THAT FUNCTION!
    This was used only for the initial creation of the combined DataFrame and should not be called again!
    
    Reads '<company>_data_formatted.csv' for each of the nine companies and writes each frame, with its 
    'Stock price' column added, to '<Company>_data_adjusted.csv'.
    
    Returns:
    DataFrame: all company frames stacked, restricted to the columns of the last company processed
    
    """
    
    companies = ['Amazon', 'AMD', 'Apple', 'Cisco', 'Google', 'IBM', 'Intel', 'Nvidia', 'TI']
    symbols = ['AMZN', 'AMD', 'AAPL', 'CSCO', 'GOOGL', 'IBM', 'INTC', 'NVDA', 'TXN']
    late_companies = ('Cisco', 'Nvidia')

    # All inputs are read before the first API call, so a missing file is noticed immediately
    df_list = [pd.read_csv(name.lower() + '_data_formatted.csv', index_col=0) for name in companies]

    combined_df = pd.DataFrame()

    for company_df, company_name, symbol in zip(df_list, companies, symbols):
        is_late = company_name in late_companies
        df = transform_df(company_df, company_name, symbol, companies, is_late)
        company_df['Stock price'] = df['Stock price'].tolist()
        # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.0+. 
        # Re-selecting df's columns keeps the company frame's column order.
        combined_df = pd.concat([combined_df, df])[df.columns.tolist()]
        company_df.to_csv(company_name + '_data_adjusted.csv')

    return combined_df

In [51]:
# One-off export of the combined data set using the zero-argument combine_dataframes defined above. 
# Once the later cell redefines combine_dataframes with required parameters, re-running this line raises TypeError.
combine_dataframes().to_csv('Combined_data_adjusted.csv')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [11]:
def combine_dataframes(df_list, company_list, symbol_list, late_list, extend_list):
    """
    Combines all user-supplied DataFrames after modifying them with transform_df.
    This is meant to be called by the user with additional DataFrames that match our format.
    
    Parameters:
    df_list (list of DataFrame): a list of the DataFrames to be combined.
    company_list (list of str): the list of company names/nicknames to label the corresponding companies. 
        The list itself is left unchanged.
    symbol_list (list of str): the list of NASDAQ symbols corresponding to each company
    late_list (list of bool): a list describing whether or not each company files its 10-Qs "late".
        Most companies file their reports at the end of March, June, September, and/or December.
        Late companies are those that file them at the end of April, July, October, and/or January.
    extend_list (bool): whether or not to include one-hot-encoding of our original list of companies
        in addition to the user-supplied companies
    
    Returns:
    DataFrame: the transformed frames stacked, restricted to the columns of the last frame processed; 
        an empty DataFrame when df_list, company_list and symbol_list differ in length
    
    """
    combined_df = pd.DataFrame()
    
    if len(df_list) == len(company_list) and len(df_list) == len(symbol_list):
        # Work on a copy: extending company_list in place would grow the caller's list on every call
        one_hot_companies = list(company_list)
        if extend_list:
            one_hot_companies += ['Amazon', 'AMD', 'Apple', 'Cisco', 'Google', 'IBM', 'Intel', 'Nvidia', 'TI']
        for i in range(len(df_list)):
            company_name = company_list[i]
            is_late = late_list[i]
            df = transform_df(df_list[i], company_name, symbol_list[i], one_hot_companies, is_late)
            # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.0+. 
            # Re-selecting df's columns keeps the frame's column order.
            combined_df = pd.concat([combined_df, df])[df.columns.tolist()]
            
    return combined_df