In [1]:
import bs4 as bs
import pickle
import requests
import datetime as dt
import os
import pandas as pd
import pandas_datareader.data as web
import numpy as np
import matplotlib.pyplot as plt

Collect stock data from Google Finance for the top tech companies in the United States of America.

In [2]:
# Top tech companies: ticker symbols, and the company names listed in the
# same order as the symbols.
ticker_tech_comp = [
    "GOOG",
    "AMZN",
    "FB",
    "AAPL",
    "MSFT",
]
tech_comp = [
    "google",
    "amazon",
    "facebook",
    "apple",
    "microsoft",
]

In [212]:
# Print the first rows of each ticker's history so the earliest available
# date of every company can be compared.
preview_start = dt.datetime(2000,1,1)
preview_end = dt.date.today() - dt.timedelta(days=1)
for ticker in ticker_tech_comp:
    history = web.DataReader(ticker, 'google', preview_start, preview_end)
    print(history.head())
         

             Open   High    Low  Close  Volume
Date                                          
2004-08-19  49.96  51.98  47.93  50.12     NaN
2004-08-20  50.69  54.49  50.20  54.10     NaN
2004-08-23  55.32  56.68  54.47  54.65     NaN
2004-08-24  55.56  55.74  51.73  52.38     NaN
2004-08-25  52.43  53.95  51.89  52.95     NaN
             Open   High    Low  Close    Volume
Date                                            
2001-07-26  11.67  12.75  11.23  12.43  10067400
2001-07-27  12.33  12.43  11.70  12.25   8905600
2001-07-30  12.21  12.75  11.98  12.55   4930900
2001-07-31  12.64  12.86  12.33  12.49   4286800
2001-08-01  12.73  12.86  12.17  12.50   5432100
             Open   High    Low  Close     Volume
Date                                             
2012-05-17    NaN    NaN    NaN  38.00          0
2012-05-18  42.05  45.00  38.00  38.23  580587742
2012-05-21  36.53  36.66  33.00  34.03  168309831
2012-05-22  32.61  33.59  30.94  31.00  102053826
2012-05-23  31.37  32.50  31

We can see that May 18, 2012 is the latest starting date among the companies' stock histories. Since we want every company to have the same number of rows of stock features, we will use May 18, 2012 (`dt.datetime(2012,5,18)`) as the starting date for all of them.

In [276]:
def get_data_from_google(tickers, reload=False, start=None, end=None,
                         data_source='google', out_dir='tech_stock_dfs'):
    """Download price history for each ticker and cache it as a CSV file.

    One file named ``<ticker>.csv`` is written into ``out_dir`` per ticker.
    A ticker whose file already exists is skipped (and reported on stdout)
    unless ``reload`` is truthy.

    Parameters
    ----------
    tickers : iterable of str
        Ticker symbols to fetch.
    reload : bool, optional
        When truthy, re-download and overwrite files that already exist.
    start : datetime-like, optional
        First day to request. Defaults to ``dt.datetime(2012, 5, 18)``.
    end : datetime-like, optional
        Last day to request. Defaults to yesterday.
    data_source : str, optional
        Source name handed to ``web.DataReader``. Defaults to ``'google'``.
        NOTE(review): later pandas-datareader releases dropped the Google
        Finance reader -- confirm the installed version still supports it,
        or pass another source.
    out_dir : str, optional
        Directory that receives the CSV files; created if missing.
    """
    if start is None:
        start = dt.datetime(2012, 5, 18)
    if end is None:
        end = dt.date.today() - dt.timedelta(days=1)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)

    for ticker in tickers:
        csv_path = os.path.join(out_dir, '{}.csv'.format(ticker))
        # Truthiness (not `is True`) so that reload=1 and similar also work.
        if reload or not os.path.exists(csv_path):
            df = web.DataReader(ticker, data_source, start, end)
            df.to_csv(csv_path)
        else:
            print('Already have {}'.format(ticker))

In [277]:
# reload=True re-fetches every ticker and overwrites its cached CSV.
get_data_from_google(ticker_tech_comp, reload=True)

In [281]:
# Create one csv file of engineered features per company
def _engineer_features(prices, today=None):
    """Build the feature table for one ticker from its daily price table.

    Parameters
    ----------
    prices : pandas.DataFrame
        Table with the columns "Date", "Open", "High", "Low", "Close" and
        "Volume". Rows are assumed to be ordered oldest first, because the
        shift/rolling operations below rely on row order.
    today : datetime.date, optional
        Date stamped on the appended prediction row. Defaults to
        ``dt.date.today()``.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Perc_Change_1d" (the target) with the columns
        "<X>_Prev_1d" for Open/High/Low/Close/Volume and "100_mov_avg".
        The last row is the appended prediction row; its target is NaN.
    """
    raw_cols = ["Date", "Open", "High", "Low", "Close", "Volume"]
    if today is None:
        today = dt.date.today()

    # Append a placeholder row for today. Its own price values are never used
    # as features; it only exists so the shifted columns below carry the most
    # recent real day into the row we will predict on.
    today_row = pd.DataFrame([[today, 0, 0, 0, 0, 0]], columns=raw_cols)
    df = pd.concat([prices, today_row], ignore_index=True)

    # Previous-day values of every raw price column.
    for col in ["Open", "High", "Low", "Close", "Volume"]:
        df["{}_Prev_1d".format(col)] = df[col].shift(1)

    # Average over closes up to the previous day only, so the feature neither
    # leaks the current close into the target nor absorbs the placeholder 0.
    df["100_mov_avg"] = (
        df["Close_Prev_1d"].rolling(window=100, min_periods=0).mean()
    )

    # Target: one-day change relative to the previous close.
    prev_close = df["Close_Prev_1d"]
    target = (df["Close"] - prev_close) / prev_close
    target.iloc[-1] = float("nan")  # today's change is what we want to predict
    df["Perc_Change_1d"] = target

    # Drop the first row (no previous day) plus the first 100 days of the
    # moving average.
    df = df.iloc[1 + 100:]

    # Only keep the columns used for the prediction.
    df = df.drop(raw_cols, axis=1)

    # Drop rows with missing data, except the final (prediction) row.
    if len(df):
        keep = df.notnull().all(axis=1)
        keep.iloc[-1] = True
        df = df[keep]

    # The target becomes the index, so it is the first column of the CSV.
    return df.set_index("Perc_Change_1d")


def compile_data(tickers):
    """Write an engineered-feature CSV for every ticker.

    Reads ``tech_stock_dfs/<ticker>.csv`` and writes
    ``final_stock_features_dfs/<ticker>.csv``, creating the output directory
    if needed.

    Parameters
    ----------
    tickers : iterable of str
        Ticker symbols whose price CSVs already exist in ``tech_stock_dfs``.
    """
    os.makedirs('final_stock_features_dfs', exist_ok=True)

    for ticker in tickers:
        prices = pd.read_csv('tech_stock_dfs/{}.csv'.format(ticker))
        features = _engineer_features(prices)
        features.to_csv('final_stock_features_dfs/{}.csv'.format(ticker))

In [282]:
# Build one feature CSV per ticker in final_stock_features_dfs/ from the cached price files.
compile_data(ticker_tech_comp)

Finished scraping the data. This notebook should be run each day, since new stock information is released daily. Keep in mind that the last row of each feature file is the one the trained model will be applied to in order to predict the percentage change for the new day.