In [1]:
import pandas as pd
import quandl, math
import sklearn as skl
import sklearn.linear_model as lreg
import pandas_datareader as wb
import datetime
import numpy as np

In [2]:
start = datetime.datetime(2000, 1, 1)
end = datetime.datetime(2017, 1, 1)
df = wb.DataReader('GOOGL', 'morningstar', start, end)

In [3]:
df.columns

Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')

#### Features or Independent Variable
Features are the attributes that make up the label

Label is a predictor of the future

Ex. Close, High, Low, Open, Volume of a stock

In [4]:
df['HL_PCT'] = (df['High'] - df['Low']) / df['Low'] * 100
df['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100

df = df[['Close','HL_PCT','PCT_change','Volume']]
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,HL_PCT,PCT_change,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GOOGL,2004-08-19,50.2161,8.440965,0.324852,44662908
GOOGL,2004-08-20,54.2075,8.537199,6.730368,22778271
GOOGL,2004-08-23,54.753,4.062458,-1.227953,18234539
GOOGL,2004-08-24,52.4858,7.753226,-5.726382,15241412
GOOGL,2004-08-25,53.0514,3.966124,0.991038,9130694


# Features and Labels
Close could be a label if there was no HL_PCT column and we were trying to predict what the close might be that day

#### What would be a label (dependent variable)
Close in the next five days

Generally we use Linear Regression to forescast out the future

#### Fill -9999 with NaN because we don't want to remove vital stock data from a column

In [5]:
forecast_col = 'Close'
df.fillna(-9999, inplace=True)

1. Forecast the number of days out; Predict out 10% of the dataframe and you'll see that actually when we go out and do this
2. Create the label column

In [6]:
forecast_out = int(math.ceil(0.01*len(df))) #10% of the dataframe; last 33 days
print(forecast_out)

33


In [7]:
df['label'] = df[forecast_col].shift(-forecast_out)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,HL_PCT,PCT_change,Volume,label
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GOOGL,2004-08-19,50.2161,8.440965,0.324852,44662908,69.252
GOOGL,2004-08-20,54.2075,8.537199,6.730368,22778271,68.6064
GOOGL,2004-08-23,54.753,4.062458,-1.227953,18234539,69.4923
GOOGL,2004-08-24,52.4858,7.753226,-5.726382,15241412,68.9317
GOOGL,2004-08-25,53.0514,3.966124,0.991038,9130694,67.6955


# Regression Training and Testing
#### X are features and y is label
#### Scaling 

In [8]:
X = np.array(df.drop(['label'], 1))
X = skl.preprocessing.scale(X)
scaling_url = "http://scikit-learn.org/stable/modules/preprocessing.html"

X_lately = X[-forecast_out:] #last 30 days of stock prices
X = X[:-forecast_out] # stock prices up to last 30 days

df.dropna(inplace=True)
y = np.array(df['label'])

In [9]:
print("Lenght of X = {}, y = {}".format(len(X),len(y)))

Lenght of X = 3194, y = 3194


#### Train and Test

In [10]:
X_train, X_test, y_train, y_test = skl.model_selection.train_test_split(X, y, test_size = 0.2)

#### Linear Regression Classifier
1. Identify a classifier (Create object)
2. Fit the training sets

In [11]:
clf = lreg.LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

#### Returns the coefficient of determination $R^2$ of the prediction.
How much of forecasted data is explained by the regression equation of the train data

In [12]:
accuracy = clf.score(X_test, y_test)
print("Coefficient of Determination R^2 = {}".format(accuracy))

Coefficient of Determination R^2 = 0.9670073791034203


# Regression Forecasting and Predicting
#### Predict last 30 days of Stock data

In [17]:
forecast_set = clf.predict(X_lately)
print('The next {} days stock prices with a accuracy of {} are forecasted as: \n{}'.format(forecast_out, accuracy, forecast_set))

The next 33 days stock prices with a accuracy of 0.9670073791034203 are forecasted as: 
[785.30236068 791.58778301 780.80705258 790.24772775 790.28252307
 784.04522114 785.43512962 786.15015381 790.47196744 794.69010941
 780.35867798 768.42947631 769.9716506  783.69134049 781.34635006
 796.54849521 800.35314448 814.4961898  813.04527759 819.99642008
 822.67799993 820.39711332 814.05668733 817.65621999 820.43496612
 817.10965721 815.03457349 813.32529946 813.86731469 815.25229045
 809.55717168 808.29416619 797.19483225]


In [25]:
type(last_date[1])

pandas._libs.tslib.Timestamp

#### Create Date variables for the next 30 days

In [26]:
df['Forecast'] = np.nan
last_date = df.iloc[-1].name
last_unix = last_date[1]
one_day = 86400
next_unix = last_unix + one_day

ValueError: Cannot add integral value to Timestamp without freq.

In [21]:
for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)+1] + [i]
    
df['Close'].plot()

NameError: name 'next_unix' is not defined