In [2]:
import pandas as pd
import quandl, math
import sklearn as skl
import sklearn.linear_model as lreg
import pandas_datareader as wb
import datetime
import numpy as np

In [6]:
# Request GOOGL price data for the window 2000-01-01 .. 2017-01-01.
# NOTE(review): this relies on the installed pandas-datareader still offering
# the 'morningstar' source — confirm before re-running on a fresh environment.
start, end = datetime.datetime(2000, 1, 1), datetime.datetime(2017, 1, 1)
df = wb.DataReader('GOOGL', data_source='morningstar', start=start, end=end)

In [7]:
# Inspect which columns the downloaded frame provides.
df.columns

Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')

#### Features or Independent Variable
Features are the input attributes (independent variables) used to predict the label

The label is the value we want to predict (the dependent variable) — here, a future price

Ex. Close, High, Low, Open, Volume of a stock

In [8]:
# Add two percentage features, then keep only the columns used downstream:
#   HL_PCT     - (High - Low) relative to Low, in percent
#   PCT_change - (Close - Open) relative to Open, in percent
df = df.assign(
    HL_PCT=(df['High'] - df['Low']) / df['Low'] * 100,
    PCT_change=(df['Close'] - df['Open']) / df['Open'] * 100,
)[['Close', 'HL_PCT', 'PCT_change', 'Volume']]
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,HL_PCT,PCT_change,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GOOGL,2004-08-19,50.2161,8.440965,0.324852,44662908
GOOGL,2004-08-20,54.2075,8.537199,6.730368,22778271
GOOGL,2004-08-23,54.753,4.062458,-1.227953,18234539
GOOGL,2004-08-24,52.4858,7.753226,-5.726382,15241412
GOOGL,2004-08-25,53.0514,3.966124,0.991038,9130694


# Features and Labels
`Close` could itself be the label if we were trying to predict the same day's closing price, but then we could not use any feature computed from that close (in this notebook, `PCT_change`)

#### What would be a label (dependent variable)
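A future value of `Close` — for example, the closing price five days from now (the code below forecasts 1% of the dataset's length ahead)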

Generally, we use linear regression to forecast future values

#### Fill NaN with -9999 because we don't want to drop rows that still contain vital stock data in other columns

In [9]:
# Column whose values are being forecast.
forecast_col = 'Close'

# Replace missing values with the sentinel -9999 instead of dropping rows.
# Reassign rather than using fillna(inplace=True): df was produced by a column
# selection, and in-place edits on such a frame can raise SettingWithCopyWarning.
df = df.fillna(-9999)

1. Choose how far out to forecast: predict 1% of the dataframe's length into the future (`0.01 * len(df)` rows, rounded up)
2. Create the label column

In [10]:
# Forecast horizon: 1% of the row count, rounded up to a whole number of rows.
forecast_out = int(math.ceil(0.01 * len(df)))

# Label for each row = the forecast column's value `forecast_out` rows later.
# The last `forecast_out` rows have no future value (NaN label), so dropna()
# removes them. Built with assign()/dropna() instead of column assignment plus
# inplace=True, which can raise SettingWithCopyWarning on a frame that came
# from a column selection.
df = df.assign(label=df[forecast_col].shift(-forecast_out)).dropna()
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,HL_PCT,PCT_change,Volume,label
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GOOGL,2004-08-19,50.2161,8.440965,0.324852,44662908,69.252
GOOGL,2004-08-20,54.2075,8.537199,6.730368,22778271,68.6064
GOOGL,2004-08-23,54.753,4.062458,-1.227953,18234539,69.4923
GOOGL,2004-08-24,52.4858,7.753226,-5.726382,15241412,68.9317
GOOGL,2004-08-25,53.0514,3.966124,0.991038,9130694,67.6955


# Regression Training and Testing
#### X holds the features and y holds the label

In [11]:
# Feature matrix: every column except the label.
# `columns=` replaces the positional axis argument (`df.drop(['label'], 1)`),
# which newer pandas releases deprecate and then reject.
X = np.array(df.drop(columns=['label']))

# Target vector: the label column.
y = np.array(df['label'])

#### Scaling 

In [12]:
# Reference documentation for the preprocessing utilities.
scaling_url = "http://scikit-learn.org/stable/modules/preprocessing.html"

# Standardize each feature column; the keyword values below are scale()'s
# defaults, written out explicitly.
# NOTE(review): X is scaled as a whole before it is split into train and test
# sets, so test rows influence the scaling statistics — consider fitting the
# scaler on the training split only.
X = skl.preprocessing.scale(X, axis=0, with_mean=True, with_std=True)

In [13]:
# Sanity check: X and y should report the same number of rows.
row_counts = (len(X), len(y))
print("Lenght of X = {}, y = {}".format(*row_counts))

Lenght of X = 3194, y = 3194


#### Train and Test

In [14]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# reproducible, so the split (and every score computed from it) is the same on
# each Restart & Run All.
X_train, X_test, y_train, y_test = skl.model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

#### Linear Regression Model
1. Create the estimator object (named `clf` below)
2. Fit it on the training set

In [18]:
# fit() returns the estimator itself, so construction and fitting are chained.
clf = lreg.LinearRegression().fit(X_train, y_train)

# Show the training targets as the cell output.
y_train

array([339.9441, 437.2733, 351.9457, ..., 444.3301, 312.3924, 157.1221])

#### Returns the coefficient of determination $R^2$ of the prediction.

In [16]:
# Score the fitted model on the held-out split. The variable is named
# `accuracy`, but the printed message labels the value as R^2.
accuracy = clf.score(X_test, y_test)
report = "Coefficient of Determination R^2 = {}"
print(report.format(accuracy))

Coefficient of Determination R^2 = 0.9666017514738591
