# Linear Regression - Fitting a linear model to stock data

In [1]:
import pandas as pd
import quandl as qdl

# downloading the stock data
df = qdl.get('WIKI/GOOGL')

# Keep only the adjusted price/volume columns used to build the features.
# The explicit .copy() turns the selection into an independent frame, so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()

# transforming data to have more useful features
#   HL_PCT     : daily high-low range as a percentage of the low
#                (the * 100.0 is only a unit change and could be dropped)
#   PCT_Change : (open - close) as a percentage of the close; note that this
#                is negative on days where the close is above the open
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Low'] * 100.0
df['PCT_Change'] = (df['Adj. Open'] - df['Adj. Close']) / df['Adj. Close'] * 100.0

# creating final features; copied again so that later in-place edits and
# column additions on `df` operate on an independent frame.
df = df[['Adj. Close', 'HL_PCT', 'PCT_Change', 'Adj. Volume']].copy()

print(df.head())

            Adj. Close    HL_PCT  PCT_Change  Adj. Volume
Date                                                     
2004-08-19   50.322842  8.441017   -0.323915   44659000.0
2004-08-20   54.322689  8.537313   -6.739913   22834300.0
2004-08-23   54.869377  4.062357    1.243144   18256100.0
2004-08-24   52.597363  7.753210    6.074187   15247300.0
2004-08-25   53.164113  3.966115   -1.169811    9188600.0


In [2]:
import math
# Report how many rows were loaded.
row_count = len(df)
print('total data points: ' + str(row_count))

# Column whose future value the model is asked to predict.
forecast_col = 'Adj. Close'

# Forecast horizon in rows: 1% of the data set, rounded up.
forecast_out = int(math.ceil(row_count * 0.01))

# Replace missing values with a sentinel far outside the observed range,
# so those entries behave as outliers instead of NaN.
df.fillna(value=-99999, inplace=True)

# Build the labels: each row's label is the value of `forecast_col`
# found `forecast_out` rows later. The last `forecast_out` rows have no
# such future value, so their label is NaN.
df['label'] = df[forecast_col].shift(periods=-forecast_out)

print('prediction is done for ' + str(forecast_out) + ' days \n')

# Drop the trailing rows whose label is NaN.
df.dropna(inplace=True)
print(df.tail())

total data points: 3240
prediction is done for 33 days 

            Adj. Close    HL_PCT  PCT_Change  Adj. Volume   label
Date                                                             
2017-05-09      956.71  0.817267    0.482905    1633396.0  972.09
2017-05-10      954.84  0.723280    0.144527    1135809.0  948.09
2017-05-11      955.89  0.988815   -0.481227    1016797.0  961.01
2017-05-12      955.14  0.621810    0.283728    1204133.0  937.82
2017-05-15      959.22  1.036922   -0.409708    1314574.0  929.68


In [3]:
import numpy as np
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the label. Target vector: the label.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument (df.drop(['label'], 1) raises TypeError there).
X = np.array(df.drop(columns=['label']))
y = np.array(df['label'])

# scaling the data set
# NOTE(review): scaling is applied to all rows, i.e. before the train/test
# split below, so the test rows contribute to the scaling statistics.
X = preprocessing.scale(X)

# Splitting the dataset into train and test set (80% / 20%).
# A fixed random_state makes the shuffle, and therefore the score printed
# below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# creating Linear Regression model
clf = LinearRegression(n_jobs=4)
# alternative model that was tried:
#clf = svm.SVR(kernel='linear')

# training the model to fit the data set
clf.fit(X_train, y_train)

# Evaluating the trained model on the held-out test set.
# For a scikit-learn regressor, score() returns the coefficient of
# determination (R^2), where 1.0 is a perfect fit. It is not a squared error
# or a classification accuracy, despite the wording of the message below.
accuracy = clf.score(X_test, y_test)

print('accuracy "Squared Error" of our trained model is '+str(accuracy))


accuracy "Squared Error" of our trained model is 0.972491456369
