# Baseline Model: Linear Regression

In [1]:
# This program predicts stock prices by using machine learning models

# Import the libraries
import quandl
import numpy as np
import pandas as pd
import pandas_datareader as web
import math
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Import trading data from January 20, 2017 to January 20, 2020
df = pd.read_csv('../datasets/tweets/combined_trump_index.csv', sep='\t')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close,Return,Intraday Return,Volume Return,Open-Close Movement,High-Low Ratio,retweet_count,favorite_count,vader compound,vader sentiment,Trump Twitter Index,Close_TTI,Volume_TTI
0,1,2017-01-20,2276.959961,2265.01001,2269.959961,2271.310059,3524970000,2271.310059,0.003366,0.000595,0.113393,0.276982,0.527589,415061,1878671,2.2815,positive,0.135696,2269.965939,8042219000.0
1,2,2017-01-23,2271.780029,2257.02002,2267.780029,2265.199951,3152710000,2265.199951,-0.00269,-0.001138,-0.105607,-0.155418,0.65396,26750,177839,0.4939,positive,-0.056192,2267.777551,1557123000.0
2,3,2017-01-24,2284.629883,2266.679932,2267.879883,2280.070068,3810960000,2280.070068,0.006565,0.005375,0.208789,0.118309,0.791905,247498,1162781,1.6901,positive,0.908453,2267.91994,6440903000.0
3,4,2017-01-25,2299.550049,2288.879883,2288.879883,2298.370117,3846020000,2298.370117,0.008026,0.004146,0.0092,0.386383,0.466174,138054,761424,0.7527,positive,0.312087,2288.893518,2894899000.0
4,5,2017-01-26,2300.98999,2294.080078,2298.629883,2296.679932,3610360000,2296.679932,-0.000735,-0.000848,-0.061274,0.011302,0.301206,128887,609647,-0.4878,negative,0.041381,2298.631683,-1761134000.0


In [4]:
# Drop irrelevant columns
df.drop(columns = ['Unnamed: 0'], axis=1, inplace = True)

In [5]:
# Preprocess data
df.index = pd.DatetimeIndex(df['Date'])
df.sort_index(ascending=True, inplace=True)
df.drop('Date', axis=1, inplace=True)
df=df.loc[:'2020-01-20',:]

In [6]:
# Get the Close Price based on Trump Twitter Index
df = df[['Close_TTI']]

#Take a look at the new data
print(df.head())

              Close_TTI
Date                   
2017-01-20  2269.965939
2017-01-23  2267.777551
2017-01-24  2267.919940
2017-01-25  2288.893518
2017-01-26  2298.631683


In [7]:
# A variable for predicting n=30 days out into the future
forecast_out = 30 # n=30 days

# Create another column (the target or dependent variable) shifted 'n' units up
df['Prediction'] = df[['Close_TTI']].shift(-forecast_out)

# Print the new data set
print(df.tail())

              Close_TTI  Prediction
Date                               
2020-01-13  3271.117341         NaN
2020-01-14  3285.341753         NaN
2020-01-15  3282.278416         NaN
2020-01-16  3303.024708         NaN
2020-01-17  3323.687711         NaN


In [8]:
### Create the independent data set (X)  #######
# Convert the dataframe to a numpy array
X = np.array(df.drop(['Prediction'],1))

#Remove the last 'n' rows
X = X[:-forecast_out]
print(X)

[[2269.96593885]
 [2267.77755148]
 [2267.9199402 ]
 [2288.89351774]
 [2298.63168304]
 [2299.00810699]
 [2286.00662248]
 [2274.0368119 ]
 [2285.60885808]
 [2276.7085474 ]
 [2288.54326724]
 [2294.28826004]
 [2295.87883345]
 [2289.5682007 ]
 [2296.72024447]
 [2312.26220692]
 [2321.75066935]
 [2326.1145067 ]
 [2335.5838446 ]
 [2349.64621896]
 [2343.00385024]
 [2354.89332434]
 [2361.11378877]
 [2367.50277483]
 [2355.69443128]
 [2365.2439082 ]
 [2366.07878793]
 [2380.13997603]
 [2394.73793243]
 [2380.91935327]
 [2375.23037136]
 [2370.73864383]
 [2369.7756078 ]
 [2363.49519763]
 [2372.52001953]
 [2371.5627722 ]
 [2368.54513898]
 [2370.38764778]
 [2387.67197931]
 [2383.68615631]
 [2378.23341378]
 [2379.22389239]
 [2342.99375151]
 [2345.96997311]
 [2350.40005582]
 [2329.14347247]
 [2339.82653833]
 [2356.54249628]
 [2361.31614908]
 [2364.80932223]
 [2362.32239363]
 [2354.77561573]
 [2366.56824467]
 [2353.79523672]
 [2356.59008789]
 [2357.15991211]
 [2353.91918456]
 [2352.10055173]
 [2341.9231434

In [9]:
### Create the dependent data set (y)  #####
# Convert the dataframe to a numpy array (All of the values including the NaN's)
y = np.array(df['Prediction'])

# Get all of the y values except the last n=10 rows
y = y[:-forecast_out]
print(y)

[2375.23037136 2370.73864383 2369.7756078  2363.49519763 2372.52001953
 2371.5627722  2368.54513898 2370.38764778 2387.67197931 2383.68615631
 2378.23341378 2379.22389239 2342.99375151 2345.96997311 2350.40005582
 2329.14347247 2339.82653833 2356.54249628 2361.31614908 2364.80932223
 2362.32239363 2354.77561573 2366.56824467 2353.79523672 2356.59008789
 2357.15991211 2353.91918456 2352.10055173 2341.92314349 2332.67104797
 2342.53090445 2346.75572715 2342.69130238 2354.75831343 2370.32345823
 2381.51801047 2388.98436475 2389.69552091 2393.65236051 2388.49988509
 2391.05047906 2386.50311989 2389.78929809 2392.38904246 2399.94019704
 2401.57178895 2396.78871272 2394.83925373 2392.43350791 2393.98787214
 2404.54480295 2382.90867795 2354.68743285 2371.3856697  2387.21427729
 2397.04742935 2401.41719984 2409.55228805 2414.50245296 2411.66765409
 2415.61346418 2415.67763924 2431.2800293  2437.827892   2431.91699626
 2432.03124635 2434.27001953 2436.35247827 2425.8900325  2434.14647528
 2443.

In [10]:
# Split the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

In [11]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((579, 1), (145, 1), (579,), (145,))

In [12]:
# Set x_forecast equal to the last n=30 rows of the original data set from Close_TTI column
x_forecast = np.array(df.drop(['Prediction'],1))[-forecast_out:]
print(x_forecast)

[[3119.20622743]
 [3134.64174577]
 [3141.84862724]
 [3135.35082544]
 [3135.75306703]
 [3141.2541305 ]
 [3166.6556632 ]
 [3183.64080336]
 [3195.37468012]
 [3195.19281735]
 [3192.32204554]
 [3223.30166704]
 [3226.06035174]
 [3225.4439518 ]
 [3227.12475913]
 [3247.17720544]
 [3240.07446862]
 [3215.23058293]
 [3244.63221373]
 [3226.35230939]
 [3217.55102568]
 [3241.84972297]
 [3238.59008789]
 [3266.03869032]
 [3281.73745629]
 [3271.11734128]
 [3285.34175289]
 [3282.27841622]
 [3303.0247076 ]
 [3323.68771057]]


### Train a Linear regression model on training set

In [13]:
# Create and train the Linear Regression  Model
linreg = LinearRegression()

#Cross-validation of train test split
print(cross_val_score(linreg, X_train, y_train, cv=3))
print(cross_val_score(linreg, X_train, y_train, cv=3).mean())

[0.8018343  0.79377674 0.72043614]
0.7720157259708582


In [14]:
# Create and train the Linear Regression  Model
linreg = LinearRegression()

# Train/Fit a Linear Regression Model onto the training set
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
# Predict on training set
y_pred_train = linreg.predict(X_train)

# Predict on test set
predictions = linreg.predict(X_test)

In [16]:
# Evaluating Performance Measures on training set

# Score returns the coefficient of determination R^2 (confidence) of the prediction. The best possible score is 1.0
score_train = linreg.score(X_train,y_train)
print ('R–Squared Score: {:4f}'.format(score_train))

trainScore = mean_squared_error(X_train, y_train)
print('Train Score: %.4f MSE (%.4f RMSE)' % (trainScore, math.sqrt(trainScore)))

R–Squared Score: 0.779210
Train Score: 11584.6624 MSE (107.6321 RMSE)


In [17]:
# Evaluating Performance Measures on validation (test) set

# Score returns the coefficient of determination R^2 (confidence) of the prediction. The best possible score is 1.0
score_test = linreg.score(X_test,y_test)
print ('R–Squared Score: {:4f}'.format(score_test))

testScore = mean_squared_error(X_test, y_test)
print('Test Score: %.8f MSE (%.8f RMSE)' % (testScore, math.sqrt(testScore)))

R–Squared Score: 0.836307
Test Score: 11845.45745466 MSE (108.83683868 RMSE)


### Note that our baseline is: RMSE 108.84

> From the above train and test scores, as the train score (107.63) is lower than the test (108.84) score, this implies that the the Linear Regression model is not overfit. The training performance and testing performance are relatively close which indicates some adjustments may be needed to be made to the model.
> 
> The above model R–Squared score of 0.836307 shows that the model has 83.6% accuracy rate in predicting the values. This is expected as predicting market prices is a difficult task as there are many other individual factors that may affect stock returns such as interest rates, inflation expectations, and other idiosyncratic factors.
>
> <i>Evaluation Metrics: R-squared value and Root Mean Squared Error (RMSE) are two metrics that can be used to evaluate the performance of the model. R-Squared is used for measuring the model accuracy. RMSE measures the average magnitude of the residuals or error. Ideally, lower RMSE and higher R-squared values are indicative of a good model.</i>

## Test the Linear Regression Model and Obtain Predictions

In [18]:
# Print linear regression model predictions for the next n=30 days
lr_prediction = linreg.predict(x_forecast)
print(lr_prediction)

[3116.20826224 3130.21778782 3136.75886976 3130.86135997 3131.22644093
 3136.21929492 3159.27413599 3174.69012335 3185.33997929 3185.1749177
 3182.56935887 3210.68696336 3213.19079012 3212.63133559 3214.15686345
 3232.35678738 3225.91022879 3203.36151715 3230.04691183 3213.4557756
 3205.46758844 3227.5214783  3224.56298089 3249.47577548 3263.72422886
 3254.08524107 3266.99554663 3264.21521282 3283.04488174 3301.79895674]


In [19]:
# Get the quote of the actual price for that day was 
SP500_quote2 = web.DataReader('^GSPC', data_source='yahoo', start='2020-01-21', end='2020-01-21')
print(SP500_quote2['Close'])

Date
2020-01-21    3320.790039
Name: Close, dtype: float64


> From the above, on the date (2020-01-21), the model predicted a S&P 500 Index Close price of 3,116.20, which gives an error of 204.58 (~6.16%) from the actual Close price of 3,320.79.
>
> Linear regression is a simple technique and fairly easy to interpret, however there are a few significant disadvantages of using linear regression with time series such as in using regression algorithms, the model tend not to take into account the previous values from the point of prediction, the model will consider the value from the same date or the same date/month a year ago. Hence, the predictions may not be accurate despite the model having a 83.6% accuracy score. 
>
> Therefore, I will explore other machine learning methods such as Auto Regressive Inteegration Moving Average (ARIMA) Model in the next notebook "3.2_Model_ARIMA"