Baseline Model: Linear Regression

In [3]:
# This program predicts stock prices by using machine learning models

# Import the libraries
import quandl
import numpy as np
import pandas as pd
import pandas_datareader as web
import math
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
df = pd.read_csv('../combined_twitter_index.csv', sep='\t')

In [6]:
df.drop(columns = ['Unnamed: 0'], axis=1, inplace = True)

In [12]:
# Preprocess data
df.index = pd.DatetimeIndex(df['Date'])
df.sort_index(ascending=True, inplace=True)
df.drop('Date', axis=1, inplace=True)
df=df.loc[:'2020-01-20',:]

KeyError: 'Date'

In [13]:
# Get the Close Price based on Trump Twitter Index
df = df[['Close_TTI']]

#Take a look at the new data
print(df.tail())

              Close_TTI
Date                   
2017-01-13  2272.732700
2017-01-17  2269.133968
2017-01-18  2269.142404
2017-01-19  2271.880591
2017-01-20  2269.954807


In [14]:
# A variable for predicting n=30 days out into the future
forecast_out = 30 # n=30 days

# Create another column (the target or dependent variable) shifted 'n' units up
df['Prediction'] = df[['Close_TTI']].shift(-forecast_out)

# Print the new data set
print(df.tail())

              Close_TTI  Prediction
Date                               
2017-01-13  2272.732700         NaN
2017-01-17  2269.133968         NaN
2017-01-18  2269.142404         NaN
2017-01-19  2271.880591         NaN
2017-01-20  2269.954807         NaN


In [15]:
### Create the independent data set (X)  #######
# Convert the dataframe to a numpy array
X = np.array(df.drop(['Prediction'],1))

#Remove the last 'n' rows
X = X[:-forecast_out]
print(X)

[[ 848.04021458]
 [ 805.51514863]
 [ 839.81898499]
 ...
 [2191.12066044]
 [2200.6514248 ]
 [2207.26178374]]


  X = np.array(df.drop(['Prediction'],1))


In [16]:
### Create the dependent data set (y)  #####
# Convert the dataframe to a numpy array (All of the values including the NaN's)
y = np.array(df['Prediction'])

# Get all of the y values except the last n=10 rows
y = y[:-forecast_out]
print(y)

[ 699.5832943   706.79145964  683.95473215 ... 2269.14240438 2271.88059079
 2269.95480707]


In [17]:
# Split the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

In [19]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((1588, 1), (398, 1), (1588,), (398,))

In [20]:
# Set x_forecast equal to the last n=30 rows of the original data set from Close_TTI column
x_forecast = np.array(df.drop(['Prediction'],1))[-forecast_out:]
print(x_forecast)

[[2210.98721516]
 [2241.10066186]
 [2249.73207552]
 [2258.82249402]
 [2263.34782962]
 [2268.31398804]
 [2253.74511012]
 [2266.80620094]
 [2259.2627658 ]
 [2266.52353403]
 [2270.54427591]
 [2262.92750767]
 [2260.25316604]
 [2266.23043504]
 [2270.13146734]
 [2249.49961973]
 [2251.57826668]
 [2251.57774895]
 [2261.54939322]
 [2268.17965316]
 [2271.12591303]
 [2273.61906259]
 [2269.71520053]
 [2268.59879454]
 [2271.13825658]
 [2272.73269973]
 [2269.13396836]
 [2269.14240438]
 [2271.88059079]
 [2269.95480707]]


  x_forecast = np.array(df.drop(['Prediction'],1))[-forecast_out:]


Train a Linear regression model on training set


In [21]:
# Create and train the Linear Regression  Model
linreg = LinearRegression()

#Cross-validation of train test split
print(cross_val_score(linreg, X_train, y_train, cv=3))
print(cross_val_score(linreg, X_train, y_train, cv=3).mean())

[0.97371277 0.97591757 0.97375501]
0.974461781976408


In [22]:
# Create and train the Linear Regression  Model
linreg = LinearRegression()

# Train/Fit a Linear Regression Model onto the training set
linreg.fit(X_train, y_train)


LinearRegression()

In [23]:
# Predict on training set
y_pred_train = linreg.predict(X_train)

# Predict on test set
predictions = linreg.predict(X_test)

In [24]:
# Evaluating Performance Measures on training set

# Score returns the coefficient of determination R^2 (confidence) of the prediction. The best possible score is 1.0
score_train = linreg.score(X_train,y_train)
print ('R–Squared Score: {:4f}'.format(score_train))

trainScore = mean_squared_error(X_train, y_train)
print('Train Score: %.4f MSE (%.4f RMSE)' % (trainScore, math.sqrt(trainScore)))


R–Squared Score: 0.974499
Train Score: 4865.3207 MSE (69.7519 RMSE)


In [25]:
# Evaluating Performance Measures on validation (test) set

# Score returns the coefficient of determination R^2 (confidence) of the prediction. The best possible score is 1.0
score_test = linreg.score(X_test,y_test)
print ('R–Squared Score: {:4f}'.format(score_test))

testScore = mean_squared_error(X_test, y_test)
print('Test Score: %.8f MSE (%.8f RMSE)' % (testScore, math.sqrt(testScore)))

R–Squared Score: 0.974894
Test Score: 4791.02025730 MSE (69.21719625 RMSE)


Note that our baseline is: RMSE 69.21

From the above train and test scores, as the train score (107.63) is lower than the test (108.84) score, this implies that the the Linear Regression model is not overfit. The training performance and testing performance are relatively close which indicates some adjustments may be needed to be made to the model.

The above model R–Squared score of 0.836307 shows that the model has 83.6% accuracy rate in predicting the values. This is expected as predicting market prices is a difficult task as there are many other individual factors that may affect stock returns such as interest rates, inflation expectations, and other idiosyncratic factors.

Evaluation Metrics: R-squared value and Root Mean Squared Error (RMSE) are two metrics that can be used to evaluate the performance of the model. R-Squared is used for measuring the model accuracy. RMSE measures the average magnitude of the residuals or error. Ideally, lower RMSE and higher R-squared values are indicative of a good model.