In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

# Stock Market

In [3]:
# Read the stock-market table from the CSV stored next to this notebook.
smarket = pd.read_csv(filepath_or_buffer="smarketcsv.csv")
# Show the first five rows as a quick sanity check of the parsed columns.
smarket.head(n=5)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [4]:
# Encode the Direction column as a 0/1 indicator (1 for 'Up', 0 otherwise)
# so that it can take part in the correlation matrix.
smarket['Up'] = np.where(smarket['Direction'] == 'Up', 1, 0)
# Correlate the numeric columns only: the string-valued Direction column has
# no correlation defined, and recent pandas versions raise an error on it
# instead of silently dropping it.
smarket.select_dtypes(include='number').corr()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Up
Year,1.0,0.0297,0.030596,0.033195,0.035689,0.029788,0.539006,0.030095,0.074608
Lag1,0.0297,1.0,-0.026294,-0.010803,-0.002986,-0.005675,0.04091,-0.026155,-0.039757
Lag2,0.030596,-0.026294,1.0,-0.025897,-0.010854,-0.003558,-0.043383,-0.01025,-0.024081
Lag3,0.033195,-0.010803,-0.025897,1.0,-0.024051,-0.018808,-0.041824,-0.002448,0.006132
Lag4,0.035689,-0.002986,-0.010854,-0.024051,1.0,-0.027084,-0.048414,-0.0069,0.004215
Lag5,0.029788,-0.005675,-0.003558,-0.018808,-0.027084,1.0,-0.022002,-0.03486,0.005423
Volume,0.539006,0.04091,-0.043383,-0.041824,-0.048414,-0.022002,1.0,0.014592,0.022951
Today,0.030095,-0.026155,-0.01025,-0.002448,-0.0069,-0.03486,0.014592,1.0,0.730563
Up,0.074608,-0.039757,-0.024081,0.006132,0.004215,0.005423,0.022951,0.730563,1.0


An interesting, if unsurprising, thing to note here is that there is very little correlation between the Lag columns and the Today column. This comes as no surprise, since today's returns show almost no linear dependence on previous days' returns.

In [8]:
# Feature matrix: the five lagged-return columns as a plain NumPy array.
X = smarket.loc[:, ["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"]].to_numpy()
# Target vector: the 0/1 "Up" indicator column.
Y = smarket.loc[:, "Up"].to_numpy()
# Display the target array.
Y

array([1, 1, 0, ..., 1, 0, 0])

In [9]:
# Split by calendar year: rows with Year before 2005 form the training set,
# every remaining row (2005 onward) forms the test set.
train_bool = (smarket["Year"] < 2005).to_numpy()
X_train, X_test = X[train_bool], X[~train_bool]
Y_train, Y_test = Y[train_bool], Y[~train_bool]

In [11]:
# Fit a logistic regression for the Up indicator with statsmodels' formula
# interface. Note that this fit uses the whole smarket frame (not the
# train/test split) and includes Volume in addition to the five lags.
results = smf.logit(
    formula="Up ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume",
    data=smarket,
).fit()
# Render the coefficient table and fit statistics.
results.summary()

Optimization terminated successfully.
         Current function value: 0.691034
         Iterations 4


0,1,2,3
Dep. Variable:,Up,No. Observations:,1250.0
Model:,Logit,Df Residuals:,1243.0
Method:,MLE,Df Model:,6.0
Date:,"Mon, 16 Sep 2019",Pseudo R-squ.:,0.002074
Time:,11:02:59,Log-Likelihood:,-863.79
converged:,True,LL-Null:,-865.59
Covariance Type:,nonrobust,LLR p-value:,0.7319

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.1260,0.241,-0.523,0.601,-0.598,0.346
Lag1,-0.0731,0.050,-1.457,0.145,-0.171,0.025
Lag2,-0.0423,0.050,-0.845,0.398,-0.140,0.056
Lag3,0.0111,0.050,0.222,0.824,-0.087,0.109
Lag4,0.0094,0.050,0.187,0.851,-0.089,0.107
Lag5,0.0103,0.050,0.208,0.835,-0.087,0.107
Volume,0.1354,0.158,0.855,0.392,-0.175,0.446


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [15]:
# Logistic regression with scikit-learn's default settings, trained on the
# pre-2005 rows (five lag features only).
lr = LogisticRegression()
# fit() returns the estimator, so leaving it last displays its parameters.
lr.fit(X=X_train, y=Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# Confusion matrix on the held-out rows: rows are the true labels,
# columns are the logistic-regression predictions (0 = down, 1 = up).
confusion_matrix(y_true=Y_test, y_pred=lr.predict(X_test))

array([[ 37,  74],
       [ 31, 110]])

In [19]:
# Overall test accuracy of the logistic regression: the share of held-out
# rows whose predicted label equals the true label. Computed from the
# predictions instead of a hand-copied count of correct rows, so the figure
# stays correct if the data, the split, or the model changes.
float((lr.predict(X_test) == Y_test).mean())

0.5833333333333334

Out of the 68 days predicted down, 37 actually were down days: 54% accuracy.

Out of the 184 predicted up, 110 actually were up. 60% accuracy.

58% total accuracy

In [21]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [22]:
# Linear discriminant analysis: fit() returns the estimator itself, so the
# construction and training can be chained into one assignment.
lda = LinearDiscriminantAnalysis().fit(X=X_train, y=Y_train)
# Confusion matrix on the held-out rows (true labels by row, predictions by column).
confusion_matrix(y_true=Y_test, y_pred=lda.predict(X_test))

array([[ 37,  74],
       [ 30, 111]])

The Linear Discriminant Analysis performs almost identically to the Logistic Regression.

In [23]:
# Quadratic discriminant analysis: fit() returns the estimator itself, so the
# construction and training can be chained into one assignment.
qda = QuadraticDiscriminantAnalysis().fit(X=X_train, y=Y_train)
# Confusion matrix on the held-out rows (true labels by row, predictions by column).
confusion_matrix(y_true=Y_test, y_pred=qda.predict(X_test))

array([[ 37,  74],
       [ 35, 106]])

In [25]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 3; fit() returns the estimator
# itself, so the construction and training can be chained into one assignment.
knn = KNeighborsClassifier(n_neighbors=3).fit(X=X_train, y=Y_train)
# Confusion matrix on the held-out rows (true labels by row, predictions by column).
confusion_matrix(y_true=Y_test, y_pred=knn.predict(X_test))


array([[46, 65],
       [62, 79]])