### 1. Analyze the Data

#### Import Libraries

In [1]:
import pandas as pd
import yfinance as yf
import statsmodels.api as sm

#### Data Preprocessing

In [2]:
data = pd.read_csv('Downloads/MSFT.csv')

In [3]:
df = data['Adj Close'].pct_change()*100

In [4]:
df = df.rename('Today')

In [5]:
df.index = pd.to_datetime(data['Date'])

In [6]:
df = df.reset_index()

In [7]:
df

Unnamed: 0,Date,Today
0,2017-05-19,
1,2017-05-22,1.122731
2,2017-05-23,0.336016
3,2017-05-24,0.131042
4,2017-05-25,1.236007
...,...,...
1254,2022-05-12,-1.995771
1255,2022-05-13,2.259639
1256,2022-05-16,0.145529
1257,2022-05-17,2.034420


In [8]:
for i in range (1,6):
    df['Lag '+str(i)]=df['Today'].shift(i)

In [13]:
df = df.dropna()

In [15]:
df['Direction'] = [1 if i>0 else 0 for i in df['Today']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Direction'] = [1 if i>0 else 0 for i in df['Today']]


In [17]:
df = sm.add_constant(df)

  x = pd.concat(x[::order], 1)


In [35]:
df.head()

Unnamed: 0,const,Date,Today,Lag 1,Lag 2,Lag 3,Lag 4,Lag 5,Direction
6,1.0,2017-05-30,0.643228,0.488381,1.236007,0.131042,0.336016,1.122731,1
7,1.0,2017-05-31,-0.809563,0.643228,0.488381,1.236007,0.131042,0.336016,0
8,1.0,2017-06-01,0.372288,-0.809563,0.643228,0.488381,1.236007,0.131042,1
9,1.0,2017-06-02,2.368064,0.372288,-0.809563,0.643228,0.488381,1.236007,1
10,1.0,2017-06-05,0.724597,2.368064,0.372288,-0.809563,0.643228,0.488381,1


### 2. Train the Model

Use Lags and the volume to predict the direction

In [19]:
x = df[['const','Lag 1','Lag 2','Lag 3','Lag 4','Lag 5']]
#x = df[['const','Today','Volume']]

In [20]:
y = df.Direction

In [21]:
model = sm.Logit(y,x)

In [22]:
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.680884
         Iterations 4


In [23]:
result.summary()

0,1,2,3
Dep. Variable:,Direction,No. Observations:,1253.0
Model:,Logit,Df Residuals:,1247.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 07 Jun 2022",Pseudo R-squ.:,0.009627
Time:,13:04:19,Log-Likelihood:,-853.15
converged:,True,LL-Null:,-861.44
Covariance Type:,nonrobust,LLR p-value:,0.005354

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2424,0.059,4.135,0.000,0.128,0.357
Lag 1,-0.1319,0.034,-3.887,0.000,-0.198,-0.065
Lag 2,-0.0282,0.034,-0.826,0.409,-0.095,0.039
Lag 3,-0.0255,0.034,-0.749,0.454,-0.092,0.041
Lag 4,-0.0002,0.034,-0.006,0.995,-0.067,0.066
Lag 5,-0.0164,0.033,-0.499,0.618,-0.081,0.048


In [24]:
prediction = result.predict(x)

In [25]:
prediction

6       0.530291
7       0.526648
8       0.578407
9       0.544697
10      0.483023
          ...   
1254    0.688764
1255    0.638304
1256    0.536377
1257    0.545037
1258    0.491809
Length: 1253, dtype: float64

### 3. Test the Model

In [30]:
x_train = df[df.Date.dt.year < 2020][['const','Lag 1','Lag 2','Lag 3','Lag 4','Lag 5']]
y_train = df[df.Date.dt.year < 2020]['Direction']
x_test = df[df.Date.dt.year > 2020][['const','Lag 1','Lag 2','Lag 3','Lag 4','Lag 5']]
y_test = df[df.Date.dt.year > 2020]['Direction']

In [31]:
model = sm.Logit(y_train,x_train)

In [32]:
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.673898
         Iterations 4


### 4. Prediction Accuracy

####  Train set prediction accuracy

In [26]:
def confusion_matrix(act, pred):
    predtrans = ['Up' if i>0.5 else 'Down' for i in pred]
    actuals = ['Up' if i>0 else 'Down' for i in act]
    confusion_matrix = pd.crosstab(pd.Series(actuals),
                                   pd.Series(predtrans),
                                  rownames=['Actual'],
                                  colnames=['Predicted'])
    return confusion_matrix

In [27]:
confusion_matrix(y,prediction)

Predicted,Down,Up
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,75,485
Up,67,626


In [28]:
len(df)

1253

In [29]:
Prediction_accuracy = (75+626)/len(df)
print(Prediction_accuracy)

0.5594573024740622


The accuracy of train set prediction: 55.9%

#### Test set prediction accuracy

In [33]:
confusion_matrix(y_test, prediction)

Predicted,Down,Up
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,10,159
Up,13,165


In [67]:
len(x_test)

347

In [34]:
#Prediction_accuracy
(10+165)/len(x_test)

0.5043227665706052

The accuracy of test set prediction: 50.4%