In [1]:
!pip install yfinance --quiet

[?25l[K     |█████▏                          | 10 kB 13.3 MB/s eta 0:00:01[K     |██████████▍                     | 20 kB 16.4 MB/s eta 0:00:01[K     |███████████████▋                | 30 kB 19.8 MB/s eta 0:00:01[K     |████████████████████▉           | 40 kB 23.0 MB/s eta 0:00:01[K     |██████████████████████████      | 51 kB 10.3 MB/s eta 0:00:01[K     |███████████████████████████████▎| 61 kB 11.6 MB/s eta 0:00:01[K     |████████████████████████████████| 62 kB 572 kB/s 
[?25h

### Predicting Market Direction
---

In [2]:
import pandas as pd
import numpy as np
import yfinance as yf
import statsmodels.api as sm

  import pandas.util.testing as tm


In [40]:
sensex_data = yf.download("^BSESN", start='2001-01-03', end='2005-12-31')
sensex_data

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001-01-03,3977.580078,4067.659912,3977.580078,4060.020020,4060.020020,0
2001-01-04,4180.970215,4180.970215,4109.549805,4115.370117,4115.370117,0
2001-01-05,4116.339844,4195.009766,4115.350098,4183.729980,4183.729980,0
2001-01-08,4164.759766,4206.720215,4101.529785,4120.430176,4120.430176,0
2001-01-09,4114.740234,4166.839844,4101.009766,4125.310059,4125.310059,0
...,...,...,...,...,...,...
2005-12-26,9254.089844,9262.480469,9050.509766,9085.889648,9085.889648,15400
2005-12-27,9071.250000,9297.070312,9020.799805,9283.160156,9283.160156,14600
2005-12-28,9292.179688,9350.820312,9207.269531,9257.509766,9257.509766,19600
2005-12-29,9272.629883,9338.110352,9271.629883,9323.250000,9323.250000,14200


In [42]:
# Calculate daily returns data
df = sensex_data['Adj Close'].pct_change()*100
df

Date
2001-01-03         NaN
2001-01-04    1.363296
2001-01-05    1.661087
2001-01-08   -1.512999
2001-01-09    0.118431
                ...   
2005-12-26   -1.847490
2005-12-27    2.171174
2005-12-28   -0.276311
2005-12-29    0.710129
2005-12-30    0.801005
Name: Adj Close, Length: 1245, dtype: float64

In [43]:
df = df.rename("Today").reset_index()
df

Unnamed: 0,Date,Today
0,2001-01-03,
1,2001-01-04,1.363296
2,2001-01-05,1.661087
3,2001-01-08,-1.512999
4,2001-01-09,0.118431
...,...,...
1240,2005-12-26,-1.847490
1241,2005-12-27,2.171174
1242,2005-12-28,-0.276311
1243,2005-12-29,0.710129


In [44]:
# Calculate the Lags
for i in range(1, 6):
    df['Lag_' + str(i)] = df['Today'].shift(i)

df

Unnamed: 0,Date,Today,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5
0,2001-01-03,,,,,,
1,2001-01-04,1.363296,,,,,
2,2001-01-05,1.661087,1.363296,,,,
3,2001-01-08,-1.512999,1.661087,1.363296,,,
4,2001-01-09,0.118431,-1.512999,1.661087,1.363296,,
...,...,...,...,...,...,...,...
1240,2005-12-26,-1.847490,-1.231178,0.354741,-0.075649,-0.511262,1.182724
1241,2005-12-27,2.171174,-1.847490,-1.231178,0.354741,-0.075649,-0.511262
1242,2005-12-28,-0.276311,2.171174,-1.847490,-1.231178,0.354741,-0.075649
1243,2005-12-29,0.710129,-0.276311,2.171174,-1.847490,-1.231178,0.354741


In [45]:
# Lets get the volume of the prior day
# Divide by 1,000,000,000 to scale
df['Volume'] = sp500_data.Volume.shift(1).values/1000000000

In [46]:
df = df.dropna()
df

Unnamed: 0,Date,Today,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Volume
6,2001-01-11,-0.506715,-1.882772,0.118431,-1.512999,1.661087,1.363296,0.000000
7,2001-01-12,0.234663,-0.506715,-1.882772,0.118431,-1.512999,1.661087,0.000000
8,2001-01-15,0.252192,0.234663,-0.506715,-1.882772,0.118431,-1.512999,0.000000
9,2001-01-16,0.592325,0.252192,0.234663,-0.506715,-1.882772,0.118431,0.000000
10,2001-01-17,0.311490,0.592325,0.252192,0.234663,-0.506715,-1.882772,0.000000
...,...,...,...,...,...,...,...,...
1240,2005-12-26,-1.847490,-1.231178,0.354741,-0.075649,-0.511262,1.182724,0.000016
1241,2005-12-27,2.171174,-1.847490,-1.231178,0.354741,-0.075649,-0.511262,0.000015
1242,2005-12-28,-0.276311,2.171174,-1.847490,-1.231178,0.354741,-0.075649,0.000015
1243,2005-12-29,0.710129,-0.276311,2.171174,-1.847490,-1.231178,0.354741,0.000020


In [47]:
# Now add in the direction
# 1 for up and 0 for down
df['Direction'] = [1 if i > 0 else 0 for i in df['Today']]
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Date,Today,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Volume,Direction
6,2001-01-11,-0.506715,-1.882772,0.118431,-1.512999,1.661087,1.363296,0.000000,0
7,2001-01-12,0.234663,-0.506715,-1.882772,0.118431,-1.512999,1.661087,0.000000,1
8,2001-01-15,0.252192,0.234663,-0.506715,-1.882772,0.118431,-1.512999,0.000000,1
9,2001-01-16,0.592325,0.252192,0.234663,-0.506715,-1.882772,0.118431,0.000000,1
10,2001-01-17,0.311490,0.592325,0.252192,0.234663,-0.506715,-1.882772,0.000000,1
...,...,...,...,...,...,...,...,...,...
1240,2005-12-26,-1.847490,-1.231178,0.354741,-0.075649,-0.511262,1.182724,0.000016,0
1241,2005-12-27,2.171174,-1.847490,-1.231178,0.354741,-0.075649,-0.511262,0.000015,1
1242,2005-12-28,-0.276311,2.171174,-1.847490,-1.231178,0.354741,-0.075649,0.000015,0
1243,2005-12-29,0.710129,-0.276311,2.171174,-1.847490,-1.231178,0.354741,0.000020,1


In [48]:
# Add a constant so that sm_api provides an intercept
df = sm.add_constant(df)
df

  x = pd.concat(x[::order], 1)


Unnamed: 0,const,Date,Today,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Volume,Direction
6,1.0,2001-01-11,-0.506715,-1.882772,0.118431,-1.512999,1.661087,1.363296,0.000000,0
7,1.0,2001-01-12,0.234663,-0.506715,-1.882772,0.118431,-1.512999,1.661087,0.000000,1
8,1.0,2001-01-15,0.252192,0.234663,-0.506715,-1.882772,0.118431,-1.512999,0.000000,1
9,1.0,2001-01-16,0.592325,0.252192,0.234663,-0.506715,-1.882772,0.118431,0.000000,1
10,1.0,2001-01-17,0.311490,0.592325,0.252192,0.234663,-0.506715,-1.882772,0.000000,1
...,...,...,...,...,...,...,...,...,...,...
1240,1.0,2005-12-26,-1.847490,-1.231178,0.354741,-0.075649,-0.511262,1.182724,0.000016,0
1241,1.0,2005-12-27,2.171174,-1.847490,-1.231178,0.354741,-0.075649,-0.511262,0.000015,1
1242,1.0,2005-12-28,-0.276311,2.171174,-1.847490,-1.231178,0.354741,-0.075649,0.000015,0
1243,1.0,2005-12-29,0.710129,-0.276311,2.171174,-1.847490,-1.231178,0.354741,0.000020,1


In [49]:
# Independent vars
X = df[['const', 'Lag_1','Lag_2','Lag_3','Lag_4','Lag_5','Volume']]

In [50]:
# Dependent variable is market diretion
y = df.Direction

In [51]:
y

6       0
7       1
8       1
9       1
10      1
       ..
1240    0
1241    1
1242    0
1243    1
1244    1
Name: Direction, Length: 1239, dtype: int64

In [52]:
# Define a Logit model
model = sm.Logit(y, X)
# Fit the model
result = model.fit()

         Current function value: 10.380282
         Iterations: 35




In [53]:
result.summary()

0,1,2,3
Dep. Variable:,Direction,No. Observations:,1239.0
Model:,Logit,Df Residuals:,1232.0
Method:,MLE,Df Model:,6.0
Date:,"Sat, 06 Aug 2022",Pseudo R-squ.:,-14.08
Time:,15:33:29,Log-Likelihood:,-12861.0
converged:,False,LL-Null:,-853.09
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,17.9823,2.664,6.750,0.000,12.761,23.204
Lag_1,5.9353,0.936,6.338,0.000,4.100,7.771
Lag_2,0.3401,0.225,1.513,0.130,-0.101,0.781
Lag_3,3.2967,0.518,6.361,0.000,2.281,4.313
Lag_4,4.5717,0.716,6.389,0.000,3.169,5.974
Lag_5,0.9593,0.195,4.924,0.000,0.577,1.341
Volume,-1.782e+06,2.69e+05,-6.623,0.000,-2.31e+06,-1.25e+06


None of the Lags are statistically significant as per p-values

In [54]:
# Predict if the market is going up or down
mkt_dir_prediction = result.predict(X)

In [55]:
mkt_dir_prediction

6       9.999788e-01
7       9.999174e-01
8       9.999944e-01
9       9.999168e-01
10      1.000000e+00
            ...     
1240    6.708951e-09
1241    1.226732e-09
1242    8.464188e-01
1243    2.011023e-13
1244    3.410503e-03
Length: 1239, dtype: float64

In [56]:
# Did our model perform well?
def confusion_matrix(act, pred):
    # In Logistic regresion a prediction > 0.5 is rounded up to 1
    # predicted_values_transform = ['Up' if i > 0.5 else 'Down' for i in pred]
    # actual_values = ['Up' if i > 0 else 'Down' for i in act]
    predicted_values_transform = np.where(pred > 0.5, 'Up', 'Down')
    actual_values = np.where(act > 0, 'Up', 'Down')
    # Convert dataframe to series for this to work
    confusion_matrix = pd.crosstab(pd.Series(actual_values.squeeze()), 
                                    pd.Series(predicted_values_transform.squeeze()),
                                    rownames=['Actual'],
                                    colnames=['Predicted'])
    return confusion_matrix

In [57]:
y.ndim

1

In [58]:
mkt_dir_prediction.ndim

1

In [59]:
confusion_matrix(y, mkt_dir_prediction)

Predicted,Down,Up
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,242,318
Up,316,363


In [60]:
len(df)

1239

In [61]:
# Cprrect prediction divided by total number of observations
(155+504)/1250

0.5272

While it seems as though we are doing better than a coin toss, we are training and testing the model on the same data!


#### Split into Train and test datasets

Train data pre-2005. Test data in 2005

In [62]:
X_train = df[df.Date.dt.year < 2005][['const', 'Lag_1','Lag_2','Lag_3','Lag_4','Lag_5','Volume']]
y_train = df[df.Date.dt.year < 2005][['Direction']]
X_test = df[df.Date.dt.year == 2005][['const', 'Lag_1','Lag_2','Lag_3','Lag_4','Lag_5','Volume']]
y_test = df[df.Date.dt.year == 2005][['Direction']]

In [63]:
y_test

Unnamed: 0,Direction
997,1
998,0
999,0
1000,0
1001,1
...,...
1240,0
1241,1
1242,0
1243,1


In [64]:
model = sm.Logit(y_train, X_train)

In [65]:
result = model.fit()

         Current function value: 12.260159
         Iterations: 35




In [66]:
prediction = result.predict(X_test).to_frame()
prediction

Unnamed: 0,0
997,1.791763e-08
998,5.118998e-09
999,1.069304e-13
1000,1.810188e-20
1001,1.588592e-18
...,...
1240,8.715514e-14
1241,1.744850e-13
1242,1.039613e-10
1243,1.255850e-17


In [67]:
print(y_test.ndim)
print(prediction.ndim)

2
2


In [68]:
print(y_test.squeeze().ndim)
print(prediction.squeeze().ndim)

1
1


In [69]:
confusion_matrix(y_test, prediction)

Predicted,Down,Up
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,107,0
Up,140,1


In [70]:
# How does this perform?
(81+38)/len(X_test)

0.4798387096774194

In [71]:
# What happens if you drop less relevant variables?
X_train = df[df.Date.dt.year < 2005][['const', 'Lag_1','Lag_2']]
y_train = df[df.Date.dt.year < 2005][['Direction']]
X_test = df[df.Date.dt.year == 2005][['const', 'Lag_1','Lag_2']]
y_test = df[df.Date.dt.year == 2005][['Direction']]

In [72]:
model = sm.Logit(y_train, X_train)
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.684076
         Iterations 4


In [73]:
prediction = result.predict(X_test)

In [74]:
confusion_matrix(y_test, prediction)

Predicted,Down,Up
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,16,91
Up,15,126


In [75]:
(40 + 104)/len(X_test)

0.5806451612903226

#### We do better off by only taking the returns on the previous two days!