Getting and Preparing the Data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Load the five-minute price series; the "time" column is parsed as
# datetimes and used as the index.
data = pd.read_csv(
    "five_minute.csv",
    index_col="time",
    parse_dates=["time"],
)

In [5]:
# Log return of each bar relative to the previous one.
# The price column is selected explicitly: dividing the whole frame only
# works while it has exactly one column, so re-running the cell (when
# "returns" already exists) would otherwise raise a ValueError.
data["returns"] = np.log(data["price"] / data["price"].shift(1))

In [7]:
# Remove rows containing NaN -- at minimum the first row, whose return is
# undefined because shift(1) has no earlier price to compare against.
data.dropna(inplace = True)

In [11]:
# Market direction per bar: +1 (up), -1 (down) or 0 (unchanged),
# taken as the sign of the log return.
data["direction"] = np.sign(data["returns"])

In [13]:
# Inspect the frame: price, log return and direction (sign of the return).
data

Unnamed: 0_level_0,price,returns,direction
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01 22:05:00+00:00,1.146350,-0.000201,-1.0
2019-01-01 22:10:00+00:00,1.146320,-0.000026,-1.0
2019-01-01 22:15:00+00:00,1.146320,0.000000,0.0
2019-01-01 22:20:00+00:00,1.146530,0.000183,1.0
2019-01-01 22:25:00+00:00,1.146475,-0.000048,-1.0
...,...,...,...
2019-12-30 23:35:00+00:00,1.120180,-0.000004,-1.0
2019-12-30 23:40:00+00:00,1.120210,0.000027,1.0
2019-12-30 23:45:00+00:00,1.120295,0.000076,1.0
2019-12-30 23:50:00+00:00,1.120275,-0.000018,-1.0


In [15]:
# Display the frame again; the output is identical to the previous display.
data

Unnamed: 0_level_0,price,returns,direction
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01 22:05:00+00:00,1.146350,-0.000201,-1.0
2019-01-01 22:10:00+00:00,1.146320,-0.000026,-1.0
2019-01-01 22:15:00+00:00,1.146320,0.000000,0.0
2019-01-01 22:20:00+00:00,1.146530,0.000183,1.0
2019-01-01 22:25:00+00:00,1.146475,-0.000048,-1.0
...,...,...,...
2019-12-30 23:35:00+00:00,1.120180,-0.000004,-1.0
2019-12-30 23:40:00+00:00,1.120210,0.000027,1.0
2019-12-30 23:45:00+00:00,1.120295,0.000076,1.0
2019-12-30 23:50:00+00:00,1.120275,-0.000018,-1.0


In [17]:
# How many bars moved up (1), moved down (-1) or stayed flat (0).
data["direction"].value_counts()

direction
 1.0    36058
-1.0    35702
 0.0     1959
Name: count, dtype: int64

In [19]:
# Number of lagged-return columns (lag1 ... lagN) to build as features.
lags = 5

In [21]:
# Build the feature columns lag1 ... lag<lags>: column "lagk" holds the
# return observed k bars earlier.
cols = ["lag{}".format(k) for k in range(1, lags + 1)]
for lag, col in enumerate(cols, start=1):
    data[col] = data["returns"].shift(lag)
# The first rows have no complete lag history (NaN) and are discarded.
data.dropna(inplace=True)

In [23]:
# Frame after adding the lag columns; the leading rows whose lags were NaN
# have been dropped.
data

Unnamed: 0_level_0,price,returns,direction,lag1,lag2,lag3,lag4,lag5
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-01 22:30:00+00:00,1.146455,-0.000017,-1.0,-0.000048,0.000183,0.000000,-0.000026,-0.000201
2019-01-01 22:35:00+00:00,1.146455,0.000000,0.0,-0.000017,-0.000048,0.000183,0.000000,-0.000026
2019-01-01 22:40:00+00:00,1.146370,-0.000074,-1.0,0.000000,-0.000017,-0.000048,0.000183,0.000000
2019-01-01 22:45:00+00:00,1.146315,-0.000048,-1.0,-0.000074,0.000000,-0.000017,-0.000048,0.000183
2019-01-01 22:50:00+00:00,1.146475,0.000140,1.0,-0.000048,-0.000074,0.000000,-0.000017,-0.000048
...,...,...,...,...,...,...,...,...
2019-12-30 23:35:00+00:00,1.120180,-0.000004,-1.0,-0.000112,-0.000018,0.000022,-0.000004,0.000089
2019-12-30 23:40:00+00:00,1.120210,0.000027,1.0,-0.000004,-0.000112,-0.000018,0.000022,-0.000004
2019-12-30 23:45:00+00:00,1.120295,0.000076,1.0,0.000027,-0.000004,-0.000112,-0.000018,0.000022
2019-12-30 23:50:00+00:00,1.120275,-0.000018,-1.0,0.000076,0.000027,-0.000004,-0.000112,-0.000018


Next step:
Standardize all lag columns so that each has a mean of 0 and a standard deviation of 1.

In [26]:
# Column-wise average of every lag feature (computed over the whole sample),
# kept for the standardisation step.
means = data.loc[:, cols].mean(axis=0)
means

lag1   -3.142659e-07
lag2   -3.115388e-07
lag3   -3.125681e-07
lag4   -3.132864e-07
lag5   -3.159474e-07
dtype: float64

In [28]:
# Column-wise sample standard deviation of every lag feature (computed over
# the whole sample), kept for the standardisation step.
stand_devs = data.loc[:, cols].std(axis=0)
stand_devs

lag1    0.000199
lag2    0.000199
lag3    0.000199
lag4    0.000199
lag5    0.000199
dtype: float64

In [30]:
# Z-score every lag column: subtract its mean, then divide by its standard
# deviation. `means` and `stand_devs` were taken from the unscaled data, so
# this cell must run exactly once -- running it again would rescale values
# that are already standardised.
data[cols] = data[cols].sub(means).div(stand_devs)
data

Unnamed: 0_level_0,price,returns,direction,lag1,lag2,lag3,lag4,lag5
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-01 22:30:00+00:00,1.146455,-0.000017,-1.0,-0.240031,0.924152,0.001574,-0.130230,-1.008816
2019-01-01 22:35:00+00:00,1.146455,0.000000,0.0,-0.086280,-0.240044,0.924158,0.001578,-0.130216
2019-01-01 22:40:00+00:00,1.146370,-0.000074,-1.0,0.001583,-0.086293,-0.240039,0.924162,0.001591
2019-01-01 22:45:00+00:00,1.146315,-0.000048,-1.0,-0.371850,0.001569,-0.086288,-0.240035,0.924169
2019-01-01 22:50:00+00:00,1.146475,0.000140,1.0,-0.240065,-0.371862,0.001574,-0.086284,-0.240020
...,...,...,...,...,...,...,...,...
2019-12-30 23:35:00+00:00,1.120180,-0.000004,-1.0,-0.560410,-0.088343,0.113965,-0.020901,0.451175
2019-12-30 23:40:00+00:00,1.120210,0.000027,1.0,-0.020898,-0.560420,-0.088338,0.113969,-0.020887
2019-12-30 23:45:00+00:00,1.120295,0.000076,1.0,0.136467,-0.020912,-0.560415,-0.088335,0.113981
2019-12-30 23:50:00+00:00,1.120275,-0.000018,-1.0,0.383736,0.136453,-0.020907,-0.560412,-0.088321


In [32]:
# Sanity check: after scaling, each lag column should average ~0
# (up to floating-point noise).
data.loc[:, cols].mean()

lag1    1.041032e-17
lag2    1.542269e-18
lag3   -1.118145e-17
lag4    5.397942e-18
lag5   -1.079588e-17
dtype: float64

In [34]:
# Sanity check: after scaling, each lag column should have standard deviation 1.
data.loc[:, cols].std()

lag1    1.0
lag2    1.0
lag3    1.0
lag4    1.0
lag5    1.0
dtype: float64

////////////////////////////////

Predicting Market Direction with Logistic Regression

In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [72]:
# One-vs-rest wrapper around logistic regression: one binary classifier per
# direction class (-1, 0, 1). The very large C makes the regularisation
# penalty negligible; max_iter is raised well above the default iteration budget.
lm = OneVsRestClassifier(
    estimator=LogisticRegression(C=1e6, max_iter=100_000)
)

In [74]:
# Fit on the full sample: standardised lagged returns are the features,
# the sign of the current return is the target.
lm.fit(X=data[cols], y=data["direction"])

In [76]:
# In-sample prediction: the model is scored on the same rows it was fitted on.
data["pred"] = lm.predict(X=data[cols])

In [78]:
# Frame now includes the in-sample prediction column "pred".
data

Unnamed: 0_level_0,price,returns,direction,lag1,lag2,lag3,lag4,lag5,pred
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-01-01 22:30:00+00:00,1.146455,-0.000017,-1.0,-0.240031,0.924152,0.001574,-0.130230,-1.008816,1.0
2019-01-01 22:35:00+00:00,1.146455,0.000000,0.0,-0.086280,-0.240044,0.924158,0.001578,-0.130216,1.0
2019-01-01 22:40:00+00:00,1.146370,-0.000074,-1.0,0.001583,-0.086293,-0.240039,0.924162,0.001591,-1.0
2019-01-01 22:45:00+00:00,1.146315,-0.000048,-1.0,-0.371850,0.001569,-0.086288,-0.240035,0.924169,1.0
2019-01-01 22:50:00+00:00,1.146475,0.000140,1.0,-0.240065,-0.371862,0.001574,-0.086284,-0.240020,1.0
...,...,...,...,...,...,...,...,...,...
2019-12-30 23:35:00+00:00,1.120180,-0.000004,-1.0,-0.560410,-0.088343,0.113965,-0.020901,0.451175,1.0
2019-12-30 23:40:00+00:00,1.120210,0.000027,1.0,-0.020898,-0.560420,-0.088338,0.113969,-0.020887,1.0
2019-12-30 23:45:00+00:00,1.120295,0.000076,1.0,0.136467,-0.020912,-0.560415,-0.088335,0.113981,1.0
2019-12-30 23:50:00+00:00,1.120275,-0.000018,-1.0,0.383736,0.136453,-0.020907,-0.560412,-0.088321,-1.0


In [80]:
# Distribution of the predicted directions.
data["pred"].value_counts()

pred
 1.0    41414
-1.0    32300
Name: count, dtype: int64

In [82]:
# Sign of (actual * predicted): +1 when both agree in sign, -1 when they
# disagree, 0 when either of them is 0.
hits = np.sign(data["direction"].mul(data["pred"])).value_counts()

In [84]:
# Counts of sign(direction * pred): 1 = same sign, -1 = opposite sign,
# 0 = at least one of the two values is 0.
hits

 1.0    37636
-1.0    34120
 0.0     1958
Name: count, dtype: int64

In [86]:
# Share of bars whose predicted direction equals the actual direction.
# Comparing the two columns directly (instead of counting sign products)
# also credits a correctly predicted flat bar (0 == 0), and does not fail
# with a KeyError when there are no hits at all.
hit_ratio = (data["direction"] == data["pred"]).mean()
hit_ratio

0.5105678704181024

In [88]:
from sklearn.metrics import accuracy_score

In [90]:
# Cross-check the hit ratio with scikit-learn's accuracy metric.
accuracy_score(y_true=data["direction"], y_pred=data["pred"])

0.5105678704181024