<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Python for Algorithmic Trading 

**Chapter 05 &mdash; Predicting Market Movements with Machine Learning**

## Using Linear Regression for Market Movement Prediction

### A Quick Review of Linear Regression

In [None]:
import numpy as np

In [None]:
%matplotlib inline
from pylab import mpl, plt
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so pick whichever one this
# installation actually ships instead of failing on a missing style.
for _style in ('seaborn-v0_8', 'seaborn'):
    if _style in plt.style.available:
        plt.style.use(_style)
        break
mpl.rcParams['font.family'] = 'serif'

In [None]:
x = np.linspace(0, 10)

In [None]:
np.random.seed(100)

In [None]:
y = x + np.random.standard_normal(len(x))

In [None]:
reg = np.polyfit(x, y, deg=1)

In [None]:
reg

In [None]:
# Scatter the noisy sample and overlay the fitted first-degree polynomial.
plt.figure(figsize=(10, 6))
plt.plot(x, y, 'bo', label='data')
plt.plot(x, np.polyval(reg, x), 'r',
         linewidth=2.5, label='linear regression')
plt.legend(loc='best');
# plt.savefig('../../images/ch05/lr_plot_1.png')

In [None]:
# Evaluate the fitted line on a grid twice as wide as the sample (0..20
# instead of 0..10) to show how the regression extrapolates.
xn = np.linspace(0, 20)
plt.figure(figsize=(10, 6))
plt.plot(x, y, 'bo', label='data')
plt.plot(xn, np.polyval(reg, xn), 'r',
         linewidth=2.5, label='linear regression')
plt.legend(loc='best');
# plt.savefig('../../images/ch05/lr_plot_2.png')

### The Basic Idea for Price Prediction

In [None]:
x = np.arange(12)

In [None]:
x

In [None]:
lags = 3

In [None]:
m = np.zeros((lags + 1, len(x) - lags))

In [None]:
# Rows 0..lags-1 hold the series shifted by i steps (the lagged features);
# the last row holds the values to be explained by them.
for i in range(lags):
    m[i] = x[i:len(x) - lags + i]
m[-1] = x[lags:]

In [None]:
m.T

In [None]:
reg = np.linalg.lstsq(m[:lags].T, m[lags], rcond=None)[0]

In [None]:
reg

In [None]:
np.dot(m[:lags].T, reg)

### Predicting Index Levels

In [None]:
import pandas as pd

In [None]:
raw = pd.read_csv('http://hilpisch.com/tr_eikon_eod_data.csv',
                  index_col=0, parse_dates=True).dropna()

In [None]:
data = pd.DataFrame(raw['.SPX'])

In [None]:
data.rename(columns={'.SPX': 'price'}, inplace=True)

In [None]:
lags = 3

In [None]:
# One feature column per lag: lag_k is the price k rows earlier.
cols = [f'lag_{lag}' for lag in range(1, lags + 1)]
for lag, col in enumerate(cols, start=1):
    data[col] = data['price'].shift(lag) # <1>
# Shifting leaves NaN in the first `lags` rows; discard them.
data.dropna(inplace=True)

In [None]:
reg = np.linalg.lstsq(data[cols], data['price'],
                      rcond=None)[0]

In [None]:
reg

In [None]:
data['prediction'] = np.dot(data[cols], reg)

In [None]:
data[['price', 'prediction']].plot(figsize=(10, 6));
# plt.savefig('../../images/ch05/lr_plot_3.png')

In [None]:
data[['price', 'prediction']].loc['2018-3-1':].plot(
            figsize=(10, 6));
# plt.savefig('../../images/ch05/lr_plot_4.png')

### Predicting Future Returns

In [None]:
data['returns'] = np.log(data['price'] /
                         data['price'].shift(1))

In [None]:
data.dropna(inplace=True)

In [None]:
# Overwrite the lag_k columns so that they now hold the log return observed
# k rows earlier (they previously held lagged prices).
cols = [f'lag_{lag}' for lag in range(1, lags + 1)]
for lag, col in enumerate(cols, start=1):
    data[col] = data['returns'].shift(lag) # <3>
# Shifting leaves NaN in the first `lags` rows; discard them.
data.dropna(inplace=True)

In [None]:
reg = np.linalg.lstsq(data[cols], data['returns'],
                      rcond=None)[0]

In [None]:
reg

In [None]:
data['prediction'] = np.dot(data[cols], reg)

In [None]:
data[['returns', 'prediction']].iloc[lags:].plot(figsize=(10, 6));
# plt.savefig('../../images/ch05/lr_plot_5.png')

In [None]:
hits = np.sign(data['returns'] *
               data['prediction']).value_counts()

In [None]:
hits

In [None]:
# Share of rows where the predicted and the realised return have the same
# sign (sign of the product == +1). value_counts() sorts by frequency, so
# position 0 is merely the most common outcome; select the +1 bucket by
# label so the ratio is still correct when misses outnumber hits.
hits.get(1.0, 0) / hits.sum()

### Predicting Future Market Direction

In [None]:
reg = np.linalg.lstsq(data[cols], np.sign(data['returns']),
                      rcond=None)[0]

In [None]:
reg

In [None]:
data['prediction'] = np.sign(np.dot(data[cols], reg))

In [None]:
data['prediction'].value_counts()

In [None]:
hits = np.sign(data['returns'] *
               data['prediction']).value_counts()

In [None]:
hits

In [None]:
# Share of rows where the predicted direction matches the sign of the
# realised return (sign of the product == +1). value_counts() sorts by
# frequency, so position 0 is merely the most common outcome; select the
# +1 bucket by label so the ratio is still correct when misses dominate.
hits.get(1.0, 0) / hits.sum()

### Vectorized Backtesting of Regression-based Strategy 

In [None]:
data.head()

In [None]:
data['strategy'] = data['prediction'] * data['returns']

In [None]:
data[['returns', 'strategy']].dropna().cumsum(
        ).apply(np.exp).plot(figsize=(10, 6));
# plt.savefig('../../images/ch05/lr_plot_6.png')

### Generalizing the Approach

In [None]:
import LRVectorBacktester as LR

In [None]:
lrbt = LR.LRVectorBacktester('.SPX', '2010-1-1', '2018-06-29',
                                     10000, 0.0)

In [None]:
lrbt.run_strategy('2010-1-1', '2018-01-01',
                  '2010-1-1', '2018-01-01', lags=3)

In [None]:
# Fit on 2010-2015 and evaluate on the period that follows. The evaluation
# window must start after the fitting window ends, otherwise the
# "out-of-sample" result is contaminated by data the model was fitted on.
# NOTE(review): assumes the arguments are (fit start, fit end, test start,
# test end), as the other run_strategy calls in this notebook suggest.
lrbt.run_strategy('2010-1-1', '2015-12-31',
                  '2016-1-1', '2018-01-01', lags=3)

In [None]:
lrbt.plot_results()
# plt.savefig('../../images/ch05/lr_plot_7.png')

In [None]:
lrbt = LR.LRVectorBacktester('GDX', '2010-1-1', '2018-06-29',
                                     10000, 0.001)

In [None]:
lrbt.run_strategy('2010-1-1', '2015-12-31',
                  '2016-1-1', '2018-06-29', lags=5)

In [None]:
lrbt.plot_results()
# plt.savefig('../../images/ch05/lr_plot_8.png')

## Using Machine Learning for Market Movement Prediction

### Linear Regression with scikit-learn

In [None]:
x = np.arange(12)

In [None]:
x

In [None]:
lags = 3

In [None]:
m = np.zeros((lags + 1, len(x) - lags))

In [None]:
# Fill the feature rows first (row i is the series shifted by i steps),
# then the final row with the values the features should explain.
for i in range(lags):
    m[i, :] = x[i:i + m.shape[1]]
m[lags, :] = x[lags:]

In [None]:
from sklearn import linear_model

In [None]:
lm = linear_model.LinearRegression()

In [None]:
lm.fit(m[:lags].T, m[lags])

In [None]:
lm.coef_

In [None]:
lm.intercept_

In [None]:
lm.predict(m[:lags].T)

In [None]:
lm = linear_model.LinearRegression(fit_intercept=False)

In [None]:
lm.fit(m[:lags].T, m[lags])

In [None]:
lm.coef_

In [None]:
lm.intercept_

In [None]:
lm.predict(m[:lags].T)

### A Simple Classification Problem

In [None]:
hours = np.array([0.5, 0.75, 1., 1.25, 1.5, 1.75, 1.75, 2.,
                  2.25, 2.5, 2.75, 3., 3.25, 3.5, 4., 4.25,
                  4.5, 4.75, 5., 5.5])

In [None]:
success = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
                    0, 1, 1, 1, 1, 1, 1])

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(hours, success, 'ro')
plt.ylim(-0.2, 1.2);
# plt.savefig('../../images/ch05/ml_plot_1.png')

In [None]:
reg = np.polyfit(hours, success, deg=1)

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(hours, success, 'ro')
plt.plot(hours, np.polyval(reg, hours), 'b')
plt.ylim(-0.2, 1.2);
# plt.savefig('../../images/ch05/ml_plot_2.png')

In [None]:
lm = linear_model.LogisticRegression(solver='lbfgs')

In [None]:
# scikit-learn estimators expect a 2-D feature matrix: one row per sample,
# one column per feature.
hrs = hours.reshape(-1, 1)

In [None]:
lm.fit(hrs, success)

In [None]:
prediction = lm.predict(hrs)

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(hours, success, 'ro', label='data')
plt.plot(hours, prediction, 'b', label='prediction')
plt.legend(loc=0)
plt.ylim(-0.2, 1.2);
# plt.savefig('../../images/ch05/ml_plot_3.png')

In [None]:
prob = lm.predict_proba(hrs)

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(hours, success, 'ro')
plt.plot(hours, prediction, 'b')
plt.plot(hours, prob.T[0], 'm--',
         label='$p(h)$ for zero')
plt.plot(hours, prob.T[1], 'g-.',
         label='$p(h)$ for one')
plt.ylim(-0.2, 1.2)
plt.legend(loc=0)
# plt.savefig('../../images/ch05/ml_plot_4.png')

### Using Logistic Regression to Predict Market Direction

In [None]:
data = pd.read_csv('http://hilpisch.com/tr_eikon_eod_data.csv',
                  index_col=0, parse_dates=True).dropna()

In [None]:
data = pd.DataFrame(data['.SPX'])

In [None]:
data.rename(columns={'.SPX': 'price'}, inplace=True)

In [None]:
data['returns'] = np.log(data['price'] / data['price'].shift(1))

In [None]:
data.dropna(inplace=True)

In [None]:
lags = 3

In [None]:
# One feature column per lag: lag_k is the log return k rows earlier.
cols = ['lag_{}'.format(lag) for lag in range(1, lags + 1)]
for lag, col in enumerate(cols, start=1):
    data[col] = data['returns'].shift(lag)

In [None]:
data.dropna(inplace=True)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# A very large C makes the regularisation penalty negligible.
# `multi_class` is left at its default: 'auto' has been the default for a
# long time, and passing the argument explicitly is deprecated in recent
# scikit-learn releases.
# NOTE(review): very old scikit-learn releases defaulted to 'ovr'; confirm
# the minimum supported version before relying on the default.
lm = linear_model.LogisticRegression(C=1e6, solver='lbfgs')

In [None]:
lm.fit(data[cols], np.sign(data['returns']))

In [None]:
data['prediction'] = lm.predict(data[cols])

In [None]:
data['prediction'].value_counts()

In [None]:
# Count sign agreements over every remaining row. The NaN rows created by
# the lagging were already removed with dropna(), so slicing off another
# `lags` rows would only discard valid observations and make this count
# disagree with the accuracy score computed over the full frame.
hits = np.sign(data['returns'] *
               data['prediction']).value_counts()

In [None]:
hits

In [None]:
# accuracy_score takes (y_true, y_pred): realised direction first,
# model output second.
accuracy_score(np.sign(data['returns']),
               data['prediction'])

In [None]:
data['strategy'] = data['prediction'] * data['returns']

In [None]:
data[['returns', 'strategy']].cumsum().apply(np.exp).plot(
                                        figsize=(10, 6));
# plt.savefig('../../images/ch05/ml_plot_5.png')

In [None]:
lags = 15

In [None]:
# Rebuild the feature set with the larger lag count; lag_k is the log
# return k rows earlier.
cols = ['lag_%d' % lag for lag in range(1, lags + 1)]
for lag, col in enumerate(cols, start=1):
    data[col] = data['returns'].shift(lag)

In [None]:
data.dropna(inplace=True)

In [None]:
lm.fit(data[cols], np.sign(data['returns']))

In [None]:
data['prediction'] = lm.predict(data[cols])

In [None]:
data['prediction'].value_counts()

In [None]:
# Count sign agreements over every remaining row. The NaN rows created by
# the lagging were already removed with dropna(), so slicing off another
# `lags` rows would only discard valid observations and make this count
# disagree with the accuracy score computed over the full frame.
hits = np.sign(data['returns'] *
               data['prediction']).value_counts()

In [None]:
hits

In [None]:
# accuracy_score takes (y_true, y_pred): realised direction first,
# model output second.
accuracy_score(np.sign(data['returns']),
               data['prediction'])

In [None]:
data['strategy'] = data['prediction'] * data['returns']

In [None]:
data[['returns', 'strategy']].cumsum().apply(np.exp).plot(
                                        figsize=(10, 6));
# plt.savefig('../../images/ch05/ml_plot_6.png')

### Generalizing the Approach

In [None]:
import ScikitVectorBacktester as SCI

In [None]:
scibt = SCI.ScikitVectorBacktester('.SPX',
                                   '2010-1-1', '2018-06-29',
                                   10000, 0.0, 'logistic')

In [None]:
scibt.run_strategy('2010-1-1', '2018-06-29',
                   '2010-1-1', '2018-06-29', lags=15)

In [None]:
scibt.run_strategy('2010-1-1', '2013-12-31',
                   '2014-1-1', '2018-06-29', lags=15)

In [None]:
scibt.plot_results()
# plt.savefig('../../images/ch05/ml_plot_7.png')

In [None]:
scibt = SCI.ScikitVectorBacktester('.SPX',
                                   '2010-1-1', '2018-06-29',
                                   10000, 0.001, 'logistic')

In [None]:
scibt.run_strategy('2010-1-1', '2013-12-31',
                   '2014-1-1', '2016-10-31', lags=15)

In [None]:
scibt.plot_results()
# plt.savefig('../../images/ch05/ml_plot_8.png')

## Using Deep Learning for Market Movement Prediction

### The Simple Classification Problem Revisited

In [None]:
hours = np.array([0.5, 0.75, 1., 1.25, 1.5, 1.75, 1.75, 2.,
                  2.25, 2.5, 2.75, 3., 3.25, 3.5, 4., 4.25,
                  4.5, 4.75, 5., 5.5])

In [None]:
success = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
                    0, 1, 1, 1, 1, 1, 1]) 

In [None]:
data = pd.DataFrame({'hours': hours, 'success': success})

In [None]:
data.info()

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
model = MLPClassifier(hidden_layer_sizes=[32],
                     max_iter=1000)

In [None]:
model.fit(data['hours'].values.reshape(-1, 1), data['success'])

In [None]:
data['prediction'] = model.predict(data['hours'].values.reshape(-1, 1)) # <5>

In [None]:
data.tail()

In [None]:
data.plot(x='hours', y=['success', 'prediction'],
          style=['ro', 'b-'], ylim=[-.1, 1.1],
          figsize=(10, 6));
# plt.savefig('../../images/ch05/dl_plot_1.png')

### Using Deep Neural Networks to Predict Market Direction

In [None]:
data = pd.read_csv('http://hilpisch.com/tr_eikon_eod_data.csv',
                  index_col=0, parse_dates=True).dropna()

In [None]:
data = pd.DataFrame(data['.SPX'])

In [None]:
data.rename(columns={'.SPX': 'price'}, inplace=True)

In [None]:
data['returns'] = np.log(data['price'] /
                         data['price'].shift(1)) 

In [None]:
data['direction'] = np.where(data['returns'] > 0, 1, 0)

In [None]:
lags = 5

In [None]:
# The first row has no return (NaN), which np.where() encoded as
# direction 0. Remove it before lagging so that no feature value is
# derived from a missing observation.
data.dropna(inplace=True)
cols = []
for lag in range(1, lags + 1): # <5>
    col = 'ret_{}'.format(lag)
    # Despite the 'ret_' prefix these columns hold the lagged 0/1 direction.
    data[col] = data['direction'].shift(lag) # <6>
    cols.append(col)
data.dropna(inplace=True) # <7>
# shift() introduced NaN and thereby floats; restore integer features.
data[cols] = data[cols].astype(int)

In [None]:
print(data.round(4).tail())

In [None]:
# Fix the NumPy and TensorFlow seeds for more reproducible training runs.
np.random.seed(500)
import tensorflow as tf
# tf.logging and tf.random.set_random_seed exist only in TensorFlow 1.x.
# The compat.v1 namespace is available in both 1.x and 2.x, and
# tf.random.set_seed is preferred where it exists (2.x).
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
_set_tf_seed = getattr(tf.random, 'set_seed',
                       tf.compat.v1.set_random_seed)
_set_tf_seed(500)
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Two hidden ReLU layers of 64 units each, followed by a single sigmoid
# unit; the network takes one input per lagged feature.
model = Sequential([
    Dense(64, activation='relu', input_shape=(lags,)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'), # <5>
])
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [None]:
cutoff = '2015-1-1'

In [None]:
training_data = data[data.index < cutoff].copy()

In [None]:
test_data = data[data.index >= cutoff].copy()

In [None]:
%%time
model.fit(training_data[cols],
          training_data['direction'],
          epochs=100, verbose=False)

In [None]:
# Sequential.predict_classes() has been removed from Keras. For a model
# with a single sigmoid output it was equivalent to thresholding the
# predicted probability at 0.5.
pred = (model.predict(training_data[cols]) > 0.5).astype('int32')

In [None]:
pred[:30].flatten()

In [None]:
training_data['prediction'] = np.where(pred > 0, 1, -1)

In [None]:
training_data['strategy'] = (training_data['prediction'] *
                            training_data['returns'])

In [None]:
training_data[['returns', 'strategy']].cumsum(
                ).apply(np.exp).plot(figsize=(10, 6));
# plt.savefig('../../images/ch05/dl_plot_2.png')

In [None]:
# Sequential.predict_classes() has been removed from Keras. For a model
# with a single sigmoid output it was equivalent to thresholding the
# predicted probability at 0.5.
pred = (model.predict(test_data[cols]) > 0.5).astype('int32')

In [None]:
test_data['prediction'] = np.where(pred > 0, 1, -1)

In [None]:
test_data['strategy'] = (test_data['prediction'] *
                        test_data['returns'])

In [None]:
test_data[['returns', 'strategy']].cumsum(
                ).apply(np.exp).plot(figsize=(10, 6));
# plt.savefig('../../images/ch05/dl_plot_3.png')

### Adding Different Types of Features

In [None]:
def transform(x):
    """Map a numeric value onto one of four ordinal classes.

    Returns -2 for x < -0.0015, -1 for -0.0015 <= x < 0,
    1 for 0 <= x <= 0.0015 and 2 for x > 0.0015.

    NaN is returned unchanged. Every comparison with NaN is False, so
    without this guard a missing value would fall through to the final
    branch and be reported as class 1, and a later dropna() could no
    longer remove the affected rows.
    """
    if x != x:  # NaN is the only value that is not equal to itself
        return x
    if x < -0.0015:
        return -2
    if x < 0:
        return -1
    if x > 0.0015:
        return 2
    return 1

In [None]:
data['momentum'] = data['returns'].rolling(4).mean().shift(1)

In [None]:
data['momentum'] = data['momentum'].apply(transform)

In [None]:
data.dropna(inplace=True)

In [None]:
# Guard against re-running this cell: a duplicated feature name would
# silently widen the feature matrix and the model's input layer, which is
# sized from len(cols).
if 'momentum' not in cols:
    cols.append('momentum')

In [None]:
print(data.round(4).tail())

In [None]:
training_data = data[data.index < cutoff].copy()

In [None]:
test_data = data[data.index >= cutoff].copy()

In [None]:
# Same architecture as before (two hidden ReLU layers of 64 units and a
# sigmoid output), with the input width taken from the feature list.
model = Sequential([
    Dense(64, activation='relu', input_shape=(len(cols),)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [None]:
%%time 
model.fit(training_data[cols], training_data['direction'],
          verbose=False, epochs=100)

In [None]:
# Sequential.predict_classes() has been removed from Keras. For a model
# with a single sigmoid output it was equivalent to thresholding the
# predicted probability at 0.5.
pred = (model.predict(training_data[cols]) > 0.5).astype('int32')

In [None]:
training_data['prediction'] = np.where(pred > 0, 1, -1)

In [None]:
# Strategy log return per row: position (+1 long / -1 short) times the
# market's log return.
training_data['strategy'] = (training_data['prediction'] *
                             training_data['returns'])

In [None]:
training_data[['returns', 'strategy']].cumsum(
                ).apply(np.exp).plot(figsize=(10, 6));
# plt.savefig('../../images/ch05/dl_plot_4.png')

In [None]:
# Sequential.predict_classes() has been removed from Keras. For a model
# with a single sigmoid output it was equivalent to thresholding the
# predicted probability at 0.5.
pred = (model.predict(test_data[cols]) > 0.5).astype('int32')

In [None]:
test_data['prediction'] = np.where(pred > 0, 1, -1)

In [None]:
test_data['strategy'] = (test_data['prediction'] *
                        test_data['returns'])

In [None]:
test_data[['returns', 'strategy']].cumsum(
                ).apply(np.exp).plot(figsize=(10, 6));
# plt.savefig('../../images/ch05/dl_plot_5.png')

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:training@tpq.io">training@tpq.io</a>