1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
import sklearn
from scipy.stats import loguniform, randint, uniform
from sklearn.linear_model import Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import make_scorer

2. Define functions to load data, create features, create target, and scoring function.

In [2]:
def create_features(df):
    df['Spread'] = df['High'] - df['Low']
    df['Gap'] = df['Open'] - df['Close'].shift(1)
    df['Intraday'] = df['Open'] - df['Close']
    return df

def drop_features(df):
    df.drop(columns=['Spread',
                     'Gap',
                     'Intraday',
                     ],
            inplace=True)

    df.drop(columns=['Open','High','Low','Close','Volume','Adj Close',
                     ], inplace=True)
    return df

def process_features(df, lookback, step):
    for i in range(step, lookback+1, step):
        df['%d Spread' % (i)] = df['Spread'].pct_change(periods=i, fill_method=None)
        df['%d Rolling Avg Spread' % (i)] = df['Spread'].rolling(window=i).mean()

        df['%d Gap' % (i)] = df['Gap'].pct_change(periods=i, fill_method=None)
        df['%d Rolling Avg Gap' % (i)] = df['Gap'].rolling(window=i).mean()

        df['%d Intraday' % (i)] = df['Intraday'].pct_change(periods=i, fill_method=None)
        df['%d Rolling Avg Intraday' % (i)] = df['Intraday'].rolling(window=i).mean()
    return df

def features(df, lookback, step):
    create_features(df)
    process_features(df, lookback, step)
    drop_features(df)
    return df

def create_target(df, lookforward=2, target='Open'):
    df['Target'] = np.log(df[target].shift(periods=-lookforward)/df[target].shift(periods=-1))
    return df

def custom_score(y_true, y_pred):
  pred_sign = np.sign(y_pred)
  y_true = np.squeeze(y_true)
  returns = np.where((pred_sign == 1), y_true, 0)
  return returns.mean()

custom_scorer = make_scorer(custom_score, greater_is_better=True)

3. Define the models we are going to use

In [3]:
estimator1 = Ridge()
estimator2 = Lasso(alpha=.001)
estimator3 = KNeighborsRegressor()
models = [estimator1,estimator2,estimator3]
estimator = VotingRegressor(estimators=[('Ridge', estimator1),
                                        ('Lasso', estimator2),
                                        ('KNN', estimator3),
                                        ])

4. Define target, cross validation folds, interval, and lookback parameters.

In [4]:
lookforward = 2
tscv = TimeSeriesSplit(n_splits=5, gap=lookforward)
step = 2
lookback = 2

5. Load data

In [5]:
spy = yf.download('SPY', start='2004-01-01')
agg = yf.download('AGG', start='2004-01-01')

spy = create_target(spy, lookforward, target='Open')

spy = features(spy, lookback, step)
spy = spy.add_suffix(' SPY')
agg = features(agg, lookback, step)
agg = agg.add_suffix(' AGG')
cv = pd.merge(spy, agg, how='inner', on='Date')

cv.drop(cv.tail(lookforward).index, inplace=True)
cv.drop(cv.head(lookback).index, inplace=True)
X = cv
y = X[['Target SPY']]
X = X.drop(columns=['Target SPY'])
X.fillna(method="ffill", inplace=True)
X.replace([np.inf, -np.inf], 0, inplace=True)
X.fillna(0, inplace=True)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


  X.fillna(method="ffill", inplace=True)


6. Define parameter grid. (Look at models on sklearns docs to find which parameters of a model you can change)

In [6]:
param_grid = {
    'Ridge__alpha': loguniform(1e-5, 1e0),
    'Lasso__alpha': loguniform(1e-5, 1e0),
    'KNN__n_neighbors': randint(1,10),
}

In [12]:
X.head()

Unnamed: 0_level_0,2 Spread SPY,2 Rolling Avg Spread SPY,2 Gap SPY,2 Rolling Avg Gap SPY,2 Intraday SPY,2 Rolling Avg Intraday SPY,2 Spread AGG,2 Rolling Avg Spread AGG,2 Gap AGG,2 Rolling Avg Gap AGG,2 Intraday AGG,2 Rolling Avg Intraday AGG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-01-06,-0.499997,0.830002,0.0,0.09,-1.764713,-0.57,-0.488097,0.41,0.0,0.110001,-1.362066,-0.18
2004-01-07,0.258062,0.950001,-1.347835,-0.220001,-0.279999,-0.465,-0.23078,0.364998,-2.666582,0.149998,0.19999,-0.195
2004-01-08,-0.123282,0.905003,-2.142861,0.079998,-0.666673,-0.334999,-0.116286,0.339996,-1.720001,-0.065002,-0.285704,-0.165001
2004-01-09,0.051287,0.935005,2.437393,-0.114998,-1.814818,0.155003,0.199995,0.369995,14.801465,0.305,-0.666681,-0.105
2004-01-12,0.390609,1.060001,-0.499988,-0.194996,4.153941,-0.114998,-0.078945,0.354996,-1.0,0.395,-2.933269,0.114998


7. Split train and test data and run Random Search on train data

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.85, shuffle=False)
search = RandomizedSearchCV(estimator, param_distributions=param_grid, n_iter=100, cv=tscv, scoring=custom_scorer, n_jobs=-1, verbose=1)
search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

8. Print best parameters

In [13]:
print(search.best_params_)
print(search.best_score_)

{'KNN__n_neighbors': 3, 'Lasso__alpha': 0.0006978836225685575, 'Ridge__alpha': 2.5568940077327126e-05}
0.0003626155618374449


9. Backtest parameters

In [18]:
# !pip3 install backtesting
from backtesting import Strategy, Backtest

2.5568940077327126e-05

In [19]:
step = 2
lookback = 2

estimator1 = Ridge(alpha=search.best_params_['Ridge__alpha'])
estimator2 = Lasso(alpha=search.best_params_['Lasso__alpha'])
estimator3 = KNeighborsRegressor(n_neighbors=search.best_params_['KNN__n_neighbors'])
models = [estimator1,
          estimator2,
          estimator3,
          ]
estimator = VotingRegressor(estimators=[('Ridge', estimator1),
                                        ('Lasso', estimator2),
                                        ('KNN', estimator3),
                                        ],)

X_test = X_test.iloc[(abs(lookforward)):]
y_test = y_test.iloc[(abs(lookforward)):]

estimator.fit(X_train, y_train)
forecasted = estimator.predict(X_test)

data = yf.download('SPY', start='2004-01-01')
data.drop(data.tail(lookforward).index,inplace=True)
data.drop(data.head(lookback).index,inplace=True)
data = data.iloc[(-X_test.shape[0]):]
data['forecastedValue'] = forecasted
prediction = data

class MyStrategy(Strategy):
    Data = prediction

    def init(self):
        super().init()

    def next(self):
        if self.data.forecastedValue < 0:
            self.sell()
        elif self.data.forecastedValue > 0:
            self.buy()


bt = Backtest(prediction, MyStrategy,
              cash=1000,
              trade_on_close=False,
              exclusive_orders=True
              )
print(bt.run())

  y = column_or_1d(y, warn=True)


[*********************100%%**********************]  1 of 1 completed
Start                     2021-01-11 00:00:00
End                       2024-01-05 00:00:00
Duration                   1089 days 00:00:00
Exposure Time [%]                   99.734043
Equity Final [$]                   821.240662
Equity Peak [$]                   1130.900574
Return [%]                         -17.875934
Buy & Hold Return [%]               23.562811
Return (Ann.) [%]                   -6.386496
Volatility (Ann.) [%]               13.312282
Sharpe Ratio                              0.0
Sortino Ratio                             0.0
Calmar Ratio                              0.0
Max. Drawdown [%]                  -32.790683
Avg. Drawdown [%]                   -3.337251
Max. Drawdown Duration      794 days 00:00:00
Avg. Drawdown Duration       63 days 00:00:00
# Trades                                  750
Win Rate [%]                        52.533333
Best Trade [%]                       5.538214
Worst Trade

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['forecastedValue'] = forecasted


Surprisingly, the results are better than the previous notebook, despite that one cheating a little bit. The strategy beats buying and holding which is good. But would I run this strategy? No, but this is a good starting point.

Somethings to explore futher: Create more features, add more data sources, evaluate more models, evaluate more parameters, evaluate higher period step interval and look back periods, evaluate how many cross validation folds are optimal when taking the bias-variance trade-off into account, backtest with commission, etc.