In [3]:
## Imports


from pandas_datareader import data as pdr
import yfinance as yf
import ta
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, auc, mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

from backtesting import Backtest, Strategy
from backtesting.lib import crossover



In [4]:
## Data Collection Function

# This function uses Yahoo Finance to pull historic stock data.

# date format is 2010-12-26

def stock_data_puller(ticker, start_date, end_date):
    ticker0 = yf.Ticker(ticker)
    df = (ticker0.history(start=start_date, end=end_date))
    df.sort_index(inplace=True)
    return df    

In [5]:
spy_df = stock_data_puller('SPY', '2010-11-01', '2020-11-01')

In [6]:
spy_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-11-01,97.482841,98.039558,96.484023,97.040741,174074800,0.0,0
2010-11-02,97.769373,98.039546,97.507388,97.81031,158345900,0.0,0
2010-11-03,97.982231,98.260586,96.975225,98.203278,226702800,0.0,0
2010-11-04,99.292167,100.143618,98.21967,100.094498,215039400,0.0,0
2010-11-05,100.159989,100.634838,100.029,100.4711,180654100,0.0,0


In [7]:
## TRADING SIGNALS GENERATOR

# this function will get the trading indicators for the three basic types of trading indicators in the TA libirary.
# the first, the so called awesome indicator is an example of one where a long moving average crosses a short moving 
# average to create a trading indiactor. 

# Types of Indicators

# awesome oscillator - this on was chosen because the trading singal is when the indicator goes above or below zero.
# kaufman's moving average - this one creates two indicators, the trading signal changes when they cross over each other.
# rate of chance - this one gives of a trading indicator when the values go outside a band between 15 and -15.

# LIST OF INPUTS

# awe_osc_s_p - the short period for the awesome osciallator. the default value is 5.
# awe_osc_l_p - the long period for the awesome osciallator. the default value is 34.
# kama_n - n period for kama. the default value is 10.
# kama_sig_n - n period for kama signal. the default value is 30.
# kama_pow_1 - number of periods for the fastest EMA constant. the default value is 2.
# kama_pow_2 - number of periods for the slowest EMA constant. the default value is 30.
# roc_n - number of periods rate of change uses to compare the current price with a previous price. the default value is 12.
# roc_limit - roc gives of a signal if the value croses this value or it's negative. the default value is 15.
# nvi_short_window

# list of default values in order 5, 34, 10, 30, 2, 30, 12, 15, 10

# LIST OF OUTPUTS

# the function creates two dataframes, dataframe and ts_dataframe. ts_dataframe consists of the trading
# indicators and the return.

# awe_osc_ts - the trading signal generated when the awesome oscillator goes above or bellow zero.
# kama_ts - the trading singal generated when the two indicators cross each other.
# roc_ts - the trading signal generated when the indicator goes above or below certain values.

def ts_generator(df, awe_osc_sp, awe_osc_lp, kama_n, kama_sig_n, kama_pow_1, kama_pow_2, roc_n, roc_limit, nvi_short_win):
    
    awe_osc = ta.momentum.AwesomeOscillatorIndicator(df['High'], df['Low'], awe_osc_sp, awe_osc_lp, fillna=False)
    df['awe_osc'] = awe_osc.ao()
    df['awe_osc_ts'] = 0
    df['awe_osc_ts'][awe_osc_lp:] = np.where((df['awe_osc'][awe_osc_lp:] > 0) & (df['awe_osc'][awe_osc_lp:].shift(-1) < 0), 1, 0)
    df['awe_osc_ts'][awe_osc_lp:] = np.where((df['awe_osc'][awe_osc_lp:] < 0) & (df['awe_osc'][awe_osc_lp:].shift(-1) > 0), -1, 0)

    kama = ta.momentum.KAMAIndicator(close=df['Close'], n=kama_n, pow1=kama_pow_1, pow2=kama_pow_2, fillna=False)
    df['kama'] = kama.kama()
    kama_sig = ta.momentum.KAMAIndicator(close=df['Close'], n=kama_sig_n, pow1=kama_pow_1, pow2=kama_pow_2, fillna=False)
    df['kama_sig'] = kama_sig.kama()
    df['kama_trading_signal_0'] = 0
    df['kama_trading_signal_0'][kama_n:] = np.where(df['kama'][kama_n:] > df['kama_sig'][kama_n:], 1, 0)
    df['kama_ts'] = df['kama_trading_signal_0'].diff()
    
    roc = ta.momentum.ROCIndicator(close=df['Close'], n=roc_n, fillna=False)
    df['roc'] = roc.roc()
    df['roc_signal'] = 0
    df['roc_signal'][roc_n:] = np.where(df['roc'][roc_n:] > roc_limit, 1.0, 0.0) + np.where(df['roc'][roc_n:] < -roc_limit, 1.0, 0.0)
    df['roc_ts'] = df['roc_signal'].diff()
    
    nvi = ta.volume.NegativeVolumeIndexIndicator(close=df['Close'], volume=df['Volume'], fillna=False)
    df['nvi'] = nvi.negative_volume_index()
    df['nvi_255_ema'] = df['nvi'].ewm(span=255).mean()
    nvi_short_window = 10
    df['nvi_trading_signal_0'] = 0
    df['nvi_trading_signal_0'][nvi_short_window:] = np.where(df['nvi'][nvi_short_window:] > df['nvi_255_ema'][nvi_short_window:], 1, 0)
    df['nvi_ts'] = df['nvi_trading_signal_0'].diff()
    
    macd = ta.trend.MACD(close=df['Close'], n_slow=26, n_fast=12, n_sign=9, fillna=False)
    df['macd'] = macd.macd()
    df['macd_diff'] = macd.macd_diff()
    df['macd_sig'] = macd.macd_signal()
    short_window=10
    df['macd_trading_signal_0'] = 0
    df['macd_trading_signal_0'][short_window:] = np.where(df['macd'][short_window:] > df['macd_sig'][short_window:], 1, 0)
    df['macd_ts'] = df['macd_trading_signal_0'].diff()

    
    df['daily_return'] = df['Close'].pct_change()
    
    df['positive_return'] = np.where((df['daily_return'] > 0), 1, 0)
    
## this section begins constructing the rules of thumb. the first step is to check whether the trading signals correctly
## predicted positive or negative returns. since the return will rarely be exactly 0 hold is ignored. pass is used to prevent
## overwriting a correct positive generated by the previous line.
## ASK ABOUT THIS.
    
    df['negative_return'] = np.where((df['daily_return'] < 0), 1, 0)
    
    df['awe_osc_cor'] = np.where((df['awe_osc_ts'] == 1) & (df['positive_return'] == 1) | (df['awe_osc_ts'] == -1) & (df['negative_return'] == -1), 1, 0)
    df['kama_cor'] = np.where((df['kama_ts'] == 1) & (df['positive_return'] == 1) | (df['kama_ts'] == -1) & (df['negative_return'] == -1), 1, 0)
    df['roc_cor'] = np.where((df['roc_ts'] == 1) & (df['positive_return'] == 1) | (df['roc_ts'] == -1) & (df['negative_return'] == -1), 1, 0)

## the second step is to see how many times the signal has been correct in a given time period.

    df['awe_osc_one_wk'] = df['awe_osc_cor'].rolling(5).sum()
    df['awe_osc_two_wk'] = df['awe_osc_cor'].rolling(10).sum()
    df['awe_osc_three_wk'] = df['awe_osc_cor'].rolling(15).sum()
    df['awe_osc_four_wk'] = df['awe_osc_cor'].rolling(20).sum()
    df['kama_one_wk'] = df['kama_cor'].rolling(5).sum()
    df['kama_two_wk'] = df['kama_cor'].rolling(10).sum()
    df['kama_three_wk'] = df['kama_cor'].rolling(15).sum()
    df['kama_four_wk'] = df['kama_cor'].rolling(20).sum() 
    df['roc_one_wk'] = df['roc_cor'].rolling(5).sum()
    df['roc_two_wk'] = df['roc_cor'].rolling(10).sum()
    df['roc_three_wk'] = df['roc_cor'].rolling(15).sum()
    df['roc_four_wk'] = df['roc_cor'].rolling(20).sum()
    
# ## the next set of code takes the column with the best score over the given time period. this creates column with the
# ## names of the most successful trading signal. these can be run later and compared to the feature importances.

    df['best_one_wk'] = df[['awe_osc_one_wk', 'kama_one_wk', 'roc_one_wk']].idxmax(axis=1)
    df['best_two_wk'] = df[['awe_osc_two_wk', 'kama_two_wk', 'roc_two_wk']].idxmax(axis=1)
    df['best_three_wk'] = df[['awe_osc_three_wk', 'kama_three_wk', 'roc_three_wk']].idxmax(axis=1)
    df['best_four_wk'] = df[['awe_osc_four_wk', 'kama_four_wk', 'roc_four_wk']].idxmax(axis=1)
    
# ## this creates a column with the value from the best trading signal. ASK ABOUT SIMPLYFYING THIS OR USING A FOR LOOP.

    df.loc[df['best_one_wk'] == 'awe_osc_one_wk', 'best_one_wk_ts'] = df['awe_osc_ts']
    df.loc[df['best_two_wk'] == 'awe_osc_two_wk', 'best_two_wk_ts'] = df['awe_osc_ts']
    df.loc[df['best_three_wk'] == 'awe_osc_three_wk', 'best_three_wk_ts'] = df['awe_osc_ts']
    df.loc[df['best_four_wk'] == 'awe_osc_four_wk', 'best_four_wk_ts'] = df['awe_osc_ts']
    df.loc[df['best_one_wk'] == 'kama_one_wk', 'best_one_wk_ts'] = df['kama_ts']
    df.loc[df['best_two_wk'] == 'kama_two_wk', 'best_two_wk_ts'] = df['kama_ts']
    df.loc[df['best_three_wk'] == 'kama_three_wk', 'best_three_wk_ts'] = df['kama_ts']
    df.loc[df['best_four_wk'] == 'kama_four_wk', 'best_four_wk_ts'] = df['kama_ts']
    df.loc[df['best_one_wk'] == 'roc_one_wk', 'best_one_wk_ts'] = df['roc_ts']
    df.loc[df['best_two_wk'] == 'roc_two_wk', 'best_two_wk_ts'] = df['roc_ts']
    df.loc[df['best_three_wk'] == 'roc_three_wk', 'best_three_wk_ts'] = df['roc_ts']
    df.loc[df['best_four_wk'] == 'roc_four_wk', 'best_four_wk_ts'] = df['roc_ts']
    

## The next rule of thumb is two of the best three. the sell recommendatation has to be coded differently to
## account for the situation in which two say sell and one says buy which prevents the sum from being used. (-1 + 1 + -1)

    df['two_agree_ts'] = np.where(df['awe_osc_ts'] + df['kama_ts'] + df['roc_ts'] > 2, 1, 0)
    df.loc[(df['awe_osc_ts'] == -1) & (df['kama_ts'] == -1) | (df['awe_osc_ts'] == -1) & (df['roc_ts'] == -1) | (df['kama_ts'] == -1) & (df['roc_ts'] == -1), 'two_agree_ts'] = -1
       
    ts_df = df[['awe_osc_ts', 'kama_ts', 'roc_ts', 'nvi_ts', 'macd_ts', 'daily_return', 'positive_return','best_one_wk_ts', 'best_two_wk_ts', 'best_three_wk_ts','best_four_wk_ts', 'two_agree_ts']].copy()
    
    return ts_df.dropna()

In [8]:
ts_df = ts_generator(spy_df, 5, 34, 10, 30, 2, 30, 12, 15, 10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

In [230]:
ts_df.tail()

Unnamed: 0_level_0,awe_osc_ts,kama_ts,roc_ts,nvi_ts,macd_ts,daily_return,positive_return,best_one_wk_ts,best_two_wk_ts,best_three_wk_ts,best_four_wk_ts,two_agree_ts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-10-26,0,0.0,0.0,0.0,0.0,-0.01848,0,0.0,0.0,0.0,0.0,0
2020-10-27,0,0.0,0.0,0.0,0.0,-0.003447,0,0.0,0.0,0.0,0.0,0
2020-10-28,0,-1.0,0.0,0.0,0.0,-0.034179,0,0.0,0.0,-1.0,-1.0,0
2020-10-29,0,0.0,0.0,0.0,0.0,0.010163,1,0.0,0.0,0.0,0.0,0
2020-10-30,0,0.0,0.0,0.0,0.0,-0.010425,0,0.0,0.0,0.0,0.0,0


In [231]:
ts_df[['awe_osc_ts', 'kama_ts', 'roc_ts', 'nvi_ts', 'macd_ts']]

Unnamed: 0_level_0,awe_osc_ts,kama_ts,roc_ts,nvi_ts,macd_ts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-11-29,0,0.0,0.0,0.0,0.0
2010-11-30,0,0.0,0.0,0.0,0.0
2010-12-01,0,0.0,0.0,0.0,0.0
2010-12-02,0,0.0,0.0,0.0,0.0
2010-12-03,0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
2020-10-26,0,0.0,0.0,0.0,0.0
2020-10-27,0,0.0,0.0,0.0,0.0
2020-10-28,0,-1.0,0.0,0.0,0.0
2020-10-29,0,0.0,0.0,0.0,0.0


In [232]:
ts_indicators = pd.concat([ts_df[['awe_osc_ts', 'kama_ts', 'roc_ts', 'macd_ts', 'nvi_ts']], spy_df[['Open','High', 'Low', 'Close', 'Volume']]], axis=1)
ts_indicators = ts_indicators.dropna()

In [233]:
ts_indicators.describe()

Unnamed: 0,awe_osc_ts,kama_ts,roc_ts,macd_ts,nvi_ts,Open,High,Low,Close,Volume
count,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0
mean,-0.019608,0.0,0.0,0.0,0.0,197.276011,198.283212,196.175317,197.301791,116878900.0
std,0.138676,0.16499,0.074863,0.285771,0.0,67.124214,67.49728,66.689855,67.102837,67919110.0
min,-1.0,-1.0,-1.0,-1.0,0.0,90.469891,94.001852,89.701713,91.789177,20270000.0
25%,0.0,0.0,0.0,0.0,0.0,139.998977,141.00836,139.533816,140.335464,71214050.0
50%,0.0,0.0,0.0,0.0,0.0,187.946688,188.956603,187.178992,188.019241,99530200.0
75%,0.0,0.0,0.0,0.0,0.0,255.710611,258.109118,253.741211,255.306366,142529400.0
max,0.0,1.0,1.0,1.0,0.0,354.451136,357.319666,352.02087,356.273865,717828700.0


In [238]:
## Functions to generate X and y in the format needed to pass them to the backtesting package. (This is adapted from 
## the documentation.)

## Create a Function that pulls the data in the needed format. date format is 
## datetime.datetime(2010, 1, 1)

def stock (ticker, start, end):
    stock =  yf.download(ticker, progress=True, actions=True,start=start, end=end)
    stock = pd.DataFrame(stock)
    return stock.dropna()

In [239]:
## Random Forest

N_TRAIN = 1000


class RandomForest(Strategy):
    price_delta = .004  # 0.4%

    def init(self):        
        # Init our model, a kNN classifier
        self.clf = RandomForestClassifier(n_estimators=500, max_depth=100, random_state=0)

        # Train the classifier in advance on the first N_TRAIN examples
        df = self.data.df.iloc[:N_TRAIN]
        X, y = get_clean_Xy(ts_indicators)
        self.clf.fit(X, y)

        # Plot y for inspection
        self.I(get_y, self.data.df, name='y_true')

        # Prepare empty, all-NaN forecast indicator
        self.forecasts = self.I(lambda: np.repeat(np.nan, len(self.data)), name='forecast')
        
        
    def next(self):
        # Skip the training, in-sample data
        if len(self.data) < N_TRAIN:
            return

        # Proceed only with out-of-sample data. Prepare some variables
        high, low, close = self.data.High, self.data.Low, self.data.Close
        current_time = self.data.index[-1]

        # Forecast the next movement
        X = get_X(self.data.df.iloc[-1:])
        forecast = self.clf.predict(X)[0]

        # Update the plotted "forecast" indicator
        self.forecasts[-1] = forecast

        # If our forecast is upwards and we don't already hold a long position
        # place a long order for 20% of available account equity. Vice versa for short.
        # Also set target take-profit and stop-loss prices to be one price_delta
        # away from the current closing price.
        upper, lower = close[-1] * (1 + np.r_[1, -1]*self.price_delta)

        if forecast == 1 and not self.position.is_long:
            self.buy(size=.2, tp=upper, sl=lower)
        elif forecast == -1 and not self.position.is_short:
            self.sell(size=.2, tp=lower, sl=upper)

        # Additionally, set aggressive stop-loss on trades that have been open 
        # for more than two days
        for trade in self.trades:
            if current_time - trade.entry_time > pd.Timedelta('2 days'):
                if trade.is_long:
                    trade.sl = max(trade.sl, low)
                else:
                    trade.sl = min(trade.sl, high)


bt = Backtest(ts_indicators, RandomForest, commission=.0002, margin=.05)
bt.run()

Start                     2010-11-29 00:00:00
End                       2020-10-30 00:00:00
Duration                   3623 days 00:00:00
Exposure Time [%]                     59.1437
Equity Final [$]                      110.802
Equity Peak [$]                       10138.1
Return [%]                            -98.892
Buy & Hold Return [%]                 234.719
Return (Ann.) [%]                    -36.4945
Volatility (Ann.) [%]                 10.4763
Sharpe Ratio                                0
Sortino Ratio                               0
Calmar Ratio                                0
Max. Drawdown [%]                    -98.9071
Avg. Drawdown [%]                    -98.9071
Max. Drawdown Duration     2173 days 00:00:00
Avg. Drawdown Duration     2173 days 00:00:00
# Trades                                 1203
Win Rate [%]                          27.0158
Best Trade [%]                        1.08087
Worst Trade [%]                      -3.44149
Avg. Trade [%]                    

RuntimeError: First issue `backtest.run()` to obtain results.

In [None]:
N_TRAIN = 1000

In [9]:
## Random Forest w/XGBoost



class RandomForestXGBoost(RandomForest):
    price_delta = .004  # 0.4%

    def init(self):        
        # Init our model, a kNN classifier
        self.clf = xgb.XGBRFRegressor(random_state=0)

        # Train the classifier in advance on the first N_TRAIN examples
        df = self.data.df.iloc[:N_TRAIN]
        X, y = get_clean_Xy(df)
        self.clf.fit(X, y)

        # Plot y for inspection
        self.I(get_y, self.data.df, name='y_true')

        # Prepare empty, all-NaN forecast indicator
        self.forecasts = self.I(lambda: np.repeat(np.nan, len(self.data)), name='forecast')        
        super().next()


bt = Backtest(ts_indicators, RandomForestXGBoost, commission=.0002, margin=.05)
bt.run()

NameError: name 'RandomForest' is not defined

In [241]:
## Logistic Regression

NTRAIN = 400

class LogisticRegression(RandomForest):
    price_delta = .004  # 0.4%

    def init(self):        
        # Init our model, a kNN classifier
        self.clf = LogisticRegression()

        # Train the classifier in advance on the first N_TRAIN examples
        df = self.data.df.iloc[:N_TRAIN]
        X, y = get_clean_Xy(df)
        self.clf.fit(X, y)

        # Plot y for inspection
        self.I(get_y, self.data.df, name='y_true')

        # Prepare empty, all-NaN forecast indicator
        self.forecasts = self.I(lambda: np.repeat(np.nan, len(self.data)), name='forecast')        
        super().next()


bt = Backtest(ts_indicators, LogisticRegression, commission=.0002, margin=.05)
bt.run()

TypeError: __init__() missing 3 required positional arguments: 'broker', 'data', and 'params'

In [None]:
## Logistic Regression

N_TRAIN = 400

#N_TRAIN = len((get_clean_Xy(ts_df))) * .7

class LogisticRegression(Strategy):
    price_delta = .004  # 0.4%

    def init(self):        
        # Init our model, a kNN classifier
        
        # Init needs broker, data, params
        
        
        self.clf = LogisticRegression()

        # Train the classifier in advance on the first N_TRAIN examples
        df = self.data.df.iloc[:N_TRAIN]
        X, y = get_clean_Xy(ts_indicators)
        self.clf.fit(X, y)

        # Plot y for inspection
        self.I(get_y, self.data.df, name='y_true')

        # Prepare empty, all-NaN forecast indicator
        self.forecasts = self.I(lambda: np.repeat(np.nan, len(self.data)), name='forecast')
        
        
    def next(self):
        # Skip the training, in-sample data
        if len(self.data) < N_TRAIN:
            return

        # Proceed only with out-of-sample data. Prepare some variables
        high, low, close = self.data.High, self.data.Low, self.data.Close
        current_time = self.data.index[-1]

        # Forecast the next movement
        X = get_X(self.data.df.iloc[-1:])
        forecast = self.clf.predict(X)[0]

        # Update the plotted "forecast" indicator
        self.forecasts[-1] = forecast

        # If our forecast is upwards and we don't already hold a long position
        # place a long order for 20% of available account equity. Vice versa for short.
        # Also set target take-profit and stop-loss prices to be one price_delta
        # away from the current closing price.
        upper, lower = close[-1] * (1 + np.r_[1, -1]*self.price_delta)

        if forecast == 1 and not self.position.is_long:
            self.buy(size=.2, tp=upper, sl=lower)
        elif forecast == -1 and not self.position.is_short:
            self.sell(size=.2, tp=lower, sl=upper)

        # Additionally, set aggressive stop-loss on trades that have been open 
        # for more than two days
        for trade in self.trades:
            if current_time - trade.entry_time > pd.Timedelta('2 days'):
                if trade.is_long:
                    trade.sl = max(trade.sl, low)
                else:
                    trade.sl = min(trade.sl, high)


bt = Backtest(ts_indicators, LogisticRegression, commission=.0002, margin=.05)
bt.run()

In [None]:
class ExampleStrategy(SignalStrategy):
    def init(self):
        super().init()
        self.set_signal(sma1 > sma2, sma1 < sma2)

In [None]:
## add a few more indicators, look at a couple more stocks, 