### Load Data From Parquet

In [1]:
import ta

def prep_regime_filter(regime_df, roc_col_name='regime_roc', mv_col_name='regime_ma',
                       close_name='regime_close',
                       ma_period=200,
                       roc_period=45):
    """Attach rate-of-change, moving-average and close-copy columns to an index frame.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    regime_df : DataFrame
        Must provide ``close_adj`` and ``date`` columns.
        Presumably ``date`` is datetime-like and sorted, since it is used as the
        index of an offset-based rolling window -- TODO confirm.
    roc_col_name, mv_col_name, close_name : str
        Names of the three columns that are written.
    ma_period : int
        Width of the moving-average window. It is formatted as ``'<ma_period>d'``,
        so the window is an offset over the ``date`` index, not a row count.
    roc_period : int
        Look-back passed to ``ta``'s ``ROCIndicator`` as ``n``.

    Returns
    -------
    DataFrame
        ``regime_df`` itself, with the three columns added.
    """
    # Rate of change of the adjusted close.
    roc_indicator = ta.momentum.ROCIndicator(regime_df.close_adj, n=roc_period)
    regime_df.loc[:, roc_col_name] = roc_indicator.roc()

    # Rolling mean over the date index; `.values` writes it back positionally
    # so the frame's own index does not have to match the date index.
    window = '%dd' % ma_period
    close_by_date = regime_df.set_index('date').close_adj
    rolling_mean = close_by_date.rolling(window, min_periods=1).mean()
    regime_df.loc[:, mv_col_name] = rolling_mean.values

    # Plain copy of the close under a regime-specific name.
    regime_df.loc[:, close_name] = regime_df.close_adj
    return regime_df

In [2]:
import pandas as pd
import collections

# Parquet file names belonging to one index universe.
Market = collections.namedtuple('Market', 'candle_data market_index sec_index')

# Keyed by universe id; judging by the file names these look like the
# S&P 600 / 400 / 500 universes -- TODO confirm.
# Insertion order matters: it is the order in which the frames are later concatenated.
# NOTE(review): key 600 points at 'sector_index_1000.parquet' while key 400 uses
# 'sector_index_400.parquet' -- confirm the 1000 file is the intended sector index.
market_metadata = {
    600: Market('historical_pr_data_600_sec.parquet',
                'S_and_P_600_index.parquet',
                'sector_index_1000.parquet'),
    400: Market('historical_pr_data_400_sec.parquet',
                'S_and_P_400_index.parquet',
                'sector_index_400.parquet'),
    500: Market('historical_pr_data_sec.parquet',
                'S_and_P_index.parquet',
                'sector_index.parquet'),
}

# Per-symbol candle data, one frame per universe.
candle_dfs = {
    universe: pd.read_parquet(files.candle_data)
    for universe, files in market_metadata.items()
}

In [3]:
# Market-index price history, one frame per universe.
market_index_dfs = {
    universe: pd.read_parquet(files.market_index)
    for universe, files in market_metadata.items()
}

In [4]:
# Sector-index price history, one frame per universe.
sector_index_dfs = {
    universe: pd.read_parquet(files.sec_index)
    for universe, files in market_metadata.items()
}

In [5]:
# Add regime_roc / regime_ma / regime_close to every market-index frame.
# prep_regime_filter mutates and returns its argument, so each value here is
# the same object as the corresponding entry of market_index_dfs.
prep_market_regimes_dfs = {
    universe: prep_regime_filter(index_df)
    for universe, index_df in market_index_dfs.items()
}

In [6]:
# Add per-sector regime columns (sector_roc, sector_ma, sector_close) to each
# sector-index frame, processing one sector at a time.
#
# group_keys=False keeps the original row index on the result. prep_regime_filter
# returns every group with its index unchanged; older pandas silently skipped the
# group key for such like-indexed results, whereas pandas >= 2.0 would prepend
# 'sector' as an index level. That would make 'sector' both an index level and a
# column and break the later merge on ['date', 'sector'].
prep_sector_regime_dfs = {
    universe: sector_df.groupby('sector', group_keys=False).apply(
        lambda sector_group: prep_regime_filter(
            sector_group,
            roc_col_name='sector_roc',
            mv_col_name='sector_ma',
            close_name='sector_close',
        )
    )
    for universe, sector_df in sector_index_dfs.items()
}

In [7]:
# Left-join the market-regime columns (by date) and then the sector-regime
# columns (by date and sector) onto each universe's candle data.
candle_w_regime = {
    universe: (
        candles
        .merge(
            prep_market_regimes_dfs[universe][['date', 'regime_roc', 'regime_close', 'regime_ma']],
            on='date', how='left',
        )
        .merge(
            prep_sector_regime_dfs[universe][['date', 'sector', 'sector_roc', 'sector_close', 'sector_ma']],
            on=['date', 'sector'], how='left',
        )
    )
    for universe, candles in candle_dfs.items()
}

In [8]:
# Stack all universes into a single frame with a fresh 0..n-1 index.
df_norm = pd.concat(list(candle_w_regime.values()), axis=0, ignore_index=True)

In [9]:
# Keep the first row per (date, symbol) pair, then order rows by symbol and date.
df_norm = (
    df_norm
    .drop_duplicates(subset=['date', 'symbol'])
    .sort_values(['symbol', 'date'])
    .reset_index(drop=True)
)

In [10]:
import ta

def get_indicators(sym_df, mfi_period=7, roc_period=45, ma_period=200):
    """Add ``mfi``, ``roc`` and ``mv_avg`` columns to one symbol's price frame.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    sym_df : DataFrame
        Must provide ``high_adj``, ``low_adj``, ``close_adj``, ``volume`` and
        ``date`` columns.
    mfi_period : int
        Look-back passed to the money-flow-index indicator as ``n``.
    roc_period : int
        Look-back passed to the rate-of-change indicator as ``n``.
    ma_period : int
        Width of the moving-average window. It is formatted as ``'<ma_period>d'``,
        so the window is an offset over the ``date`` index, not a row count.

    Returns
    -------
    DataFrame
        ``sym_df`` itself, with the three columns added.
    """
    # NOTE(review): `ta.momentum.MFIIndicator` and the `n=` keyword look like an
    # older `ta` API -- confirm against the pinned `ta` version.
    mfi_indicator = ta.momentum.MFIIndicator(
        high=sym_df.high_adj,
        low=sym_df.low_adj,
        close=sym_df.close_adj,
        volume=sym_df.volume,
        n=mfi_period,
    )
    sym_df.loc[:, 'mfi'] = mfi_indicator.money_flow_index()

    roc_indicator = ta.momentum.ROCIndicator(sym_df.close_adj, n=roc_period)
    sym_df.loc[:, 'roc'] = roc_indicator.roc()

    # Rolling mean over the date index; `.values` writes it back positionally.
    window = '%dd' % ma_period
    close_by_date = sym_df.set_index('date').close_adj
    sym_df.loc[:, 'mv_avg'] = close_by_date.rolling(window, min_periods=1).mean().values
    return sym_df

In [11]:
# Compute the per-symbol indicators and flatten the result back to a 0..n-1 index.
df_ind = (
    df_norm
    .reset_index(drop=True)
    .groupby('symbol')
    .apply(get_indicators)
    .reset_index(drop=True)
)

In [12]:
def get_entrances(df, mfi_oversold=20, roc_change=0, regime_roc_change=0):
    """Flag rows where the MFI-oversold entry rule fires.

    The frame is modified in place and the same object is returned. Columns
    written: ``last_mfi``, ``mfi_roc``, ``last_mfi_roc``, ``mfi_oversold_enter``
    and ``entrances``.

    An entry (value 1) requires all of:
      * ``close_adj > mv_avg``
      * ``sector_close > sector_ma``
      * ``mfi <= mfi_oversold``
      * ``roc > roc_change``

    Parameters
    ----------
    df : DataFrame
        Must provide ``mfi``, ``roc``, ``close_adj``, ``mv_avg``,
        ``sector_close`` and ``sector_ma`` columns.
    mfi_oversold : number
        Upper bound on ``mfi`` for an entry.
    roc_change : number
        Strict lower bound on ``roc`` for an entry.
    regime_roc_change : number
        Currently unused. The filters that referenced it
        (``regime_roc > regime_roc_change``, ``regime_close > regime_ma``,
        ``sector_roc > regime_roc_change``) are disabled.
    """
    # One-row lagged MFI and its first difference (and that difference, lagged).
    df.loc[:, 'last_mfi'] = df['mfi'].shift(1)
    df.loc[:, 'mfi_roc'] = df['mfi'] - df['last_mfi']
    df.loc[:, 'last_mfi_roc'] = df['mfi_roc'].shift(1)

    # Individual entry conditions; comparisons against NaN evaluate to False.
    above_own_ma = df['close_adj'] > df['mv_avg']
    sector_above_ma = df['sector_close'] > df['sector_ma']
    is_oversold = df['mfi'] <= mfi_oversold
    has_momentum = df['roc'] > roc_change
    entry_mask = above_own_ma & sector_above_ma & is_oversold & has_momentum

    df.loc[:, 'mfi_oversold_enter'] = 0
    df.loc[entry_mask, 'mfi_oversold_enter'] = 1

    # Combine every entry rule (currently just one) into a single 0/1 flag.
    rule_columns = ['mfi_oversold_enter']
    rule_hits = df[rule_columns].sum(axis=1)
    df.loc[:, 'entrances'] = rule_hits.clip(upper=1)
    return df

In [13]:
def get_exits(df, mfi_overbought=80):
    """Flag rows where the MFI-overbought exit rule fires.

    The frame is modified in place and the same object is returned. Columns
    written: ``close_adj_last`` (``close_adj`` lagged by one row),
    ``mfi_overbought_exit`` and ``exits``.

    Parameters
    ----------
    df : DataFrame
        Must provide ``close_adj`` and ``mfi`` columns.
    mfi_overbought : number
        An exit (value 1) is flagged where ``mfi >= mfi_overbought``.
    """
    df.loc[:, 'close_adj_last'] = df['close_adj'].shift(1)

    overbought_mask = df['mfi'] >= mfi_overbought
    df.loc[:, 'mfi_overbought_exit'] = 0
    df.loc[overbought_mask, 'mfi_overbought_exit'] = 1

    # Combine every exit rule (currently just one) into a single 0/1 flag.
    rule_columns = ['mfi_overbought_exit']
    rule_hits = df[rule_columns].sum(axis=1)
    df.loc[:, 'exits'] = rule_hits.clip(upper=1)
    return df

In [14]:
# Per symbol: flag exits first, then entries (an entry requires roc > 7 here).
#
# group_keys=False keeps the flat row index. Both helpers return the group with
# its index unchanged; older pandas never added the group key for such results,
# whereas pandas >= 2.0 would prepend 'symbol' as an index level. That would make
# 'symbol' both an index level and a column and break the later groupby('symbol').
df_enter_exit = df_ind.groupby('symbol', group_keys=False).apply(
    lambda symbol_df: get_entrances(get_exits(symbol_df), roc_change=7)
)

In [15]:
from numba import jit
import numpy as np

def backtest_seq(df, stop_thresh=0.1, run_length=30, inv_price=10000,
                 prof_avg_offset=30, ewm_prof_offset=100):
    """Run the sequential backtest on one frame and attach the result columns.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : DataFrame
        Must provide ``entrances``, ``exits``, ``open_adj`` and ``close_adj``.
    stop_thresh, run_length, inv_price
        Forwarded unchanged to ``backtest_numba``.
    prof_avg_offset : int
        Row window for the rolling profit statistics and ``trade_count``.
    ewm_prof_offset : number
        Passed as the first positional argument of ``Series.ewm``.

    Notes
    -----
    ``next_open`` is ``open_adj`` shifted up by one row, so it is NaN on the
    last row; any profit computed from that row's ``next_open`` is therefore NaN.
    """
    # Net signal: +1 entry only, -1 exit only, 0 for neither or both.
    df.loc[:, 'enter_exit_sig'] = df.entrances - df.exits
    # The next row's open is handed to the simulator as the fill price.
    df.loc[:, 'next_open'] = df.open_adj.shift(-1)

    trade_profit, executed_signal, shares_bought = backtest_numba(
        df.enter_exit_sig.values,
        df.close_adj.values,
        df.next_open.values,
        stop_thresh,
        run_length,
        inv_price,
    )

    df.loc[:, 'profit'] = trade_profit
    df.loc[:, 'cum_profit'] = df.profit.fillna(0).cumsum()
    df.loc[:, 'purch_shares'] = shares_bought

    # Profit relative to the cost of the position; rows with zero profit
    # (which includes every row without a trade) are blanked out.
    position_cost = df.next_open * shares_bought
    df.loc[:, 'norm_profit'] = trade_profit / position_cost
    df.loc[df.profit == 0, 'norm_profit'] = np.nan

    rolling_norm = df.norm_profit.rolling(prof_avg_offset, min_periods=1)
    df.loc[:, 'avg_profit'] = rolling_norm.mean()
    df.loc[:, 'avg_profit_std'] = rolling_norm.std()
    # The exponentially weighted mean is taken before NaNs in avg_profit are zero-filled.
    df.loc[:, 'eavg_profit'] = df.avg_profit.ewm(ewm_prof_offset, ignore_na=True).mean()
    df.loc[:, 'avg_profit'] = df.avg_profit.fillna(0)

    # Executed entries/exits, both as one signed column and as two 0/1 flags.
    df.loc[:, 'actual_enter_exit'] = executed_signal
    for flag_column in ('actual_enter', 'actual_exit'):
        df.loc[:, flag_column] = 0
    df.loc[df.actual_enter_exit == 1, 'actual_enter'] = 1
    df.loc[df.actual_enter_exit == -1, 'actual_exit'] = 1

    # NOTE(review): this sums the signed +1/-1 column, so entries and exits in
    # the same window cancel out -- confirm whether a count of entries
    # (`actual_enter`) was intended for 'trade_count'.
    df.loc[:, 'trade_count'] = df.actual_enter_exit.rolling(prof_avg_offset).sum()
    return df
    

@jit(nopython=True)
def backtest_numba(enter_exit, close_price, open_price, stop_thresh,
                   run_length, inv_price):
    """Simulate a single-position, long-only strategy in one pass over the bars.

    Parameters
    ----------
    enter_exit : 1-D array
        Signal per bar: 1 requests an entry, -1 requests an exit.
    close_price : 1-D array
        Price used for position sizing, the running high and the stop check.
    open_price : 1-D array
        Price used as the fill price for both entries and exits.
    stop_thresh : float
        Exit when the close has fallen by at least this fraction from the
        highest close seen since entry.
    run_length : int
        Exit once ``index - enter_index >= run_length``.
    inv_price : number
        Cash per trade; shares bought = ``int(inv_price / close at entry)``.

    Returns
    -------
    (profit, actual_enter_exit, shares_arr) : tuple of float arrays of len(enter_exit)
        ``profit`` holds each closed trade's result at the *entry* index,
        ``actual_enter_exit`` is 1 at executed entries and -1 at executed exits,
        ``shares_arr`` holds the share count at the entry index.
    """
    n = len(enter_exit)
    actual_enter_exit = np.zeros(n)
    shares_arr = np.zeros(n)
    profit = np.zeros(n)

    in_trade = False
    top_price = 0.0
    shares = 0

    for index in range(0, n):
        signal = enter_exit[index]

        if not in_trade:
            # Flat: only an entry signal does anything.
            if signal == 1:
                enter_index = index
                enter_price = open_price[index]
                # The running high starts at the entry bar's close.
                top_price = close_price[index]
                shares = int(inv_price / top_price)
                shares_arr[index] = shares
                actual_enter_exit[index] = 1
                in_trade = True
        else:
            # In a position: update the running high before checking the exits.
            if close_price[index] > top_price:
                top_price = close_price[index]

            # Exit reasons, checked in order: exit signal, maximum holding time,
            # drop from the running high, end of data.
            if ((signal == -1)
                    or ((index - enter_index) >= run_length)
                    or (((top_price - close_price[index]) / top_price) >= stop_thresh)
                    or (index == (n - 1))):
                profit[enter_index] = (open_price[index] - enter_price) * shares
                actual_enter_exit[index] = -1
                in_trade = False

    return profit, actual_enter_exit, shares_arr

In [16]:
# Backtest every symbol independently with a 100-bar maximum holding period.
# With stop_thresh=1.0 the stop only fires if the close falls by 100% from its high.
#
# group_keys=False keeps the flat row index. backtest_seq returns the group with
# its index unchanged; older pandas never added the group key for such results,
# whereas pandas >= 2.0 would prepend 'symbol' as an index level that duplicates
# the existing 'symbol' column.
df_profits1 = df_enter_exit.groupby('symbol', group_keys=False).apply(
    lambda symbol_df: backtest_seq(symbol_df, stop_thresh=1.0, inv_price=10000, run_length=100)
)

In [17]:
import numpy as np

def get_profit_metrics(df_profits):
    """Summarise winning and losing trades as a one-row DataFrame.

    Parameters
    ----------
    df_profits : DataFrame
        Must provide ``profit`` and ``norm_profit`` columns. A row counts as a
        win when ``profit > 0`` and as a loss when ``profit < 0``; rows with
        zero or NaN profit are neither.

    Returns
    -------
    DataFrame
        A single row labelled ``'profit'`` with the columns wins, losses,
        ttl_trades, mean_win, mean_loss, mean_norm_profit_win,
        mean_norm_profit_loss, mean_norm_profit, ttl_win, ttl_loss,
        win_loss_rate, win_loss_ratio, profit_factor and net_profit.
    """
    col_name = 'profit'
    pnl = df_profits[col_name]
    norm_pnl = df_profits['norm_profit']

    is_win = pnl > 0
    is_loss = pnl < 0
    n_wins = is_win.sum()
    n_losses = is_loss.sum()

    # Insertion order here defines the column order of the result.
    stats = {
        'wins': n_wins,
        'losses': n_losses,
        'ttl_trades': n_wins + n_losses,
        'mean_win': pnl[is_win].mean(),
        'mean_loss': pnl[is_loss].mean(),
        'mean_norm_profit_win': norm_pnl[is_win].mean(),
        'mean_norm_profit_loss': norm_pnl[is_loss].mean(),
        'mean_norm_profit': norm_pnl.mean(),
        'ttl_win': pnl[is_win].sum(),
        'ttl_loss': pnl[is_loss].sum(),
    }

    # Build as one column of statistics, then flip it into a single row.
    df_win_loss = pd.DataFrame(
        {col_name: list(stats.values())},
        index=list(stats.keys()),
    ).transpose()

    # Derived ratios; losses are negative, hence the absolute values.
    df_win_loss['win_loss_rate'] = df_win_loss.wins / (df_win_loss.losses + df_win_loss.wins)
    df_win_loss['win_loss_ratio'] = df_win_loss.mean_win / df_win_loss.mean_loss.abs()
    df_win_loss['profit_factor'] = df_win_loss.ttl_win / df_win_loss.ttl_loss.abs()
    df_win_loss['net_profit'] = df_win_loss.ttl_win + df_win_loss.ttl_loss
    return df_win_loss

In [18]:
# Win/loss summary over the whole backtest output (all symbols pooled into one row).
df_win_loss = get_profit_metrics(df_profits1)

In [None]:
# Same win/loss summary, computed separately for each value of 'year'.
# NOTE(review): no visible cell creates a 'year' column, so it presumably comes
# from the candle parquet files -- confirm it exists (this cell has no execution count).
df_win_loss_year = df_profits1.groupby('year').apply(get_profit_metrics)

In [19]:
# Display the pooled win/loss summary.
df_win_loss

Unnamed: 0,wins,losses,ttl_trades,mean_win,mean_loss,mean_norm_profit_win,mean_norm_profit_loss,mean_norm_profit,ttl_win,ttl_loss,win_loss_rate,win_loss_ratio,profit_factor,net_profit
profit,11815.0,5411.0,17226.0,742.458965,-884.072472,0.074378,-0.088558,0.023196,8772153.0,-4783716.0,0.685882,0.839817,1.833753,3988437.0


In [None]:
# NOTE(review): this and the following unexecuted cells repeat the same display
# expression with no saved output -- likely leftover scratch cells; confirm before removing.
df_win_loss

In [None]:
df_win_loss

In [None]:
df_win_loss

In [None]:
df_win_loss

In [None]:
df_win_loss

In [None]:
df_win_loss

In [None]:
df_win_loss

In [None]:
df_win_loss

In [None]:
df_win_loss

In [None]:
# Display the per-year win/loss summary.
df_win_loss_year

In [None]:
df_win_loss_year

In [None]:
df_win_loss_year

In [21]:
# Persist the full backtest output (one row per symbol/date, all added columns).
# Assumes the relative 'profits/' directory already exists -- TODO confirm.
df_profits1.to_parquet('profits/MFIROC.parquet')

### Improvements
* Look for a price increase after reversion-based indicators - V1
* Weight examples in XGBoost by price. Look at training vs. testing profit - V2
* Add a Bollinger Band-based indicator - V3
* Add a regime filter - V4
* Use symbol-performance-based filtering - V5
* Look at performance across industries