In [1]:
import os

# Switch to the project directory so the relative paths used by later cells
# (profit_reports/, backtesting/, *.pkl, *.joblib) resolve.
# The location can be overridden with the SML_EQUITY_DIR environment variable;
# when it is unset the original hard-coded path is used, so existing behaviour
# is unchanged on the original machine.
os.chdir(os.environ.get(
    'SML_EQUITY_DIR',
    '/Users/brianmcclanahan/git_repos/StrategyPortfolioTrading/portfolios/sml_equity',
))

In [2]:
# Feature columns given to the classifier. The order is significant: it is the
# column order of the training matrix and therefore of `feature_importances_`.
feat_col = [
    'rsi',
    'roc',
    'mfi',
    'sto',
    'bba',
    'dch',
    'sector_ma_diff_norm',
    'roc_short',
    'volatility',
    'equity_curve_ratio',
    'Volume',
    'adj_close',
    'Close',
    # currently disabled:
    # 'CommonStockSharesOutstanding', 'EarningsPerShareBasic', 'MarketCap'
]

In [3]:
# Integer code per strategy name; the keys are looked up by the file-name
# prefix of each profit report and the code is stored as `strategy_ind`.
strategy_index = {name: code for code, name in enumerate(('MFI', 'STO', 'RSI'))}

In [4]:
# Per-column replacement values passed to DataFrame.fillna when the profit
# reports are loaded.
fill_na_dict = dict(rsi=100, roc=0, mfi=100, sto=100)

In [5]:
# Bundle the three configuration objects so they can be pickled together.
ml_metadata = dict(
    feat_col=feat_col,
    strategy_index=strategy_index,
    fill_na_dict=fill_na_dict,
)


In [6]:
import pickle

# Persist the feature list, strategy codes and NaN-fill values to disk
# (written relative to the current working directory).
with open('ml_metadata_mr.pkl', mode='wb') as metadata_out:
    pickle.dump(ml_metadata, file=metadata_out)

In [7]:
import os
import pandas as pd

def load_profits(fill_na_dict, strategy_index, profit_folder='profit_reports/mean_reversion',
                 feature_cols=None):
    """Load every per-strategy profit report in a folder into one DataFrame.

    Each ``*.parquet`` file in ``profit_folder`` is read, reduced to the feature
    columns plus the bookkeeping columns listed below, tagged with a
    ``strategy_ind`` column, NaN-filled, and finally stacked row-wise.

    Parameters
    ----------
    fill_na_dict : dict
        Column name -> replacement value, passed to ``DataFrame.fillna``.
    strategy_index : dict
        Strategy name -> integer code. The strategy name of a file is the part
        of its file name before the first underscore.
    profit_folder : str
        Directory that holds the parquet reports.
    feature_cols : list of str, optional
        Feature columns to keep. When omitted, the notebook-level ``feat_col``
        list is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        All reports concatenated with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``.parquet`` file.
    KeyError
        If a file's strategy prefix is missing from ``strategy_index``.
    """
    if feature_cols is None:
        # Backward-compatible default: fall back to the notebook global.
        feature_cols = feat_col
    all_cols = list(feature_cols) + ['symbol', 'date', 'norm_profit', 'actual_enter',
                                     'actual_exit', 'exits', 'volatility_short']

    # os.listdir returns names in arbitrary order; sorting makes the row order
    # (and therefore the integer index of the result) reproducible.
    profit_files = sorted(x for x in os.listdir(profit_folder) if x.endswith('.parquet'))
    if not profit_files:
        raise ValueError('No .parquet profit reports found in %r' % profit_folder)

    # Keyed by strategy prefix; if two files share a prefix the later one
    # (in sorted order) replaces the earlier one, as in the original dict build.
    frames = {}
    for file_name in profit_files:
        key = file_name.split('_')[0]
        if key not in strategy_index:
            raise KeyError('Strategy %r (from file %r) is not in strategy_index'
                           % (key, file_name))
        report = pd.read_parquet(os.path.join(profit_folder, file_name))[all_cols]
        # assign() returns a new frame, so no value is written into a column subset.
        frames[key] = report.assign(strategy_ind=strategy_index[key]).fillna(fill_na_dict)

    return pd.concat(list(frames.values()), axis=0, ignore_index=True)

In [8]:
# Combined profit reports for every strategy found in the default folder.
df = load_profits(fill_na_dict=fill_na_dict, strategy_index=strategy_index)

In [9]:
import numpy as np
import datetime as dt

def get_ml_data(feature_df, feat_col,
                train_date_range=(dt.datetime(2000, 1, 1), dt.datetime(2014, 12, 31)),
                test_date_range=(dt.datetime(2015, 1, 1), dt.datetime(2021, 1, 1))):
    """Split the entry rows of ``feature_df`` into date-based train/test sets.

    Only rows with ``actual_enter == 1`` are used. The label is 1 where
    ``norm_profit > 0`` and 0 otherwise. Train/test membership is decided by
    whether ``date`` falls inside the corresponding (inclusive) date range.

    Parameters
    ----------
    feature_df : pandas.DataFrame
        Must contain ``feat_col`` plus 'date', 'symbol', 'actual_enter',
        'norm_profit', 'actual_exit' and 'strategy_ind'.
    feat_col : list of str
        Feature column names.
    train_date_range, test_date_range : tuple
        (start, end) bounds handed to ``Series.between``.

    Returns
    -------
    dict
        'train_x'/'test_x' (features, sorted by date), 'train_y'/'test_y'
        (labels), 'train_aux'/'test_aux' (bookkeeping columns), 'all_data' and
        'all_aux' (all entry rows, original order), and 'ml_cols' (``feat_col``).

    Notes
    -----
    'norm_profit' is selected twice below because it is already part of
    ``aux_cols``. As a result the labels ('train_y', 'test_y') are two-column
    DataFrames with identical columns, and the aux frames carry 'norm_profit'
    twice. NOTE(review): this looks unintentional, but other cells in this
    notebook index ``predict_proba(...)[0]`` and ``np.vstack`` the labels, which
    depends on the two-column shape -- change them together.
    """
    aux_cols = ['date', 'symbol', 'actual_enter', 'norm_profit', 'actual_exit', 'strategy_ind']
    entries = feature_df.loc[feature_df.actual_enter == 1]

    # Date-sorted working copy (carries the duplicated 'norm_profit' column).
    ordered = entries.loc[:, feat_col + aux_cols + ['norm_profit']].sort_values('date')
    target = (ordered['norm_profit'] > 0).astype(int)

    # Positional boolean masks aligned with the rows of `ordered`.
    in_train = ordered['date'].between(*train_date_range).values
    in_test = ordered['date'].between(*test_date_range).values

    feature_matrix = ordered[feat_col]

    return {
        'train_x': feature_matrix.loc[in_train],
        'train_y': target.loc[in_train],
        'test_x': feature_matrix.loc[in_test],
        'test_y': target.loc[in_test],
        'train_aux': ordered.loc[in_train, aux_cols],
        'test_aux': ordered.loc[in_test, aux_cols],
        'all_data': entries.loc[:, feat_col + aux_cols],
        'all_aux': entries.loc[:, aux_cols],
        'ml_cols': feat_col,
    }

In [10]:
# Train/test split with the default date ranges of get_ml_data.
data_dict = get_ml_data(feature_df=df, feat_col=feat_col)

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Fit on the training split only and show the training error rate (1 - accuracy).
# random_state pins the forest's random sampling so that re-running the cell
# reproduces the same model and the same numbers.
# Options left disabled in the original cell: class_weight='balanced' and
# sample_weight=np.abs(data_dict['train_aux']['norm_profit'].values).
train_x_filled = data_dict['train_x'].fillna(0)  # remaining NaNs become 0
rfc = RandomForestClassifier(n_estimators=200, criterion='gini', max_depth=5, random_state=0)
rfc = rfc.fit(train_x_filled, data_dict['train_y'])
1 - rfc.score(train_x_filled, data_dict['train_y'])  # earlier recorded value: 0.2948670645686381

0.35912098684747973

In [12]:
# Error rate (1 - accuracy) of the fitted forest on the test split.
1 - rfc.score(X=data_dict['test_x'].fillna(0), y=data_dict['test_y'])  # 0.3261664800298619

0.43853294091011996

In [None]:
# Feature names ordered from lowest to highest importance, together with the
# number of importance values.
(np.asarray(feat_col)[rfc.feature_importances_.argsort()], rfc.feature_importances_.shape[0])

In [None]:
# Feature names ordered from lowest to highest importance.
np.take(np.asarray(feat_col), np.argsort(rfc.feature_importances_))

### Don't run the cell below if testing: it refits the model on the train and test data combined

In [13]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Refit the forest on the train and test splits combined, then show the error
# rate (1 - accuracy) on that same combined data.
# random_state pins the forest's random sampling so the saved model is reproducible.
rfc = RandomForestClassifier(n_estimators=200, criterion='gini', max_depth=5, random_state=0)

# pd.concat (rather than np.vstack) keeps the column names, so the model is
# fitted with the same feature names it is later asked to predict on, and it
# stacks the labels correctly whether they are one- or two-dimensional.
all_data = pd.concat((data_dict['train_x'], data_dict['test_x']), axis=0).fillna(0)
all_labels = pd.concat((data_dict['train_y'], data_dict['test_y']), axis=0).fillna(0)
rfc = rfc.fit(all_data, all_labels)
1 - rfc.score(all_data, all_labels)

0.3930486358244365

In [14]:
# Score every entry row (train, test and anything outside both date ranges).
all_data = data_dict['all_data']
all_data_entries = all_data.loc[:, data_dict['ml_cols']]
class_proba = rfc.predict_proba(all_data_entries.fillna(0))
# With multi-output labels scikit-learn returns a list with one probability
# array per output; with a single output it returns the array directly.
# Handle both so that `[0]` never silently selects the first sample's row.
if isinstance(class_proba, list):
    class_proba = class_proba[0]
# Column 1 is the probability of the second class in rfc.classes_.
all_preds = class_proba[:, 1]

In [15]:
import pandas as pd

# Evaluation frame: all entry rows, the model score in `preds`, then the aux
# columns (minus 'norm_profit' and 'date') appended on the right.
eval_tr = data_dict['all_data'].copy()
eval_tr.loc[:, 'preds'] = all_preds

# Drop into a local frame instead of overwriting data_dict['all_aux'], so that
# re-running this cell does not fail with a KeyError on the already-removed columns.
aux_extra = data_dict['all_aux'].drop(['norm_profit', 'date'], axis=1)

# NOTE(review): the remaining aux columns are also present in 'all_data', so
# they appear twice in eval_tr; kept as in the original cell.
eval_tr = pd.concat((eval_tr, aux_extra), axis=1)

In [16]:
def make_backtest_df(pred_df, feature_df):
    """Attach prediction scores to the full feature frame.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Frame with a ``preds`` column; its index labels identify the rows of
        ``feature_df`` that the scores belong to.
    feature_df : pandas.DataFrame
        Frame to receive the scores. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``feature_df`` with a ``pred_score`` column (NaN for rows that
        have no prediction) and a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    scored = feature_df.copy()
    # Scores are matched to rows by index label, in the order given by pred_df.
    scored.loc[pred_df.index, 'pred_score'] = pred_df['preds'].values
    return scored.reset_index(drop=True)

In [17]:
# Full profit frame with the model score attached as `pred_score`.
backtest_df = make_backtest_df(pred_df=eval_tr, feature_df=df)

In [18]:
# Keep rows dated 2002-01-01 or later.
time_df = backtest_df[backtest_df['date'].ge(dt.datetime(2002, 1, 1))]

# Rows flagged as an entry (actual_enter == 1) or an exit (exits == 1) -> signals file.
(
    time_df
    .loc[lambda frame: frame['actual_enter'].eq(1) | frame['exits'].eq(1)]
    .to_parquet('backtesting/mean_reversion/small_mid_lrg_cap_signals.parquet')
)

# Unique dates in ascending order -> date-range file.
(
    time_df[['date']]
    .drop_duplicates()
    .sort_values('date')
    .to_parquet('backtesting/mean_reversion/small_mid_lrg_cap_dt_rng.parquet')
)

In [19]:
from joblib import dump, load

# Save the fitted forest next to the metadata pickle (relative to the working directory).
dump(value=rfc, filename='rfc_mr.joblib')

['rfc_mr.joblib']

In [None]:
import plotly.graph_objects as go

# Rows with a known norm_profit dated after 2011-01-01.
test_df = eval_tr.loc[
    lambda frame: frame['norm_profit'].notnull() & (frame['date'] > dt.datetime(2011, 1, 1))
]

# Scatter of model score (x) against normalised profit (y).
fig = go.Figure(
    data=[go.Scattergl(x=test_df['preds'], y=test_df['norm_profit'], mode='markers')]
)
fig

In [None]:
# Display the evaluation frame (features, aux columns and `preds`) for inspection.
eval_tr