Results visualizer

In [None]:
# Standard Imports
from IPython.display import display
import numpy as np
import pandas as pd
import pickle
import time
import datetime
import itertools
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib as mpl
from sklearn import metrics
from sklearn import linear_model as lm
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from joblib import Parallel, delayed
from mpl_toolkits.mplot3d import Axes3D

# Import Custom Utilities
import thesis_utils as utils
import analysis_utils_3day as au

# ---------------------------------------------------------------------------
# Master parameters
# ---------------------------------------------------------------------------
FIRST_TRADE_STRINGS = [f'{year}-01-01' for year in range(1993, 2023)]
KS = [5, 10, 50, 100]
NUM_TREES = 10
MAX_FEATURES = 6
MAX_DEPTH = 20
# Lags 1..20 followed by 40, 60, ..., 240.
LAGS = np.concatenate((np.arange(1, 21), 20 * np.arange(2, 13)))
TRAIN_DAYS = 504
TRADE_DAYS = 252
T_COST = 0.001  # half-turn t-cost (need to scale returns if we are doing other analysis)

# ---------------------------------------------------------------------------
# Paths: cached inputs are named after DATA_PATHNAME, figures are written
# under RESULTS_PATHNAME.
# ---------------------------------------------------------------------------
VERSION = '3day'
DATA_PATHNAME = f'Adroit_results_v{VERSION}'
RESULTS_PATHNAME = f'Analysis_v{VERSION}'

# ---------------------------------------------------------------------------
# Reference data, each frame indexed by a DatetimeIndex
# ---------------------------------------------------------------------------
# S&P 500
spx_df = pd.read_csv('Data/sp500_1990_2022.csv').set_index('caldt')
spx_df.index = pd.to_datetime(spx_df.index)

# VIX (index renamed from 'DATE' to 'date')
vix_df = pd.read_csv('Data/VIX_History.csv').set_index('DATE')
vix_df.index.name = 'date'
vix_df.index = pd.to_datetime(vix_df.index)

# Fama-French factors; MKT_Actual adds rf back onto MKT
ff_df = pd.read_csv('Data/fama_french.csv').set_index('date')
ff_df.index = pd.to_datetime(ff_df.index)
ff_df['MKT_Actual'] = ff_df['MKT'] + ff_df['rf']

# ---------------------------------------------------------------------------
# Feature labels
# ---------------------------------------------------------------------------
extra_factors_list = ['VIX', 'MKT', 'SMB', 'HML', 'RMW', 'CMA']
ind_names_list = [
    'IND_BusEq', 'IND_Chems', 'IND_Durbl', 'IND_Enrgy', 'IND_Fin', 'IND_Hlth',
    'IND_Manuf', 'IND_NoDur', 'IND_Other', 'IND_Shops', 'IND_Telcm', 'IND_Utils',
]
lag_labels = [f"R({i})" for i in LAGS]  # one label per entry of LAGS
labels_extra = lag_labels + extra_factors_list + ind_names_list

# Full analysis window
period_start = pd.to_datetime('1993-01-01')
period_end = pd.to_datetime('2022-12-31')

# ---------------------------------------------------------------------------
# Plot styling: apply the base style first, then override individual rcParams
# ---------------------------------------------------------------------------
plt.style.use('seaborn-v0_8-whitegrid')
mpl.rcParams.update({
    'legend.facecolor': 'white',
    'legend.frameon': True,
    'legend.framealpha': 0.8,
    'lines.linewidth': 1.5,
    'font.family': 'Latin Modern Math',
    'font.size': 12,
    'axes.unicode_minus': False,
})

Get data

In [None]:
# Each section below loads a cache file; the commented-out pair of lines above
# each load shows the au.* call and pickle.dump that write that same file.
# NOTE: unpickling can execute arbitrary code -- only load cache files that
# were generated locally by this project.

# vif_combined, vif_combined_extra = au.rf_imp_aggregator(1993, 2022, lag_labels, labels_extra, DATA_PATHNAME, VERSION)
# pickle.dump((vif_combined, vif_combined_extra), open(f"vifs_{DATA_PATHNAME}.p", "wb" ))
with open(f"vifs_{DATA_PATHNAME}.p", 'rb') as fh:
    vif_combined, vif_combined_extra = pickle.load(fh)

# accuracies, accuracies_extra, accs_ts_list, accs_extra_ts_list = au.extract_pred_accuracies(1993, 2022, DATA_PATHNAME, VERSION)
# pickle.dump((accuracies, accuracies_extra,  accs_ts_list, accs_extra_ts_list), open(f"accs_{DATA_PATHNAME}.p", "wb" ))
with open(f"accs_{DATA_PATHNAME}.p", 'rb') as fh:
    accuracies, accuracies_extra, accs_ts_list, accs_extra_ts_list = pickle.load(fh)

# a, b, c, d = au.extract_returns(1993, 2022, DATA_PATHNAME, VERSION)
# pickle.dump((a, b, c, d), open(f"return_series_{DATA_PATHNAME}.p", "wb" ))
with open(f'return_series_{DATA_PATHNAME}.p', 'rb') as fh:
    raw_rets_df, t_cost_rets_df, raw_rets_extra_df, t_cost_rets_extra_df = pickle.load(fh)

# Keep every 3rd row of each return frame (step of 3, matching the '3day' VERSION).
raw_rets_df = raw_rets_df.iloc[::3, :]
t_cost_rets_df = t_cost_rets_df.iloc[::3, :]
raw_rets_extra_df = raw_rets_extra_df.iloc[::3, :]
t_cost_rets_extra_df = t_cost_rets_extra_df.iloc[::3, :]
# Annualisation factor used by later cells (as an exponent and under a sqrt);
# presumably the number of 3-row holding periods in a 252-row year -- confirm.
F = 252/3

# Sub-period label -> (first date, last date), both inclusive when used with .loc
SUB_PERIODS = {'01/93-12/00':(pd.to_datetime('1993-01-01'), pd.to_datetime('2000-12-31')),
           '01/01-08/08':(pd.to_datetime('2001-01-01'), pd.to_datetime('2008-08-31')),
           '09/08-12/09':(pd.to_datetime('2008-09-01'), pd.to_datetime('2009-12-31')),
           '01/10-12/15':(pd.to_datetime('2010-01-01'), pd.to_datetime('2015-12-31')),
           '01/16-12/19':(pd.to_datetime('2016-01-01'), pd.to_datetime('2019-12-31')),
           '01/20-12/22':(pd.to_datetime('2020-01-01'), pd.to_datetime('2022-12-31'))}

# Compound the factor returns over a trailing 3-row window, then keep only the
# dates present in the sliced strategy returns.
# NOTE: this rebinds ff_df, so re-running the cell without reloading ff_df
# compounds the already-compounded series again.
# ff_df = ff_df.loc[raw_rets_df.index]
ff_df_cum = (1+ff_df).rolling(window=3).agg(lambda x: x.prod())-1
ff_df = ff_df_cum.loc[raw_rets_df.index]
           

Summary for entire period

In [None]:
# Mean per-period K=10 return of each strategy variant, before and after
# transaction costs, together with its annualised equivalent.
summary = {
    'LR (Before T.C.)': raw_rets_df[['K=10']].mean(),
    'LR+F (Before T.C.)': raw_rets_extra_df[['K=10']].mean(),
    'LR (After T.C.)': t_cost_rets_df[['K=10']].mean(),
    'LR+F (After T.C.)': t_cost_rets_extra_df[['K=10']].mean(),
}
# summary.update({'MKT': ff_df['MKT_Actual'][raw_rets_df.index].mean()})
sum_df = pd.DataFrame.from_dict(summary).transpose()
sum_df['Annual'] = (1 + sum_df['K=10']) ** F - 1  # compound the per-period mean F times
for col in sum_df:
    sum_df[col] = sum_df[col].map('{:,.2%}'.format).astype(str)

# Append " \hline" after every LaTeX row terminator. Both backslashes are
# escaped explicitly: a bare "\h" is an invalid escape sequence and triggers a
# SyntaxWarning on recent Python versions. The resulting text is unchanged.
sum_df_tex = sum_df.to_latex(multirow=True).replace("\\\n", "\\ \\hline\n")
print(sum_df_tex)

### subperiods

Sub-period returns

In [None]:
def _cum_log_return(returns, start, end):
    """Return the cumulative sum of log(1 + r) for ``returns`` within [start, end]."""
    return np.cumsum(np.log(returns.loc[start:end] + 1))


# One figure per sub-period: left panel before transaction costs, right panel
# after. Each panel shows LR, LR+F and the market (ff_df['MKT_Actual']).
for i, (key, (start, end)) in enumerate(SUB_PERIODS.items()):
    fig, ax = plt.subplots(1, 2, figsize=(9, 4))
    fig.suptitle(f'{key} Sub-period Returns', fontsize='x-large')

    mkt_ts = _cum_log_return(ff_df['MKT_Actual'], start, end)
    panels = (
        ('Before T.C.', raw_rets_df, raw_rets_extra_df),
        ('After T.C.', t_cost_rets_df, t_cost_rets_extra_df),
    )
    for axis, (title, lr_df, lrf_df) in zip(ax, panels):
        # Plot order fixes the colour assigned to each curve; keep LR, LR+F, MKT.
        _cum_log_return(lr_df['K=10'], start, end).plot(ax=axis, label='LR')
        _cum_log_return(lrf_df['K=10'], start, end).plot(ax=axis, label='LR+F')
        mkt_ts.plot(ax=axis, label='MKT')

        axis.legend()
        axis.set_title(title)
        axis.set_ylabel('Cum. Log Return')
        axis.set_xlabel(None)

    fig.tight_layout()
    # Files are numbered 1..N in SUB_PERIODS order.
    fig.savefig(f'{RESULTS_PATHNAME}/{i+1}summary.png', dpi=300)



In [None]:
# Cumulative log returns after transaction costs over the whole sample,
# compared with the market series.
fig, ax = plt.subplots(figsize=(10, 4))
fig.suptitle('3-day Holding Period Returns, 1993-2022', fontsize='x-large')

lr_ts = np.cumsum(np.log(t_cost_rets_df['K=10'] + 1))
lrf_ts = np.cumsum(np.log(t_cost_rets_extra_df['K=10'] + 1))
spx_ts = np.cumsum(np.log(ff_df['MKT_Actual'].loc[period_start:period_end] + 1))

# Plot order determines the colour of each curve.
for curve, label in ((lr_ts, 'LR'), (lrf_ts, 'LR+F'), (spx_ts, 'MKT')):
    curve.plot(ax=ax, label=label)

ax.set_xlabel('Time')
ax.set_ylabel('Cum. Log Return')
ax.legend()

fig.tight_layout()
fig.savefig(f'{RESULTS_PATHNAME}/whole_subperiod_summary.png', dpi=300)

Sub period Analysis

In [None]:

# Expose the market return under the strategies' 'K=10' column name so it can
# go through au.get_period_results like the model frames (presumably that
# helper reads the 'K=10' column -- confirm in analysis_utils_3day).
# rename() returns a new frame, so ff_df itself is left untouched.
spx_dirty = ff_df[['MKT_Actual']].rename(columns={'MKT_Actual': 'K=10'})

models = {
    'LR (Before T.C.)': raw_rets_df,
    'LR+F (Before T.C.)': raw_rets_extra_df,
    'LR (After T.C.)': t_cost_rets_df,
    'LR+F (After T.C.)': t_cost_rets_extra_df,
    'MKT': spx_dirty,
}
cols_to_pct = ['Mean', 'Std Dev', 'MDD']

# Print one LaTeX table per sub-period, one row block per model.
for idx, (key, val) in enumerate(SUB_PERIODS.items()):
    print(f'{idx+1}__{key}')
    l_df = pd.concat(
        [au.get_period_results(model_df, name, val) for name, model_df in models.items()]
    )
    l_df['Sharpe'] = l_df['Sharpe'].round(2)
    for col in cols_to_pct:
        l_df[col] = l_df[col].map('{:.2%}'.format).astype(str)

    # Round whatever numeric columns remain.
    l_df = l_df.round(3)
    # "\\hline" is escaped explicitly; a bare "\h" is an invalid escape
    # sequence. The emitted text is unchanged.
    l_df_tex = l_df.to_latex().replace("\\\n", "\\ \\hline\n")
    print(l_df_tex)


T-costs

In [None]:
# Per-period transaction cost = return before costs minus return after costs.
t_costs = (raw_rets_df - t_cost_rets_df)['K=10']
t_costs_extra = (raw_rets_extra_df - t_cost_rets_extra_df)['K=10']

# Mean per-period cost of the LR strategy in each sub-period.
t_cost_list = {key: t_costs.loc[start:end].mean() for key, (start, end) in SUB_PERIODS.items()}
t_cost_df = pd.DataFrame.from_dict(t_cost_list, orient='index')
t_cost_df.columns = ['LR']

# Same for the LR+F strategy.
t_costs_extra_list = {key: t_costs_extra.loc[start:end].mean() for key, (start, end) in SUB_PERIODS.items()}
t_cost_extra_df = pd.DataFrame.from_dict(t_costs_extra_list, orient='index')
t_cost_extra_df.columns = ['LR+F']

# Scale the per-period means by F (simple, non-compounded annualisation).
res = pd.concat([t_cost_df, t_cost_extra_df], axis=1) * F
for col in res:
    res[col] = res[col].map('{:.2%}'.format).astype(str)

# "\\hline" is escaped explicitly; a bare "\h" is an invalid escape sequence.
res_tex = res.to_latex().replace("\\\n", "\\ \\hline\n")
print(res_tex)
res


In [None]:
# Turnover per period for both strategies, coloured by sub-period, with a black
# horizontal line at each sub-period's mean.
# NOTE(review): the title says "Weekly" but the return frames are sliced every
# 3rd row in the data-loading cell -- confirm the intended wording.
fig, ax = plt.subplots(2, 1, figsize=(10, 5))
fig.suptitle('Weekly Portfolio Turnover', fontsize='x-large')

# Cost charged per unit of turnover; presumably 2 * T_COST (a full round trip)
# -- confirm. Dividing a period's cost by it yields the turnover rate.
COST_PER_UNIT_TURNOVER = .002

t_costs = (raw_rets_df - t_cost_rets_df)['K=10']
t_costs_extra = (raw_rets_extra_df - t_cost_rets_extra_df)['K=10']

for key, (start, end) in SUB_PERIODS.items():
    for axis, costs in ((ax[0], t_costs), (ax[1], t_costs_extra)):
        costs_cut = costs.loc[start:end]
        costs_cut.div(COST_PER_UNIT_TURNOVER).plot(ax=axis, label=f'{key}')
        axis.hlines(costs_cut.mean() / COST_PER_UNIT_TURNOVER, start, end, color='black', linewidth=1.5)

for axis, title in zip(ax, ('LR Model', 'LR+F Model')):
    axis.set_title(title)
    axis.legend(bbox_to_anchor=(1.02, 1.05))
    axis.set_ylabel('Turnover Rate')
    axis.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

fig.tight_layout()
# NOTE(review): another cell in this notebook saves a different figure to this
# same filename, so whichever runs last overwrites the other.
fig.savefig(f'{RESULTS_PATHNAME}/turnover_periods_new.png', dpi=300)
print(t_costs.mean() / COST_PER_UNIT_TURNOVER, t_costs_extra.mean() / COST_PER_UNIT_TURNOVER)

### Prediction accuracy

In [None]:
# Step plot of the K=10 directional prediction accuracy of both models.
f_date = pd.to_datetime('2023-01-01')
k_keys = ['K=5', 'K=10', 'K=50', 'K=100']

fig, ax = plt.subplots(figsize=(10, 5))
fig.suptitle('Annual Directional Prediction Accuracy', fontsize='x-large')

# Work on copies and repeat the last row at f_date, so the final value of the
# 'steps-post' line is drawn as a step up to f_date instead of a single point.
acc_temp = accuracies.copy()
acc_extra_temp = accuracies_extra.copy()
acc_temp.loc[f_date] = accuracies.iloc[-1]
acc_extra_temp.loc[f_date] = accuracies_extra.iloc[-1]

for frame, label in ((acc_temp, 'LR Model'), (acc_extra_temp, 'LR+F Model')):
    ax.plot(frame['K=10'], label=label, drawstyle='steps-post')

ax.legend()
ax.set_xlabel('Time')
ax.set_ylabel('Prediction Accuracy')
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
ax.set_xlim(acc_temp.index[0], acc_temp.index[-1])

fig.tight_layout()
fig.savefig(f'{RESULTS_PATHNAME}/pred_acc_10k.png', dpi=300)

# print(acc_temp['K=10'])


In [None]:
def _plot_accuracy_decay(axis, acc_ts_list, title):
    """Plot the mean K=10 accuracy per row position across simulations, plus a linear fit.

    ``acc_ts_list`` is a list of per-simulation accuracy frames. Every third row
    of each is kept and re-indexed from 0, so that row i of every simulation is
    averaged together.
    """
    cuts = [x.iloc[::3, :].reset_index(drop=True)['K=10'] for x in acc_ts_list]
    wide = pd.concat(cuts, axis=1, ignore_index=True)
    wide_avg = wide.mean(axis=1)
    sems = wide.sem(axis=1)

    # Fit on this model's own non-missing averages. The fit uses x = row + 1,
    # while the error bars below are drawn at the 0-based row position.
    observed = wide_avg.dropna()
    xvals = (observed.index.to_numpy() + 1).reshape(-1, 1)
    regr = lm.LinearRegression()
    regr.fit(xvals, observed.values)
    fit_pred = regr.predict(xvals)

    # Error bars span +/- 0.67 standard errors.
    axis.errorbar(sems.index, wide_avg, yerr=0.67 * sems, marker='.', linewidth=0,
                  elinewidth=1, capsize=2, ecolor='lightskyblue',
                  label='Average Over Simulations')
    axis.plot(xvals, fit_pred, label='Linear Fit', linewidth=2)
    axis.set_ylabel('Prediction Accuracy')
    # NOTE(review): each x step is one kept row (every 3rd row); confirm that
    # "Week" is the intended unit.
    axis.set_xlabel('Week of Simulation')
    axis.set_title(title, fontsize='large')
    axis.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    axis.legend()


fig, ax = plt.subplots(1, 2, figsize=(10, 5), sharex=True)
fig.suptitle('Prediction Accuracy Over Trading Period', fontsize='x-large')

# Each panel now uses x values derived from its own data; previously the LR+F
# fit line was drawn against the LR model's x values.
_plot_accuracy_decay(ax[0], accs_ts_list, 'LR Model')
_plot_accuracy_decay(ax[1], accs_extra_ts_list, 'LR+F Model')

fig.tight_layout()
fig.savefig(f'{RESULTS_PATHNAME}/pred_acc_decay_NEW.png', dpi=300)

### Vol relations

In [None]:
def _corr_with_vix(series, start, end):
    """Return the correlation between ``series`` and the VIX close within [start, end]."""
    pair = pd.concat([series.loc[start:end], vix_df['CLOSE'].loc[start:end]], axis=1)
    return pair.corr().iloc[1, 0]


# Stack the per-simulation accuracy frames into one K=10 series per model.
accs_ts = pd.concat(accs_ts_list)['K=10']
accs_extra_ts = pd.concat(accs_extra_ts_list)['K=10']

lr_rets = raw_rets_df['K=10']
lrf_rets = raw_rets_extra_df['K=10']

li = []   # return correlations only
li2 = []  # accuracy + return correlations, grouped by model
for key, (start, end) in SUB_PERIODS.items():
    lr = _corr_with_vix(lr_rets, start, end)
    lr_abs = _corr_with_vix(np.abs(lr_rets), start, end)
    lrf = _corr_with_vix(lrf_rets, start, end)
    lrf_abs = _corr_with_vix(np.abs(lrf_rets), start, end)
    acc_LR = _corr_with_vix(accs_ts, start, end)
    acc_LRF = _corr_with_vix(accs_extra_ts, start, end)

    li.append({'LR Returns': lr, 'LR+F Returns': lrf, '|LR Returns|': lr_abs, '|LR+F Returns|': lrf_abs})
    # Column order must match the MultiIndex assigned to d2 below.
    li2.append({'LR Accuracy': acc_LR, 'LR Returns': lr, '|LR Returns|': lr_abs,
                'LR+F Accuracy': acc_LRF, 'LR+F Returns': lrf, '|LR+F Returns|': lrf_abs})

d = pd.DataFrame.from_dict(li)
d.index = SUB_PERIODS.keys()
d = d.round(3)

d2 = pd.DataFrame.from_dict(li2)
d2.index = SUB_PERIODS.keys()
d2 = d2.round(2)
d2.columns = pd.MultiIndex.from_product([["LR", "LR+F"], ['Acc.', 'Ret.', 'Abs. Ret.']])

# "\\hline" is escaped explicitly; a bare "\h" is an invalid escape sequence.
print(d2.to_latex(multirow=True, multicolumn=True).replace("\\\n", "\\ \\hline\n"))

In [None]:
def _plot_turnover_vs_vix(axis, costs, title, vix_grid):
    """Scatter turnover against the VIX close on ``axis`` and overlay a linear fit.

    Returns the joined (cost, VIX close) frame restricted to
    [period_start, period_end] with incomplete rows dropped.
    """
    frame = pd.concat(
        [costs.loc[period_start:period_end], vix_df['CLOSE'].loc[period_start:period_end]],
        axis=1,
    ).dropna()
    # Cost divided by .002 is plotted as the turnover rate.
    axis.scatter(frame['CLOSE'], frame['K=10'] / .002, s=3)
    axis.set_xlabel('VIX')
    axis.set_ylabel('Turnover Rate')
    axis.set_title(title)
    axis.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

    fit = lm.LinearRegression()
    fit.fit(frame['CLOSE'].values.reshape(-1, 1), frame['K=10'].values.reshape(-1, 1) / .002)
    axis.plot(vix_grid, fit.predict(vix_grid), color='orange', linewidth=2)
    return frame


t_costs = (raw_rets_df - t_cost_rets_df)['K=10']
t_costs_extra = (raw_rets_extra_df - t_cost_rets_extra_df)['K=10']

fig, ax = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle('Market Volatility and Weekly Portfolio Turnover', fontsize='x-large')

# VIX values 10..83 at which the fitted line is evaluated.
xrange = np.arange(10, 84).reshape(-1, 1)

test_df = _plot_turnover_vs_vix(ax[0], t_costs, 'LR Model', xrange)
# The right panel shows the LR+F strategy; it was previously titled 'LR Model'.
test_df2 = _plot_turnover_vs_vix(ax[1], t_costs_extra, 'LR+F Model', xrange)

fig.tight_layout()
fig.savefig(f'{RESULTS_PATHNAME}/turnover_vol.png', dpi=300)
# Printed once; the earlier duplicate print of test_df.corr() was dropped.
print(test_df.corr(), test_df2.corr())

### VIF

In [None]:
# Bar charts of each variable's median importance (median across the columns
# of the importance frames), sorted from most to least important.
# NOTE(review): the output file is named Mean_VIF.png although medians are plotted.
fig, ax = plt.subplots(2, 1, figsize=(10, 6))
fig.suptitle('Random Forest Variable Importance', fontsize='x-large')

forest_importances = pd.Series(
    vif_combined.median(axis=1), index=lag_labels
).sort_values(ascending=False)
forest_importances_extra = pd.Series(
    vif_combined_extra.median(axis=1), index=labels_extra
).sort_values(ascending=False)

bar_panels = (
    (ax[0], forest_importances, 'LR Model, 1993 - 2022'),
    (ax[1], forest_importances_extra, 'LR+F Model, 1993 - 2022'),
)
for axis, importances, title in bar_panels:
    importances.plot.bar(ax=axis, width=0.8)
    axis.set_title(title, fontsize='large')
    axis.set_xlabel('Variable')
    axis.set_ylabel('Importance')

fig.tight_layout()
fig.savefig(f'{RESULTS_PATHNAME}/Mean_VIF.png', dpi=300)

In [None]:
# Step plots of the five variables with the highest median importance, per model.
fig, ax = plt.subplots(2, 1, figsize=(10, 7))
fig.suptitle('Most Important Variables Over Time', fontsize='x-large')

# Transpose so each variable is a column, then rank variables by median.
# The ranking is computed BEFORE the dummy row is appended.
combined_T = vif_combined.transpose()
combined_medians = combined_T.median(axis=0).sort_values(ascending=False)
# Dummy 2023 row (copy of the last row) so the final step is drawn with
# 'steps-post'. todo: not hardcode this
combined_T.loc[2023] = combined_T.iloc[-1]

combined_extra_T = vif_combined_extra.transpose()
combined_extra_medians = combined_extra_T.median(axis=0).sort_values(ascending=False)
combined_extra_T.loc[2023] = combined_extra_T.iloc[-1]

time_panels = (
    (ax[0], combined_T, combined_medians, 'LR Strategy'),
    (ax[1], combined_extra_T, combined_extra_medians, 'LR+F Strategy'),
)
for axis, by_period, medians, title in time_panels:
    top_five = medians.index[:5]
    by_period[top_five].plot(ax=axis, drawstyle='steps-post')
    axis.set_title(title, fontsize='large')
    axis.legend(title='Variable', bbox_to_anchor=(1.12, 1.05), prop={'size': 8})
    axis.set_xlabel('Time')
    axis.set_ylabel('Importance')
    axis.set_xlim(1993, 2023)

fig.tight_layout()
fig.savefig(f'{RESULTS_PATHNAME}/VIF_over_time.png', dpi=300)


### Aggregation

In [None]:
# Whole-sample tables: mean prediction accuracy per K, and annualised mean
# return / standard deviation per K, for each strategy variant.
n = pd.Timestamp('1993-01-01')  # first date included in the aggregates
# n = t_cost_rets_extra_df.index.get_indexer([date],method='bfill')[0]

# --- prediction accuracy ---------------------------------------------------
sdf = {
    'LR': accuracies.loc[n:].mean(),
    'LR+F': accuracies_extra.loc[n:].mean(),
}
sdf_df = pd.DataFrame.from_dict(sdf).transpose()
for col in sdf_df:
    sdf_df[col] = sdf_df[col].map('{:,.2%}'.format).astype(str)
display(sdf_df.transpose())

# --- annualised return statistics -------------------------------------------
# Insertion order here defines the row order of the table.
variants = {
    'LR Before T.C.': raw_rets_df,
    'LR+F Before T.C.': raw_rets_extra_df,
    'LR After T.C.': t_cost_rets_df,  # trailing period added to match the other labels
    'LR+F After T.C.': t_cost_rets_extra_df,
}
rdf = {}
for label, frame in variants.items():
    window = frame.loc[n:]
    rdf[f'{label} Mean Return'] = (1 + window.mean()) ** F - 1  # compounded over F periods
    rdf[f'{label} Std Dev'] = window.std() * np.sqrt(F)         # square-root-of-time scaling
rdf_df = pd.DataFrame.from_dict(rdf).transpose()
rdf_df.index = pd.MultiIndex.from_product([list(variants), ['Mean Return', 'Std Dev']])
for col in rdf_df:
    rdf_df[col] = rdf_df[col].map('{:,.2%}'.format).astype(str)
display(rdf_df.transpose())

# "\\hline" is escaped explicitly; a bare "\h" is an invalid escape sequence.
print(sdf_df.transpose().to_latex(multirow=True, multicolumn=True).replace("\\\n", "\\ \\hline\n"))
print(rdf_df.to_latex(multirow=True, multicolumn=True).replace("\\\n", "\\ \\hline\n"))

In [None]:
# Grouped bars of the annualised mean after-cost return per sub-period
# (one bar per column of the return frames), LR on the left, LR+F on the right.
fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=True)
fig.suptitle('Returns by Sub-period: h=3 Configuration', fontsize='x-large')

rets_periods = pd.DataFrame.from_dict({
    key: (1 + t_cost_rets_df.loc[start:end].mean()) ** F - 1
    for key, (start, end) in SUB_PERIODS.items()
})
rets_extra_periods = pd.DataFrame.from_dict({
    key: (1 + t_cost_rets_extra_df.loc[start:end].mean()) ** F - 1
    for key, (start, end) in SUB_PERIODS.items()
})

rets_periods.transpose().plot.bar(ax=ax[0], legend=False)
rets_extra_periods.transpose().plot.bar(ax=ax[1])
ax[0].set_ylabel('Mean Annual Return')

for axis, title in zip(ax, ('LR Strategy', 'LR+F Strategy')):
    axis.set_title(title, fontsize='large')
    axis.set_ylim(-.4, 1.7)
    axis.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    axis.legend()

fig.autofmt_xdate()
fig.tight_layout()
fig.savefig(f'{RESULTS_PATHNAME}/mean_ret_k3.png', dpi=300)


In [None]:
# Whole-sample cumulative log returns after costs, with a dashed vertical line
# and a number marking the start of every sub-period.
fig, ax = plt.subplots(figsize=(10, 4))
fig.suptitle('Cumulative Returns: h=3 Configuration', fontsize='x-large')

lr_ts = np.cumsum(np.log(t_cost_rets_df['K=10'] + 1))
lrf_ts = np.cumsum(np.log(t_cost_rets_extra_df['K=10'] + 1))
spx_ts = np.cumsum(np.log(ff_df['MKT_Actual'].loc[period_start:period_end] + 1))
for curve, label in ((lr_ts, 'LR'), (lrf_ts, 'LR+F'), (spx_ts, 'MKT')):
    curve.plot(ax=ax, label=label)

for number, (start, _end) in enumerate(SUB_PERIODS.values(), start=1):
    ax.axvline(x=start, color='black', linewidth=1, ls='--')
    # y=6.2 sits just above the y-limit of 6 set below.
    ax.text(start - pd.Timedelta(days=50), 6.2, f'{number}', fontsize=12)

# The dashed lines carry no label, so add a proxy handle for the legend.
handles, labs = ax.get_legend_handles_labels()
handles.append(mpl.lines.Line2D([0], [0], label='Sub-period', color='black', ls='--'))

ax.set_xlabel('Time')
ax.set_ylabel('Cum. Log Return')
ax.legend(handles=handles)
ax.set_ylim(0, 6)

fig.tight_layout()
fig.savefig(f'{RESULTS_PATHNAME}/whole_subperiod_summary_TRIPLE3.png', dpi=300)

In [None]:
# LaTeX table of the ten variables with the highest MEAN importance per model
# (the bar-chart cell ranks by median instead).
forest_importances = pd.Series(vif_combined.mean(axis=1), index=lag_labels).sort_values(ascending=False)
forest_importances_extra = pd.Series(vif_combined_extra.mean(axis=1), index=labels_extra).sort_values(ascending=False)

imp_c = pd.DataFrame.from_dict({
    'LR': forest_importances.index[:10],
    'LR+F': forest_importances_extra.index[:10],
})
imp_c.columns = pd.MultiIndex.from_product([['Three Day Holding'], ['LR', 'LR+F']])

# "\\hline" is escaped explicitly; a bare "\h" is an invalid escape sequence.
print(imp_c.to_latex(multirow=True, multicolumn=True).replace("\\\n", "\\ \\hline\n"))

In [None]:
# 2x2 grid of prediction-accuracy step plots, one panel per K.
f_date = pd.to_datetime('2023-01-01')
k_keys = ['K=5', 'K=10', 'K=50', 'K=100']

fig, axs = plt.subplots(2, 2, figsize=(10, 4), sharey=True)
fig.suptitle('Annualized Prediction Accuracy: h=3 Configuration', fontsize='x-large')

# Copies with the last row repeated at f_date, so the final 'steps-post'
# segment is drawn. They do not depend on K, so build them once.
acc_temp = accuracies.copy()
acc_temp.loc[f_date] = accuracies.iloc[-1]
acc_extra_temp = accuracies_extra.copy()
acc_extra_temp.loc[f_date] = accuracies_extra.iloc[-1]

for idx, (ax, k) in enumerate(zip(axs.flat, k_keys)):
    ax.plot(acc_temp[k], label='LR Model', drawstyle='steps-post')
    ax.plot(acc_extra_temp[k], label='LR+F Model', drawstyle='steps-post')

    ax.legend(fontsize=10)
    ax.set_xlabel(None)
    # Only the left-hand column (flat positions 0 and 2) gets a y-label.
    if idx in (0, 2):
        ax.set_ylabel('Prediction Accuracy')
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    ax.set_xlim(acc_temp.index[0], acc_temp.index[-1])
    ax.set_title(f'{k}', fontsize='large')

fig.tight_layout()
fig.savefig(f'{RESULTS_PATHNAME}/pred_acc_combined3.png', dpi=300)
    

In [None]:
# 40-observation moving average of turnover for both strategies, coloured by
# sub-period, with a black horizontal line at each sub-period's raw mean.
# NOTE(review): the title says "Daily ... (2-mo MA)", but the return frames
# keep only every 3rd row (see the data-loading cell), so a 40-row window
# covers about 120 source rows -- confirm the intended wording.
fig, ax = plt.subplots(2, 1, figsize=(10, 5))
fig.suptitle('Daily Portfolio Turnover (2-mo MA)', fontsize='x-large')

# Cost charged per unit of turnover; presumably 2 * T_COST (a full round trip)
# -- confirm. Dividing a period's cost by it yields the turnover rate.
COST_PER_UNIT_TURNOVER = .002
MA_WINDOW = 40

t_costs = (raw_rets_df - t_cost_rets_df)['K=10']
t_costs_extra = (raw_rets_extra_df - t_cost_rets_extra_df)['K=10']

for key, (start, end) in SUB_PERIODS.items():
    for axis, costs in ((ax[0], t_costs), (ax[1], t_costs_extra)):
        costs_cut = costs.loc[start:end]
        # The rolling window restarts in every sub-period, so each segment
        # begins with MA_WINDOW - 1 missing values.
        costs_cut.div(COST_PER_UNIT_TURNOVER).rolling(MA_WINDOW).mean().plot(ax=axis, label=f'{key}')
        axis.hlines(costs_cut.mean() / COST_PER_UNIT_TURNOVER, start, end, color='black', linewidth=1.5)

for axis, title in zip(ax, ('LR Model', 'LR+F Model')):
    axis.set_title(title)
    axis.legend(bbox_to_anchor=(1.02, 1.05))
    axis.set_ylabel('Turnover Rate')
    axis.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

fig.tight_layout()
# NOTE(review): the un-smoothed turnover cell saves to this same filename, so
# whichever cell runs last overwrites the other's figure.
fig.savefig(f'{RESULTS_PATHNAME}/turnover_periods_new.png', dpi=300)
print(t_costs.mean() / COST_PER_UNIT_TURNOVER, t_costs_extra.mean() / COST_PER_UNIT_TURNOVER)