In [215]:
import pandas as pd
import numpy as np
import itertools
from sklearn.ensemble import GradientBoostingClassifier
from joblib import Parallel, delayed
from tqdm import tqdm
import time 

In [216]:
def fitGBDT(train_df: pd.DataFrame,
            lhs_col: str,
            rhs_cols: list,
            hps: dict) -> GradientBoostingClassifier: 
    ''' Fit GBDT on given training data using RHS columns to predict LHS column with given hps.

    Rows are weighted linearly by their position in `train_df`: later rows receive larger
    weights (presumably rows are sorted oldest-to-newest -- TODO confirm with the caller).

    Args:
        train_df (pd.DataFrame): training data; must contain at least two rows.
        lhs_col (str): LHS column to predict.
        rhs_cols (list): list of strings of RHS features.
        hps (dict): key value pairs for hyperparameters to set including: `learning_rate`, 
                    `num_estimator`, `subsample`, `min_samp_split`, `max_depth`, and `max_feature`.

    Returns:
        (GradientBoostingClassifier): fitted model.

    Raises:
        ValueError: if `train_df` has fewer than two rows, since the linearly spaced
                    sample weights cannot be formed.
    '''
    # obtain the rhs and lhs data
    train_rhs   = train_df[rhs_cols].values.astype('float32')
    train_lhs   = train_df[lhs_col].values.reshape(-1).astype('int')

    # the weights below need a second element and a non-zero sum, so fail early and clearly
    num_samples = train_lhs.shape[0]
    if num_samples < 2:
        raise ValueError('fitGBDT needs at least two training rows to build sample weights; '
                         'got ' + str(num_samples))

    # deterministic seed derived from the hyperparameters; int() truncates the product,
    # so every combination whose product is below one shares seed 0
    seed = int(hps['learning_rate']
        *hps['num_estimator']
        *hps['subsample']
        *hps['min_samp_split']
        *hps['max_depth']
        *hps['max_feature'])

    # build the model
    model = GradientBoostingClassifier(loss='log_loss',
        learning_rate=hps['learning_rate'],
        n_estimators=hps['num_estimator'],
        subsample=hps['subsample'],
        min_samples_split=hps['min_samp_split'],
        max_depth=hps['max_depth'],
        max_features=hps['max_feature'],
        verbose=0,
        random_state=seed)

    # calculate sample weights as linearly spaced from 0 to max value s.t. it sums to one
    weights = np.arange(0, num_samples)
    weights = weights/np.sum(weights)
    # move half of the second-smallest weight from the last row to the first row so the
    # first row's weight is not exactly zero while the total still sums to one
    epsilon = weights[1]/2
    weights[0] = epsilon
    weights[-1] -= epsilon

    # fit
    model.fit(X=train_rhs, 
        y=train_lhs, 
        sample_weight=weights)

    return model

def genGBDTYhats(oos_df: pd.DataFrame, 
    model: GradientBoostingClassifier, 
    lhs_col: str, 
    rhs_cols: list) -> np.array: 
    """ Score out of sample rows with a fitted gradient boosting classifier.

    Args:
        oos_df (pd.DataFrame): out of sample data to predict on.
        model (GradientBoostingClassifier): trained gradient boosting model.
        lhs_col (str): name of the column containing the label data (not read here).
        rhs_cols (list): column names containing the feature data.

    Returns:
        (np.array): one predicted probability per row of `oos_df`, taken from the
                    second column of `model.predict_proba`.
    """
    # features are cast to float32, matching how the training features are prepared
    features = oos_df[rhs_cols].values.astype('float32')

    # predict_proba gives one column per class; index 1 is the second class
    # (the probability of label 1 when the labels are 0/1)
    class_probs = model.predict_proba(features)
    second_class = 1
    return class_probs[:, second_class]

In [270]:
def runCV(df: pd.DataFrame, 
    val_start_date: str,
    num_cpus: int,
    arch_name: str, 
    out_csv_fp: str) -> list:
    ''' Run step-forward cross validation to select optimal hyperparameters for target model 
        returning fitted yhats and models as well as outputting results to csv.

    For every hyperparameter combination and every validation date, a model is fit on all
    rows dated strictly before that date and used to predict the rows on that date.

    Args:
        df (pd.DataFrame): panel of training and val data with date index, LHS variable labeled `y`, 
                           and remaining cols are RHS features. The validation period runs from
                           `val_start_date` through the last date in `df`. The label slicing
                           assumes the index is sorted by date -- TODO confirm with the caller.
        val_start_date (str): the first date for the validation period.
        num_cpus (int): number of cpus to use when parallelizing.
        arch_name (str): name of architecture to use when saving intermittent results.
        out_csv_fp (str): filepath to output the csv file without `.csv' on end. The file is
                          written to `<out_csv_fp>_<arch_name>_<timestamp>.csv`; the timestamp
                          is fixed once per call so the same file is rewritten with the
                          cumulative results after every hyperparameter combination.
          
    Returns:
        (list): list of dictionaries for each hyperparameter fit where each list contains keys of:
                    `hps` is dict of hyperparameter combination,
                    `yhats` is an array of validation period yhats, and,
                    `models` is list of fitted models.  

    Raises:
        ValueError: if the number of yhats for a combination does not match the number of
                    validation rows, since the accuracy would then be meaningless.
    '''
    # initialize args
    results_list = []
    csv_dict_list = []
    lhs_col  = 'y'
    rhs_cols = list(df.columns.values)
    rhs_cols.remove(lhs_col)

    # validation-period quantities do not depend on the hyperparameters, so form them once
    val_panel = df[val_start_date:]
    val_dates = np.unique(val_panel.index.values)
    val_ys = val_panel[lhs_col].values
    val_end_date = np.datetime_as_string(np.max(df.index.values))[:10]

    # one csv per call; it is rewritten with the growing result table to monitor during cv
    timestr = time.strftime("%Y%m%d_%H%M%S")
    fp = out_csv_fp+'_'+arch_name+'_'+timestr+'.csv'

    # initialize hp grid for gbdt
    learning_rates  = [5e-2, 1e-3, 1e-4] # TODO wide grid and then fine grid
    num_estimators  = [100, 250]  # TODO do big grid # TODO do fine grid
    subsamples      = [0.99] # TODO big grid and then fine grid
    min_samp_splits = [0.001, 0.1,  0.5] 
    max_depths      = [2, 3, 4] # TODO do deeper
    max_features    = [0.1, 0.2, 0.3] # TODO FINE GRID
    hp_names = ('learning_rate', 'num_estimator', 'subsample',
                'min_samp_split', 'max_depth', 'max_feature')

    def fitAndPredictValDate(val_date, hp_dict):
        ''' Fit on all rows before `val_date` and predict the rows on `val_date`. '''
        # form train and val data
        train_df = df[df.index < val_date].copy()
        val_df   = df[df.index == val_date].copy()

        # fit model and generate yhats for this val date
        model = fitGBDT(train_df, lhs_col, rhs_cols, hps=hp_dict)
        yhats = genGBDTYhats(val_df, model, lhs_col, rhs_cols)

        return yhats, model

    # loop over all hp combos
    for hps in itertools.product(learning_rates,
                                 num_estimators,
                                 subsamples,
                                 min_samp_splits,
                                 max_depths, 
                                 max_features):
        # initialize args
        results_dict = {'hps': dict(zip(hp_names, hps))}
        print(results_dict['hps'], '\n') # monitor progress

        # fit model on all val dates
        tic = time.perf_counter()
        val_results = Parallel(n_jobs=num_cpus)(
            delayed(fitAndPredictValDate)(val_date, results_dict['hps'])
            for val_date in tqdm(val_dates))

        # extract validation periods results
        results_dict['yhats'] = np.array([res[0] for res in val_results])
        results_dict['models'] = [res[1] for res in val_results]

        # save results to master result list
        results_list.append(results_dict)

        # save results to csv to monitor during cv
        toc = time.perf_counter()

        # threshold the probabilities at 0.5 and compare against the realized labels
        yhat_labels = np.where(results_dict['yhats'] > 0.5, 1, 0).reshape(-1)
        if yhat_labels.shape[0] != val_ys.shape[0]:
            raise ValueError('number of yhats (' + str(yhat_labels.shape[0])
                             + ') does not match number of validation rows ('
                             + str(val_ys.shape[0]) + ')')

        csv_dict = results_dict['hps'].copy()
        csv_dict['arch_name'] = arch_name
        csv_dict['val_start_date'] = val_start_date
        csv_dict['val_end_date'] = val_end_date
        csv_dict['runtime_mins'] = round((toc - tic)/60, 0) 
        csv_dict['accuracy'] = np.sum(yhat_labels == val_ys)/val_ys.shape[0]
        csv_dict_list.append(csv_dict)
        cv_df = pd.DataFrame(csv_dict_list)
        cv_df.to_csv(fp, index=False)

        # output results to track
        print(csv_dict['runtime_mins'])
        print('accuracy:'+str(csv_dict['accuracy']))
        print('\n\n\n')

    return results_list

In [271]:
if __name__ == "__main__":
    # input panel and filepath prefix for the cv result csv
    in_fp = '../1-data/clean/bars_btceth_6hour.pkl'
    cv_out_fp = '../3-output/cv_results'

    # read the panel, index it by date, cast to float32, and drop the raw return
    # difference column so that it is not used as a RHS feature
    df = (pd.read_pickle(in_fp)
            .set_index('date')
            .astype('float32')
            .drop('y_btc_eth_diff_r_tp5_tp370', axis=1))

    # keep training+validation rows only (everything through 2022-03-31)
    df = df[:'2022-03-31']

    # run the cv
    cv_kwargs = dict(val_start_date='2021-07-31',
                     num_cpus=20,
                     arch_name='gbdt',
                     out_csv_fp=cv_out_fp)
    cv_results = runCV(df, **cv_kwargs)


{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.001, 'max_depth': 2, 'max_feature': 0.1} 




[A
  9%|▉         | 99/1096 [18:12<3:03:20, 11.03s/it]
 59%|█████▉    | 579/976 [08:20<05:43,  1.16it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 976/976 [02:54<00:00,  5.59it/s]


3.0
accuracy:0.5235655737704918




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.001, 'max_depth': 2, 'max_feature': 0.2} 



100%|██████████| 976/976 [05:36<00:00,  2.90it/s]


6.0
accuracy:0.5184426229508197




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.001, 'max_depth': 2, 'max_feature': 0.3} 



100%|██████████| 976/976 [08:20<00:00,  1.95it/s]


9.0
accuracy:0.5225409836065574




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.001, 'max_depth': 3, 'max_feature': 0.1} 



100%|██████████| 976/976 [04:09<00:00,  3.91it/s]


4.0
accuracy:0.492827868852459




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.001, 'max_depth': 3, 'max_feature': 0.2} 



100%|██████████| 976/976 [08:13<00:00,  1.98it/s]


9.0
accuracy:0.5010245901639344




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.001, 'max_depth': 3, 'max_feature': 0.3} 



100%|██████████| 976/976 [12:15<00:00,  1.33it/s]


13.0
accuracy:0.5174180327868853




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.001, 'max_depth': 4, 'max_feature': 0.1} 



100%|██████████| 976/976 [05:25<00:00,  3.00it/s]


6.0
accuracy:0.5153688524590164




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.001, 'max_depth': 4, 'max_feature': 0.2} 



100%|██████████| 976/976 [10:45<00:00,  1.51it/s]


11.0
accuracy:0.507172131147541




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.001, 'max_depth': 4, 'max_feature': 0.3} 



100%|██████████| 976/976 [16:03<00:00,  1.01it/s]


17.0
accuracy:0.514344262295082




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.1, 'max_depth': 2, 'max_feature': 0.1} 



100%|██████████| 976/976 [02:53<00:00,  5.62it/s]


3.0
accuracy:0.5040983606557377




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.1, 'max_depth': 2, 'max_feature': 0.2} 



100%|██████████| 976/976 [05:36<00:00,  2.90it/s]


6.0
accuracy:0.5133196721311475




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.1, 'max_depth': 2, 'max_feature': 0.3} 



100%|██████████| 976/976 [08:17<00:00,  1.96it/s]


9.0
accuracy:0.492827868852459




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.1, 'max_depth': 3, 'max_feature': 0.1} 



100%|██████████| 976/976 [04:07<00:00,  3.94it/s]


4.0
accuracy:0.5297131147540983




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.1, 'max_depth': 3, 'max_feature': 0.2} 



100%|██████████| 976/976 [08:05<00:00,  2.01it/s]


9.0
accuracy:0.5225409836065574




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.1, 'max_depth': 3, 'max_feature': 0.3} 



100%|██████████| 976/976 [12:05<00:00,  1.35it/s]


13.0
accuracy:0.5235655737704918




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.1, 'max_depth': 4, 'max_feature': 0.1} 



100%|██████████| 976/976 [05:14<00:00,  3.10it/s]


6.0
accuracy:0.5010245901639344




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.1, 'max_depth': 4, 'max_feature': 0.2} 



100%|██████████| 976/976 [10:28<00:00,  1.55it/s]


11.0
accuracy:0.5112704918032787




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.1, 'max_depth': 4, 'max_feature': 0.3} 



100%|██████████| 976/976 [15:35<00:00,  1.04it/s]


17.0
accuracy:0.5102459016393442




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.5, 'max_depth': 2, 'max_feature': 0.1} 



100%|██████████| 976/976 [02:42<00:00,  6.02it/s]


3.0
accuracy:0.5153688524590164




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.5, 'max_depth': 2, 'max_feature': 0.2} 



100%|██████████| 976/976 [05:15<00:00,  3.09it/s]


6.0
accuracy:0.5194672131147541




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.5, 'max_depth': 2, 'max_feature': 0.3} 



100%|██████████| 976/976 [07:51<00:00,  2.07it/s]


8.0
accuracy:0.5133196721311475




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.5, 'max_depth': 3, 'max_feature': 0.1} 



100%|██████████| 976/976 [03:37<00:00,  4.48it/s]


4.0
accuracy:0.5163934426229508




{'learning_rate': 0.05, 'num_estimator': 100, 'subsample': 0.99, 'min_samp_split': 0.5, 'max_depth': 3, 'max_feature': 0.2} 



  8%|▊         | 80/976 [00:25<05:15,  2.84it/s]

##### Backtest:
    - fit GBDT predicting long or short
    - execute an expanding window CV across all GBDT hyperparameters
        - initialize training data as 2016 thru Q2 2021 and validation data is Q3 2021 through Q1 2022
        - fit on training data and predict on first observation of validation data
        - reset training data to include the next row from the validation data and re-fit+predict.
        - generate yhats for the entire validation period.
    - fit lasso in sample way across 10 different lambdas, take the lasso that performs best, and fit a linear regression on those features as a benchmark for the GBDT to outperform; use the linear regression if the GBDT does not outperform it.
    - portfolio optimization: 
        - ex post take all the yhats to form a trading rule mapping from this vector of yhats to vector of 5,4,3,2,1,0,-1,-2,-3,-4,-5 for 5x long to 5x short. 
        - so basically where do i draw these 10 lines in my dist of 0 to 1? 
        - max returns or max sharpe ratio by picking these 10 points, monotonically, between 0 and 1 to create vector of positions. 
        - adjust all of this to take out transaction costs both open+close costs as well as margin costs. maybe these two costs are just vectors as well.
    - calc avg return at 6 hour freq, total return over val period, geom avg return annualized, and sd of simple returns annualized.
    - select highest geom avg return and also one with highest sharpe
    - confirm both have geom avg return statistically different from zero.
        - bootstrap some portfolio return s.e. to calc t stat for backtest return to see if stat sig diff from zero. 
        - given my benchmark is zero. 
        - ensure my validation period is big enough that i could get sig result, i.e. small enough S.E. just randomly generate 10k vectors of positions to calc geom average return; form empirical dist; take standard deviation of this dist to get the s.e. 
        - adjust all of this to take out transaction costs
    - for selected model, go fit in true out of sample period of Q2-Q4 2022 to confirm again geom avg return is stat different from zero.

All of the below code is old code that I will delete once I have completed this backtest file.

In [None]:
# TODO SOME BOOTSTRAP CODE TO USE LATER

# draw `bs_size` random position vectors, each of length `periods`, from the allowed
# leverage levels; draw number i is seeded with i so that it is reproducible
bs_size = 100
periods = 1096
positions = [-5,-4,-3,-2,-1,0,1,2,3,4,5]
portfolios_list = []
for seed in range(bs_size):
    np.random.seed(seed)
    draw = np.random.choice(positions, size=periods)
    portfolios_list.append(draw)

# calc returns
# NOTE(review): this reads `date` and `y_btc_eth_diff_r_tp5_tp370` as columns of `df` and
# needs exactly `periods` rows in the window -- confirm which `df` this cell should run on
in_window = (df.date >= '2021-07-01') & (df.date < '2022-04-01')
btc_eth_diff_array = df[in_window].y_btc_eth_diff_r_tp5_tp370.values
returns_list = [np.prod(1+portfolio*btc_eth_diff_array)**(1/periods)-1
                for portfolio in portfolios_list]
# calc s.e. of dist
# think thru how to confirm this is enough data

In [1]:
# IMPORT PACKAGES
import pandas as pd
import numpy as np
import itertools
import time
import pickle
from dateutil.relativedelta import relativedelta
from datetime import datetime
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from tqdm import tqdm


In [None]:
def fitOLS(train_df, lhs_col, rhs_cols):
    ''' Fit OLS with an intercept by solving the normal equations.

    Args:
        train_df (pd.DataFrame): training data.
        lhs_col (str or list): LHS column to predict; its values are flattened and cast to int.
        rhs_cols (list): list of strings of RHS features.

    Returns:
        (np.array): coefficient vector of length len(rhs_cols)+1 whose first entry is the
                    intercept and remaining entries follow the order of `rhs_cols`.

    Raises:
        np.linalg.LinAlgError: if X'X is singular.
    '''
    # obtain the training input and output data
    train_rhs   = train_df[rhs_cols].values.astype('float32')
    train_lhs   = train_df[lhs_col].values.reshape(-1).astype('int')

    # add constant
    train_rhs = np.concatenate((np.ones([len(train_lhs),1]), train_rhs),
                               axis=1)

    # fit: solve (X'X) b = X'y directly instead of forming the explicit inverse, which is
    # less accurate when the features are close to collinear
    gram   = np.matmul(train_rhs.T, train_rhs)
    moment = np.matmul(train_rhs.T, train_lhs)
    betahat = np.linalg.solve(gram, moment)

    return betahat


def genOLSYhats(oos_df, model, lhs_col, rhs_cols):    
    ''' Generate OLS predictions for every row of the out of sample data.

    Args:
        oos_df (pd.DataFrame): out of sample data; its index values become the `date` column.
        model (np.array): coefficient vector with the intercept first, followed by one
                          coefficient per entry of `rhs_cols`.
        lhs_col (str or list): name of the single column containing the realized label.
        rhs_cols (list): column names containing the feature data.

    Returns:
        (pd.DataFrame): one row per row of `oos_df` with columns `date`, `y` (realized
                        label), and `yhats` (prediction).
    '''
    # obtain the RHS data and the realized labels
    oos_rhs   = oos_df[rhs_cols].values.astype('float32')
    oos_lhs   = oos_df[lhs_col].values.reshape(-1)
    
    # add constant; keep the design matrix two dimensional so that any number of rows
    # can be predicted at once
    oos_rhs   = np.concatenate((np.ones([len(oos_lhs),1]), oos_rhs),
                                axis=1)
    
    # form the yhats: one dot product of the coefficients with each row
    yhats = np.matmul(oos_rhs, model)
    
    # return results
    return pd.DataFrame(data={'date': oos_df.index.values,
                              'y': np.array(oos_lhs),
                              'yhats': yhats})

In [None]:
def fitLasso(train_df, hps_yhats_dict, lhs_col, rhs_cols):
    ''' Fit a lasso on standardized features.

    Args:
        train_df (pd.DataFrame): training data.
        hps_yhats_dict (dict): must contain `alpha`, the lasso penalty.
        lhs_col (str or list): LHS column to predict; its values are flattened and cast to int.
        rhs_cols (list): list of strings of RHS features.

    Returns:
        (tuple): the fitted lasso model, an array with 1 where the fitted coefficient is
                 non-zero and 0 otherwise, and the scaler fitted on the training features.
    '''
    # penalty for this fit
    penalty = hps_yhats_dict['alpha']

    # training features and labels
    features = train_df[rhs_cols].values.astype('float32')
    labels   = train_df[lhs_col].values.reshape(-1).astype('int')

    # standardize; the fitted scaler is returned so the same transform can be reused later
    scaler = StandardScaler()
    features_std = scaler.fit_transform(features)

    # fit
    model = linear_model.Lasso(alpha=penalty, selection='cyclic')
    model.fit(features_std, labels)

    # flag which coefficients the lasso kept
    used_coefs = 1*(model.coef_ != 0)

    return model, used_coefs, scaler

def genLassoYhats(oos_df, model, lhs_col, rhs_cols, scaler):    
    ''' Generate lasso predictions for the out of sample data.

    Args:
        oos_df (pd.DataFrame): out of sample data; its index values become the `date` column.
        model: fitted model exposing `predict`.
        lhs_col (str or list): name of the single column containing the realized label.
        rhs_cols (list): column names containing the feature data.
        scaler: fitted scaler exposing `transform`; applied to the features before predicting.

    Returns:
        (pd.DataFrame): one row per row of `oos_df` with columns `date`, `y` (realized
                        label), and `yhats` (prediction).
    '''
    # obtain the RHS data and standardize it with the scaler that was passed in
    oos_rhs   = oos_df[rhs_cols].values.astype('float32')
    oos_rhs   = scaler.transform(oos_rhs)
    
    # form the yhats
    yhats = model.predict(oos_rhs)
    
    # return results; the realized label is read from `lhs_col` without any dtype cast
    return pd.DataFrame(data={'date': oos_df.index.values,
                              'y': np.array(oos_df[lhs_col].values.reshape(-1)),
                              'yhats': yhats})

In [None]:
def runCV(df, last_train_year=2018, val_end_year=2021, arch_name=None, num_cpus=10):
    ''' Run rolling-window OLS cross validation over a grid of training window sizes.

    For every window size and every validation date, OLS is fit on the last `window_size`
    rows dated strictly before the validation date and used to predict the rows on that date.

    NOTE(review): this definition reuses the name `runCV` of the GBDT cross validation
    function defined earlier in this notebook and replaces it once this cell is run.

    Args:
        df (pd.DataFrame): panel with a datetime index, LHS column `y`, and remaining cols as
                           RHS features.
        last_train_year (int): validation dates are those with index year after this year.
        val_end_year (int): validation dates are those with index year up to and including this year.
        arch_name (str): only stored in the per-combination summary dict, which is built but
                         not saved because the csv output code is commented out.
        num_cpus (int): number of cpus to use when parallelizing.

    Returns:
        (list): one dict per window size with keys `window_size`, `results_df` (date, y, and
                yhats for all validation dates), and `beta_hats` (OLS coefficients averaged
                over the validation dates).
    '''
    #Initialize hp result objects
    results_list    = []
    hps_mse_df_list = []

    # determine columns
    all_cols = list(df.columns.values)
    lhs_col  = ['y']
    all_cols.remove('y')
    rhs_cols = all_cols
    assert(df.shape[1]==(len(lhs_col)+len(rhs_cols))) # lhs col + rhs cols

    # Initialize the hyperparameter grid for GBDT
    # (not read by the active code below; only the commented out GBDT loop used these)
    learning_rates  = [0.1, 0.05, 0.01] # 0.1, 0.05, 0.01 # [5e-2, 5e-3, 5e-4, 5e-5]
    num_ests        = [250] # [100, 250, 500, 1000] 
    subsamples      = [0.99]  # [0.95, 0.99, 0.999]
    min_samp_splits = [0.005] # [0.001, 0.005, 0.01]
    max_depths      = [2, 3, 4] # [2, 3, 4, 5]
    max_features    = [0.1, 0.15, 0.2] # [0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25]

    # Initialize the hyperparameter grid for Lasso
    # (not read by the active code below)
    alphas          = [5e-3] # [1e-2, 5e-3, 1e-3, 5e-4, 1e-4, 5e-5, 1e-5]

    # Initialize hyperparameter grid for OLS
    window_sizes    = [1000, 5000, 10000, 20000, 35000, 50000, 65000]

    # Determine the dates in the validation window  
    val_dates = np.unique(df[(df.index.year > last_train_year)  
                             & (df.index.year <= val_end_year)].index.values) 


    # Generate yhats for every hyperparameter grid point
    #     for hps in itertools.product(learning_rates,
    #                                  num_ests,
    #                                  subsamples,
    #                                  min_samp_splits,
    #                                  max_depths, 
    #                                  max_features):
    #         # build the hp and yhats dictionary
    #         hps_yhats_dict = {'learning_rate': hps[0],
    #                           'num_est': hps[1],
    #                           'subsample': hps[2],
    #                           'min_samp_split': hps[3],
    #                           'max_depth': hps[4],
    #                           'max_feature': hps[5],
    #                           'results_df': pd.DataFrame()}
    # itertools.product over a single list yields 1-tuples, hence hps[0] below
    for hps in itertools.product(window_sizes):
        hps_yhats_dict = {'window_size': hps[0],
                          'results_df': pd.DataFrame()}

        # fit on all val dates
        print(hps_yhats_dict, '\n')
        tic = time.perf_counter()
        def loopOverValDates(val_date):
            # Form train and validation data frames
            # train on the last `window_size` rows strictly before the validation date
            window_size = hps_yhats_dict['window_size']
            temp_df  = df[lhs_col+rhs_cols][df.index <= val_date]
            train_df = temp_df[temp_df.index < val_date][-window_size:]
            val_df   = temp_df[temp_df.index == val_date]

            # fit model and generate yhats for val week
            model          = fitOLS(train_df, lhs_col, rhs_cols)
            val_results_df = genOLSYhats(val_df, model, lhs_col, rhs_cols)

            # Return this date's results
            return val_results_df, model

        # Fit on all the val dates
        val_results = Parallel(n_jobs=num_cpus)(delayed(loopOverValDates)(val_date) for val_date in tqdm(val_dates))

        # Extract val results
        # stack the per-date predictions and average the per-date coefficient vectors
        val_results_df = pd.DataFrame()
        beta_hats       = np.zeros((len(rhs_cols)+1))
        for j in range(len(val_results)):
            val_results_df = pd.concat((val_results_df, val_results[j][0]))
            beta_hats      += val_results[j][1]
        hps_yhats_dict['results_df'] = val_results_df
        hps_yhats_dict['beta_hats']  = beta_hats/len(val_dates)



#         def loopOverValDates(val_date):
#             # Form train and validation data frames
#             temp_df  = df[lhs_col+rhs_cols][df.index <= val_date]
#             train_df = temp_df[temp_df.index < val_date]
#             val_df   = temp_df[temp_df.index == val_date]

#             # Fit model and generate yhats for val week
#             model          = fitGBDT(train_df, hps_yhats_dict, lhs_col, rhs_cols)
#             val_results_df = genYhats(val_df, model, lhs_col, rhs_cols)

#             # Obtain training data accuracy for studying over/underfit
#             train_acc = model.score(train_df[rhs_cols].values, train_df['y'].values)

#             # Return this date's results
#             return val_results_df, train_acc

#         # Fit on all the val dates
#         val_results = Parallel(n_jobs=num_cpus)(delayed(loopOverValDates)(val_date) for val_date in tqdm(val_dates))

#         # Extract val results
#         val_results_df = pd.DataFrame()
#         train_accs     = []
#         for j in range(len(val_results)):
#             val_results_df = pd.concat((val_results_df, val_results[j][0]))
#             train_accs.append(val_results[j][1])
#         hps_yhats_dict['results_df'] = val_results_df
#         hps_yhats_dict['avg_train_acc'] = np.mean(train_accs)

        # Save run time and space out result print out
        toc = time.perf_counter()
        print('\n\n\n')

        # Update this HP point's results and append to list of results
        # ys    = hps_yhats_dict['results_df'].y.values
        # yhats = (hps_yhats_dict['results_df'].yhats.values>0)*1
        # yhats[yhats==0] = -1
        # oos_acc = np.sum(ys==yhats)/len(yhats)
        # hps_yhats_dict['oos_acc'] = oos_acc
        results_list.append(hps_yhats_dict)

        # Save the ongoing results as a csv to be able to watch
        # NOTE(review): the csv write further down is commented out, so this summary dict
        # is built but never saved or returned
        cv_results_dict = hps_yhats_dict.copy()
        del cv_results_dict['results_df']
        if 'models' in cv_results_dict:
            del cv_results_dict['models']
        cv_results_dict['runtime_mins']     = round((toc - tic)/60, 0)  
        cv_results_dict['arch_name']        = arch_name
        cv_results_dict['first_val_year']   = str(last_train_year+1) 
        cv_results_dict['first_train_year'] = str(np.min(df.index.year))

        #temp_df = hps_yhats_dict['results_df'].copy()
        # ys      = temp_df[temp_df.date.dt.year==2021][lhs_col].values.reshape(-1)
        # yhats   = temp_df[temp_df.date.dt.year==2021].yhats.values
        # yhats   = (yhats>0)*1
        # yhats[yhats==0] = -1
        # cv_results_dict['oos_acc_2021'] = np.sum(ys==yhats)/len(yhats)

#         hps_mse_df_list.append(pd.DataFrame(cv_results_dict, index=[0]))
#         cv_df = pd.concat(hps_mse_df_list, ignore_index=True)

#         timestr = time.strftime("%Y%m%d_%H%M%S")
#         fp = '../4-output/cv-results-' + arch_name +'-' + timestr + '.csv'
#         cv_df.to_csv(fp, index=False)

    return results_list

In [None]:
def labelPortfolioWeights(df):
    ''' Assign portfolio weights from the predicted class of each row.

    Within each date, the rows with `yhat` of -1, 0, and 1 share 1/6, 1/3, and 1/2 of the
    portfolio equally among themselves. Dates whose weights do not sum to one (because a
    class has no rows on that date) are rescaled to sum to one.

    Args:
        df (pd.DataFrame): rows keyed by `date` with a `yhat` column taking values -1, 0, or 1.

    Returns:
        (pd.DataFrame): a copy of `df` sorted by date and yhat with a new `prtfl_wght` column.
    '''
    # share of the portfolio given to each predicted class
    # note: can't go to zero on any as there may be week where that is only yhat
    bottom_tertile = 1/6
    mid_tertile = 1/3
    top_tertile = (1-bottom_tertile-mid_tertile)
    class_shares = ((-1, bottom_tertile), (0, mid_tertile), (1, top_tertile))

    # count the rows in each date-class cell and split the class share equally among them
    df = df.sort_values(by=['date', 'yhat'])
    df['counts']  = 1
    df['total_assets_per_week_tertile'] = df.groupby(['date', 'yhat']).counts.transform('sum')
    for yhat_class, class_share in class_shares:
        in_class = df.yhat == yhat_class
        df.loc[in_class, 'prtfl_wght'] = class_share / df[in_class].total_assets_per_week_tertile

    # clean up
    df = df.drop(['counts', 'total_assets_per_week_tertile'], axis=1)

    # rescale every date whose weights do not already sum to one
    date_totals = df.groupby(['date'])[['prtfl_wght']].sum()
    off_dates = date_totals[~np.isclose(date_totals.prtfl_wght, 1)]
    for date, total_weight in zip(off_dates.index.values, off_dates.prtfl_wght.values):
        on_date = df.index == date
        df.loc[on_date, 'prtfl_wght'] = df[on_date].prtfl_wght * (1/total_weight)

    # confirm portfolio weights roughly sum to 1 for each date
    num_dates = len(np.unique(df.index))
    num_dates_ok = np.sum(np.isclose(df.groupby(['date']).prtfl_wght.sum(), 1,
                                     rtol=1e-2, atol=1e-2))
    assert(num_dates == num_dates_ok)

    return df


In [None]:
def calcAnnualTransactionCosts(df):
    ''' Calculate the annualized transaction cost implied by the changes in portfolio weights.

    Turnover on a date is the sum over assets of the absolute change in weight relative to
    the previous period's holdings. Buying the initial portfolio and liquidating the final
    one are each counted as a turnover of one.

    NOTE(review): holdings are lagged by one day but the result is annualized with 52
    periods per year and the lag column is suffixed `tm7`; confirm the data frequency.
    NOTE(review): rows that exist on only one side of the outer merge get a missing
    turnover, which the sum skips -- confirm that is intended for assets entering/leaving.

    Args:
        df (pd.DataFrame): rows keyed by `date` with `asset` and `prtfl_wght` columns and at
                           most one row per date and asset.

    Returns:
        (float): annualized transaction cost as a non-positive number.
    '''
    # merge on the previous period's holdings to the new holdings
    prev_df = df.copy()
    prev_df = prev_df[prev_df.index<np.max(prev_df.index)]
    prev_df.index = prev_df.index+np.timedelta64(1, 'D')
    prev_df = prev_df[['asset', 'prtfl_wght']]
    prev_df = prev_df.rename(columns={'prtfl_wght': 'prtfl_wght_tm7'})
    df      = df.merge(prev_df,
                       on=['date', 'asset'],
                       how='outer',
                       validate='one_to_one')

    # calc turnover per date and ensure it has the appropriate range: with weights summing
    # to one on both dates it lies in [0, 2]; the small slack absorbs float rounding
    df['asset_to'] = np.abs(df.prtfl_wght - df.prtfl_wght_tm7)
    to_df = df.groupby('date')[['asset_to']].sum().reset_index()
    assert((np.min(to_df.asset_to)>=0) and (np.max(to_df.asset_to)<=2+1e-6))

    # correct the first and last period valid for buying the initial port and liquidating
    to_df.loc[0, 'asset_to'] = 1
    to_df = pd.concat((to_df, pd.DataFrame(data={'date': np.max(prev_df.index.values)+np.timedelta64(1, 'D'),
                                                 'asset_to': 1}, index=[0])))
    to_df = to_df.reset_index(drop=True)

    # add transaction costs assuming maker and taker fee of 20 bps each
    to_df['tc'] = to_df.asset_to*0.002

    # return annualized transaction cost
    return -np.sum(to_df.tc)/(to_df.shape[0]/52)

In [None]:
def calcPortfolioReturn(r_df):
    ''' Calculate the annualized geometric return of a series of per-period simple returns.

    Args:
        r_df (pd.DataFrame): one row per period with simple returns in column `r`; each
                             period is treated as one day when annualizing.

    Returns:
        (float): annualized return, or -1 if any period has a return of -100% or worse.
    '''
    num_days = r_df.shape[0]
    # a total loss in any period wipes out the portfolio, so report -100%
    if np.sum(r_df.r.values <= -1)>=1:
        return -1
    else:
        # np.prod rather than np.product, which was removed in NumPy 2.0
        tot_ret   = np.prod(r_df.r.values+1)-1
        daily_ret = (tot_ret+1)**(1/num_days)-1
        annl_ret  = (daily_ret+1)**(365.25)-1
        return annl_ret

In [None]:
def calcPortfolioSharpe(r_df):
    ''' Annualized Sharpe ratio (zero benchmark) of the returns in column `r` of `r_df`.

    The mean over standard deviation of the returns is scaled by sqrt(365.25).
    '''
    returns = r_df.r.values
    per_period_sharpe = np.mean(returns)/np.std(returns)
    return per_period_sharpe*np.sqrt(365.25)

In [None]:
def max_draw_down(r_df):
    ''' Largest peak-to-trough decline of the cumulative return of column `r` of `r_df`.

    Returns a non-positive number, e.g. -0.5 for a 50% drawdown.
    '''
    # growth of one unit invested at the start
    growth = (r_df.r+1).cumprod()
    # highest value reached so far: the window spans the whole series
    running_peak = growth.rolling(len(growth), min_periods=1).max()
    # value relative to the running peak; its minimum is the deepest drawdown
    relative_to_peak = growth/running_peak
    return relative_to_peak.min() - 1

In [None]:
def max_1_week_loss(r_df):
    ''' Worst compounded return over any 7 consecutive rows of column `r` of `r_df`. '''
    gross_returns = r_df['r']+1
    # compounded growth over each rolling window of 7 rows
    window_growth = gross_returns.rolling(7).apply(np.prod)
    return window_growth.min()-1

In [None]:
def genTestYhats(df, opt_hps, test_year=2022, num_cpus=-1):
    ''' Refit the GBDT for every test-year timestamp on all strictly earlier rows and predict that timestamp.

    Args:
        df (pd.DataFrame): panel with a datetime index containing the target column `y`;
                           every other column is used as a RHS feature.
        opt_hps (dict): hyperparameters forwarded to `fitGBDT`.
        test_year (int): calendar year whose index values are predicted out of sample.
        num_cpus (int): number of joblib workers; -1 uses all available cores.

    Returns:
        (pd.DataFrame): concatenation of the per-date frames returned by `genYhats`.
    '''
    test_dates = np.unique(df[df.index.year == test_year].index.values)

    all_cols = list(df.columns.values)
    lhs_col  = ['y']
    all_cols.remove('y')
    rhs_cols = all_cols

    def loopOverOOSDates(test_date):
        # build data; boolean masks already return new frames so no extra copy is needed
        train_df = df[df.index < test_date]
        oos_df   = df[df.index == test_date]

        # fit and predict
        # keywords guard against argument-order mistakes: fitGBDT takes
        # (train_df, lhs_col, rhs_cols, hps), so hps must not be passed second
        model          = fitGBDT(train_df, lhs_col=lhs_col, rhs_cols=rhs_cols, hps=opt_hps)
        # NOTE(review): the top of this notebook defines genGBDTYhats; confirm genYhats exists
        oos_results_df = genYhats(oos_df, model, lhs_col, rhs_cols)

        return oos_results_df

    # run
    oos_results = Parallel(n_jobs=num_cpus)(delayed(loopOverOOSDates)(test_date) for test_date in tqdm(test_dates))

    # Extract oos results
    oos_results_df = pd.concat(oos_results)

    return oos_results_df

In [None]:
# RUN THE CODE

# Load in the data
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files
input_fp = '../3-data/clean/panel_btceth_30min.pkl'
df = pd.read_pickle(input_fp)

# Clean up the data: index by date and cast every column to float32
df = df.set_index('date')
df = df.astype('float32')

# Select useful variables according to Lasso (target `y` plus 11 features)
df = df[['y',
         'covar_btc_volume_sum_tm288',
         'covar_btc_r_tm36',
         'covar_btc_ema_144_t',
         'covar_btc_ema_288_t',
         'covar_eth_r_tm12',
         'covar_eth_ma_24_t',
         'covar_btc_rsi_tm288',
         'covar_eth_rsi_tm2016',
         'covar_eth_rsi_tm4032',
         'macro_mcap_ret_ath_t',
         'macro_mcap_altcoin_ret_ath_t']]

In [None]:
# Run CV on the selected columns using 20 worker processes
# NOTE(review): runCV is defined outside this section; presumably it trains through
# last_train_year and validates through val_end_year -- confirm against its definition
results_list = runCV(df, last_train_year=2019, val_end_year=2021,
                     arch_name='lasso_longshort', num_cpus=20)


In [None]:
if __name__ == "__main__":
    # step 1

    # step 2

    # step 3

    # TODO: run GBDT on the full feature set, randomly sampling sqrt of the features; then try the 36 columns from lasso, sampling about half of them
    
    # TODO: explore whether normalization helps and make sure there is no forward-looking bias
    # final set of 36 features, selected by experimenting with the lasso code below
    lasso_feats =  ['covar_btc_covar_trades_t_ma_tm8640',
                    'covar_btc_covar_trades_t_min_tm30',
                    'covar_btc_covar_trades_t_min_tm60',
                    'covar_btc_covar_volume_t_min_tm4320',
                    'covar_btc_p_log_t',
                    'covar_btc_r_1min_ema_tm360',
                    'covar_btc_r_5min_skew_tm720',
                    'covar_btc_r_tm1440',
                    'covar_btc_r_tm15',
                    'covar_btc_r_tm86400',
                    'covar_btc_rsi_tm720',
                    'covar_eth_covar_trades_t_max_tm4320',
                    'covar_eth_covar_trades_t_min_tm20',
                    'covar_eth_covar_trades_t_vol_tm60',
                    'covar_eth_covar_volume_t_ma_tm60',
                    'covar_eth_covar_volume_t_ma_tm8640',
                    'covar_eth_covar_volume_t_max_tm4320',
                    'covar_eth_covar_volume_t_max_tm720',
                    'covar_eth_covar_volume_t_sum_tm20',
                    'covar_eth_covar_volume_t_sum_tm60',
                    'covar_eth_covar_volume_t_vol_tm360',
                    'covar_eth_p_t',
                    'covar_eth_r_1min_ema_tm60',
                    'covar_eth_r_1min_ma_tm360',
                    'covar_eth_r_1min_vol_tm20',
                    'covar_eth_r_1min_vol_tm5',
                    'covar_eth_r_5min_ma_tm20160',
                    'covar_eth_r_5min_ma_tm60',
                    'covar_eth_r_5min_min_tm30',
                    'covar_eth_r_5min_min_tm720',
                    'covar_eth_r_cummax_t',
                    'covar_eth_r_cummin_t',
                    'covar_eth_r_tm120',
                    'covar_eth_r_tm1440',
                    'covar_eth_r_tm40320',
                    'covar_eth_volume_t']

In [None]:
# snapshot of the CV results; .copy() is shallow, so the contained result
# objects (frames, models) are still shared with results_list
results_list_master = results_list.copy()

In [None]:
# # SAVE RESULTS
# with open('../3-data/derived/validation_2021_results_btceth_ols.pickle', 'wb') as handle:
#     pickle.dump(results_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# study for signal: for each CV result, sweep a long/short cutoff over every
# predicted value and plot the per-period geometric return by cutoff rank

# load the raw panel once; it is identical for every iteration
# (this rebinds the global `df` to the raw, un-indexed panel)
df = pd.read_pickle(input_fp)

for i in [6, 5, 4, 3, 2, 1, 0]:
    print(i)
    val_results_df = results_list[i]['results_df']
    val_results_df = val_results_df.drop('y', axis=1)
    val_results_df = val_results_df.merge(df,
                                          on=['date'],
                                          how='inner',
                                          validate='one_to_one')
    val_df = pd.DataFrame(data={'date': val_results_df.date.values,
                                'y': val_results_df.y.values,
                                'y_btc_eth_r_tp2_tp7': val_results_df['y_btc_eth_r_tp2_tp7'].values,
                                'yhat': val_results_df.yhats.values})

    # rescale predictions to [0, 1]
    val_df['yhat'] = (val_df.yhat-np.min(val_df.yhat))
    val_df['yhat'] = val_df.yhat/np.max(val_df.yhat)
    for lev in [2, 3, 4, 5]:
        for fee in [1e-4, 2e-4, 5e-4]:
            print(lev)
            print(fee)
            yhat_vals = val_df.yhat.values
            for yhat_val in yhat_vals:
                # short at or below the cutoff, long above it
                val_df.loc[val_df.yhat<=yhat_val, 'pw'] = -1
                val_df.loc[val_df.yhat>yhat_val, 'pw'] = 1
                assert(0==val_df.pw.isnull().sum())
                # leverage scales the period return, not the gross (1+r) factor:
                # the gross factor per period is 1 + lev*position*r - fee
                port_ret = np.prod(1+lev*val_df.pw*val_df.y_btc_eth_r_tp2_tp7-fee)**(1/len(val_df))-1
                val_df.loc[val_df.yhat==yhat_val, 'ret'] = port_ret

            val_df = val_df.drop('pw', axis=1)

            # replace yhat by its rank fraction so the x-axis is the cutoff quantile
            val_df = val_df.sort_values(by='yhat')
            val_df['yhat'] = np.arange(len(val_df))/len(val_df)
            # NOTE(review): plt is not among the imports in the notebook's first cell -- confirm it is imported
            plt.plot(val_df.yhat, val_df.ret)
            plt.show()

In [None]:
# bucket predictions into `bins` equal-count quantile groups (labelled 0..bins-1)
# and plot the mean of y_btc_eth_r_tp2_tp7 within each group
bins = 10
val_df['yhat_decile'] = pd.qcut(val_df.yhat, q=bins, labels=np.arange(bins))
val_df.groupby('yhat_decile')['y_btc_eth_r_tp2_tp7'].mean().plot()

In [None]:
# study feature importance: average the importances over all fitted models of
# the first CV result and split the features at the median importance

# NOTE(review): assumes the first two columns of df are not model features
# (df.shape[1]-2 importances, names taken from df.columns[2:]) -- confirm
feat_imprt = np.zeros(df.shape[1]-2)
models = results_list[0]['models']
for model in models:
    feat_imprt += model.feature_importances_
# mean over models: divide by the number of models summed (was len(models)-1)
feat_imprt = feat_imprt/len(models)
print(feat_imprt)

# useful features:
print('useful features:')
threshold = np.median(feat_imprt)
for i in np.where(feat_imprt > threshold)[0]:
    print(df.columns[2:][i])
print('\nuseless features:')
for i in np.where(feat_imprt <= threshold)[0]:
    print(df.columns[2:][i])

In [None]:
# form validation period results from the first CV result:
# drop its own `y`, join the panel columns back on `date` (one row per date on
# both sides, enforced by validate), and keep only the columns studied below
# NOTE(review): requires `df` to still have `date` as a column and to contain
# y_btc_eth_r_tp2_tp7, i.e. the raw panel rather than the indexed, column-selected frame -- confirm
val_results_df = results_list[0]['results_df']
val_results_df = val_results_df.drop('y', axis=1)
val_results_df = val_results_df.merge(df,
                                      on=['date'],
                                      how='inner', 
                                      validate='one_to_one')
val_df = pd.DataFrame(data={'date': val_results_df.date.values,
                            'y': val_results_df.y.values,
                            'y_btc_eth_r_tp2_tp7': val_results_df['y_btc_eth_r_tp2_tp7'].values,
                            'yhat': val_results_df.yhats.values})

In [None]:
# Very nice!
# mean of y_btc_eth_r_tp2_tp7 within each decile of the predictions (labels 0..9)
val_df['yhat_decile'] = pd.qcut(val_df.yhat, q=10, labels=np.arange(10))
val_df.groupby('yhat_decile')['y_btc_eth_r_tp2_tp7'].mean().plot()

In [None]:
# Explore different trading rules: grid search over leverage and long/short
# decile cutoffs, printing a report each time the overall return improves
trading_cost = 0.001

max_return = 0

# grab yhats
yhats = val_df.yhat.values

# decile bins of the negative and non-negative predictions; they depend only on
# yhats, so compute them once instead of in every one of the 900 iterations
neg_yhats = yhats[yhats<0]
pos_yhats = yhats[yhats>=0]
neg_bins  = pd.qcut(neg_yhats, q=10).categories
pos_bins  = pd.qcut(pos_yhats, q=10).categories

# loop over leverage
for leverage in [2, 3, 4, 5, 6, 7, 8, 9, 10]:
    for long_threshold in range(10):
        for short_threshold in range(10):
            # figure out thresholds: right edge of the chosen negative bin and
            # left edge of the chosen non-negative bin
            neg_threshold = neg_bins[short_threshold].right
            pos_threshold = pos_bins[long_threshold].left

            # turn yhats into actual long or short position
            pos = np.piecewise(yhats, [yhats<=neg_threshold,
                                       (yhats>neg_threshold)&(yhats<=pos_threshold),
                                       yhats>pos_threshold],
                               [-1, 0, 1])

            # figure out places to charge me
            # by finding indices where previous value is different and the index value is not 0
            # (the first period is always charged)
            trading_fee_array = np.concatenate((np.array([trading_cost]),
                                               ((pos[1:]!=pos[:-1])&(pos[1:]!=0))*1*trading_cost))

            # calc returns after cost
            returns = pos*val_df.y_btc_eth_r_tp2_tp7*leverage
            returns = returns - trading_fee_array

            # report
            overall_return = np.prod(returns+1)-1
            if overall_return > max_return:
                max_return = overall_return

                print('leverage '+str(leverage))
                print('long thres '+str(long_threshold))
                print('short threshold '+str(short_threshold))

                print('overal return '+str(np.round(overall_return, 4)))
                # compounded return per block of int(2*24*30.5) half-hour rows (about a month)
                # NOTE(review): 10272 is hard-coded; presumably the number of validation rows -- confirm
                start_index = 0
                for i in range(int(2*24*30.5), 10272, int(2*24*30.5)):
                    end_index = i
                    month_returns = returns[start_index:end_index]
                    month_returns = np.prod(month_returns+1)-1

                    start_index = i
                    print('months return ' +str(np.round(month_returns, 4)))

                # report return adjusted by std of negative returns
                std_neg_returns = np.std(returns[returns<0])
                print('adjusted sharpe '+str(np.round(overall_return/std_neg_returns,4)))

                # max dd
                cumulative_ret=(returns+1).cumprod()
                roll_max=cumulative_ret.rolling(len(cumulative_ret), min_periods=1).max()
                dd=cumulative_ret/roll_max
                max_dd=dd.min() - 1
                print('max dd '+str(np.round(max_dd, 4)))

                # sharpe, annualized with 2*24 half-hour periods per day
                sharpe = np.mean(returns)/np.std(returns)*np.sqrt(365.25*24*2)
                print('sharpe '+str(np.round(sharpe, 4)))
                print('\n\n')

In [None]:
# TODO THERE IS MORE TO GRAB FROM v-benchmark_gbdt_btceth_ls.ipynb as needed

In [None]:
# for year in [2016, 2017, 2018, 2019, 2020, 2021]:
#     lasso_df = df[df.date.dt.year == year].copy()

#     # set lasso parameters
#     rhs_feats = list(included_feats)
#     lhs_col   = 'y_btc_eth_diff_r_tp5_tp370'
#     alpha     = 0.001

#     # scale RHS
#     y = lasso_df[lhs_col].values
#     X = lasso_df[rhs_feats].values
#     X_scaled = StandardScaler().fit_transform(X)

#     # fit
#     model = Lasso(alpha=alpha, fit_intercept=False)
#     model.fit(X_scaled, y)

#     # show selected feats
#     lasso_feats = [rhs_feats[i] for i in np.nonzero(model.coef_)[0]]
#     print(year)
#     print(lasso_feats)

In [None]:

# TODO MOVE ANY FUNCTIONS THAT I MAY USE ELSEWHERE TO COMMON FOLDER SHARED ACROSS PROJECTS THAT I JUST IMPORT WHEN NEEDED