In [189]:
# Make the local quant_tools code importable: its directory is not on the default module search path, so add it to sys.path.
import sys
sys.path.insert(0, '/home/adam/Dropbox/2-creations/2-crafts/7-buidl/0-utils/quant_tools/code')

# IMPORT PACKAGES
from dateutil.relativedelta import relativedelta
from tensorflow.keras.models import Sequential
from keras.callbacks import EarlyStopping
from tensorflow.keras import Model, regularizers
from tensorflow.keras import initializers
from tensorflow.keras.layers import Dense, BatchNormalization, Dot
from keras.models import Model
from typing import Dict, List
from tensorflow import keras
from tools import QuantTools
import tensorflow as tf
import pandas as pd
import numpy as np
import itertools
import random
import pickle
import time
import gc

# Set the global Keras dtype policy to "mixed_float16" for the models built in this notebook.
# NOTE(review): the Keras mixed-precision guide recommends keeping the model's output
# layer in float32 for numeric stability -- confirm whether buildAutoencoder should do so.
keras.mixed_precision.set_global_policy("mixed_float16")


In [190]:
def dropRowsColsAndNormalizeForCA(df: pd.DataFrame, lhs_col: str) -> pd.DataFrame:
    """Subset the panel to the autoencoder inputs and rank-normalize the characteristics.

    Args:
        df: panel with a datetime `date` column, an `asset` column, `lhs_col`
            and the `char_*` columns listed in `selected_rhs` below.
        lhs_col: name of the left-hand-side (target) return column.

    Returns:
        A new DataFrame with columns `date`, `asset`, `lhs_col` and the selected
        characteristics, where every characteristic has been replaced by its
        within-date rank scaled to [0, 1]. The input frame is not modified.

    Raises:
        AssertionError: if the normalized characteristics do not span exactly
            [0, 1] or the target falls outside (-1, 2].
    """
    # Drop 2018-2019: those years do not have enough assets.
    # NOTE(review): an earlier comment also mentioned dropping 2022 (out of sample),
    # but no 2022 filter is applied here -- confirm that is intended.
    df = df[~df.date.dt.year.isin([2018, 2019])].reset_index(drop=True)

    # Set characteristics of interest
    selected_rhs = ['char_addr_new_log_delta_tm2_tm1',
        'char_delta_flow_dist_tm1h',
        'char_exchange_inflow_tm1h',
        'char_exchange_prct_circ_supply_t',
        'char_sent_volume_consumed_tm1',
        'char_r_tm1h',
        'char_r_tm2h',
        'char_r_industry_tm6h',
        'char_var5_tm1',
        'char_var5_tm7',
        'char_shortfall5_tm90',
        'char_vol_tm6h']

    # Cut to the columns of interest; copy so the assignment below writes to an
    # independent frame instead of a slice (avoids pandas' chained-assignment warning).
    df = df[['date', 'asset', lhs_col] + selected_rhs].copy()

    # Normalize characteristics to be between 0 and 1 within each date:
    # (rank - 1) / (non-missing count - 1). The groupby is built once for all columns.
    chars_by_date = df.groupby('date')[selected_rhs]
    ranks = chars_by_date.rank()
    counts = chars_by_date.transform('count')
    df[selected_rhs] = (ranks - 1) / (counts - 1)

    # Assert range
    assert 0 == df[selected_rhs].min().min()
    assert 1 == df[selected_rhs].max().max()
    assert -1 < df[lhs_col].min()
    assert 2 >= df[lhs_col].max()

    # NOTE: keep obs to RHS ratio roughly 4e4:1

    # NOTE: for any macro, take cartesian product with characteristics to make it characteristic level
    # -or do the reg thing to reduce it down to same dim as number of assets

    return df


In [191]:
def formPortfolioReturnCovariates(df: pd.DataFrame, lhs_col: str) -> pd.DataFrame:
    """Append characteristic-managed portfolio returns as date-level covariates.

    For every date t the cross-sectional regression coefficients
    x_{t+1} = (Z_t' Z_t)^{-1} Z_t' r_{t+1} are computed, where Z_t holds the
    characteristic columns and r_{t+1} the `lhs_col` values of that date. Every
    row of the date receives the same vector in new `x_*` columns.

    Args:
        df: panel with `date`, `asset`, `lhs_col` and characteristic columns;
            every column other than `date`, `asset` and `lhs_col` is treated as
            a characteristic.
        lhs_col: name of the target return column used as r_{t+1}.

    Returns:
        A date-sorted copy of `df` with one `x_<name>` column per characteristic.
        The input frame is not modified.

    Raises:
        numpy.linalg.LinAlgError: if Z_t' Z_t is singular for some date.
    """
    # Sorting returns a new frame, so the caller's frame is left untouched
    df = df.sort_values(by='date')

    # Form new covariate names: 'char_foo' -> 'x_foo'
    characteristics = [col for col in df.columns if col not in ('date', 'asset', lhs_col)]
    new_covars = ['x_' + (char[len('char_'):] if char.startswith('char_') else char)
                  for char in characteristics]

    # Pull the data out once; the per-date work below is then pure NumPy
    z_all = df[characteristics].to_numpy(dtype=float)
    r_all = df[lhs_col].to_numpy(dtype=float)
    x_all = np.full((len(df), len(characteristics)), np.nan)

    # GroupBy.indices maps each date to the positional row numbers of its rows,
    # which avoids re-scanning the whole frame with a boolean mask for every date
    for row_positions in df.groupby('date', sort=False).indices.values():
        z_t = z_all[row_positions]
        r_tp1 = r_all[row_positions]

        # Solve the normal equations directly instead of forming an explicit inverse
        x_tp1 = np.linalg.solve(z_t.T @ z_t, z_t.T @ r_tp1)

        # Every row of this date gets the same portfolio-return vector
        x_all[row_positions] = x_tp1

    for j, covar_name in enumerate(new_covars):
        df[covar_name] = x_all[:, j]

    return df


In [192]:
def buildAutoencoder(b_covars, x_covars,
    number_hidden_layer, l1_penalty, weight_initializer, bias_initializer, number_factor, learning_rate):
    """Build and compile the two-branch (beta x factor) autoencoder.

    Args:
        b_covars: names of the beta-side input columns; only the count is used.
        x_covars: names of the factor-side input columns; only the count is used.
        number_hidden_layer: number of hidden Dense+BatchNormalization pairs in
            the beta branch; widths are 16, 8, 4, ... (halving, floored at 1).
        l1_penalty: L1 kernel regularization strength for the hidden layers.
        weight_initializer: kernel initializer used for every Dense layer.
        bias_initializer: bias initializer used for every Dense layer.
        number_factor: output width of both branches.
        learning_rate: Adam learning rate.

    Returns:
        A compiled Keras model taking [beta_inputs, factor_inputs] and returning
        the row-wise dot product of the two branch outputs.
    """
    # Build the betas model from the time t covariates
    model_b = tf.keras.models.Sequential()
    model_b.add(tf.keras.Input(shape=(len(b_covars),)))
    for j in range(number_hidden_layer):
        # Integer width: the previous float expression 16*1/(2**j) relied on Keras
        # casting units to int and produced 0 units once j >= 5.
        hidden_units = max(1, 16 // (2 ** j))
        model_b.add(Dense(hidden_units, activation='relu',
                        kernel_regularizer=regularizers.l1(l1=l1_penalty),
                        kernel_initializer=weight_initializer,
                        bias_initializer=bias_initializer))
        model_b.add(BatchNormalization())
    model_b.add(Dense(number_factor, activation='linear',
                    kernel_initializer=weight_initializer,
                    bias_initializer=bias_initializer))

    # Form the x model from time t plus 1 returns
    model_x = tf.keras.models.Sequential()
    model_x.add(tf.keras.Input(shape=(len(x_covars),)))
    model_x.add(Dense(number_factor, activation='linear',
                    kernel_initializer=weight_initializer,
                    bias_initializer=bias_initializer))

    # Form the dot product output for the combination of the two neurals
    # NOTE(review): under the global mixed_float16 policy this output is float16;
    # consider dtype='float32' on the final layer -- confirm against the Keras guide.
    mergedOut = Dot(axes=(1,1))([model_b.output, model_x.output])

    # Form the entire model. tf.keras.Model is referenced explicitly because the
    # bare name `Model` is imported twice in this file (tensorflow.keras and keras),
    # while every layer used here comes from tensorflow.keras.
    model = tf.keras.Model([model_b.input, model_x.input], mergedOut)

    # Compile the model; the 'mse' metric is what EarlyStopping monitors as 'val_mse'
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                    loss='mean_squared_error',
                    metrics=['mse'])

    return model

def fitAutoencoder(train_df: pd.DataFrame,
    hps_dict: dict, lhs_col: str='r_ex_tp1', rhs_cols: list=None,
    val_df: pd.DataFrame=None, return_train_r2: bool=False) -> tuple:
    """Fit an ensemble of autoencoders on the training panel.

    Args:
        train_df: training panel containing `date`, `lhs_col` and `rhs_cols`.
        hps_dict: hyperparameters; reads the keys 'num_hidden_layers',
            'number_factors', 'learning_rates', 'l1_penalties', 'batch_sizes',
            'num_ensemble', 'epochs' and 'early_stopping'.
        lhs_col: name of the target column.
        rhs_cols: input columns; names starting with 'char' feed the beta
            branch and names starting with 'x_' feed the factor branch.
        val_df: validation panel; required when early stopping is enabled.
        return_train_r2: if True, also compute the in-sample predictive R^2
            from the first date of the second calendar month onwards.

    Returns:
        (models, mean number of epochs actually trained, train predictive R^2);
        the R^2 is 0 when `return_train_r2` is False.

    Raises:
        ValueError: if early stopping is requested without `val_df`, or the
            training panel has no rows in the month after its first month.
    """
    rhs_cols = [] if rhs_cols is None else list(rhs_cols)

    # Obtain beta and factor side covariates
    b_covars = [covar for covar in rhs_cols if covar[:4] == 'char']
    x_covars = [covar for covar in rhs_cols if covar[:2] == 'x_']
    assert set(rhs_cols) == (set(b_covars).union(set(x_covars)))

    # Extract the hyperparameters
    number_hidden_layer = hps_dict['num_hidden_layers']
    number_factor       = hps_dict['number_factors']
    learning_rate       = hps_dict['learning_rates']
    l1_penalty          = hps_dict['l1_penalties']
    batch_size          = hps_dict['batch_sizes']
    number_ensemble     = hps_dict['num_ensemble']
    epoch               = hps_dict['epochs']
    early_stopping      = hps_dict['early_stopping']

    assert(number_ensemble <= 10), 'whatcha think you got infinite come pew ters'
    if early_stopping and val_df is None:
        raise ValueError('early_stopping is enabled but no val_df was passed to monitor val_mse on')

    # The input and output data are identical for every ensemble member
    train_b = train_df[b_covars]
    train_x = train_df[x_covars]
    train_y = train_df[[lhs_col]]
    if val_df is not None:
        val_b = val_df[b_covars]
        val_x = val_df[x_covars]
        val_y = val_df[[lhs_col]]

    # Loop over the ensembles to build models for each
    models = []
    num_epochs_trained = []
    for i in range(number_ensemble):
        # According to which model in the ensemble it is, initialize parameters.
        # A private Random(i*42) draws the same pair as seeding the global
        # generator would, without clobbering the global `random` state.
        initializer_list = [initializers.HeNormal(seed=i),
                            initializers.GlorotUniform(seed=i),
                            initializers.RandomUniform(seed=i)]
        weight_initializer, bias_initializer = random.Random(i*42).sample(initializer_list, 2)

        # Build the model
        model = buildAutoencoder(b_covars, x_covars, number_hidden_layer,
            l1_penalty, weight_initializer, bias_initializer, number_factor, learning_rate)

        # Fit the model
        with tf.device('/GPU:0'):
            if early_stopping:
                es = EarlyStopping(monitor='val_mse', mode='min', verbose=0, patience = 5)
                history = model.fit(x=[train_b, train_x], y=train_y,
                            batch_size=batch_size,
                            validation_data=([val_b, val_x], val_y),
                            epochs=epoch, verbose=0, callbacks=[es])
            else:
                history = model.fit(x=[train_b, train_x], y=train_y,
                            batch_size=batch_size,
                            epochs=epoch, verbose=0)

        # Count the epochs that actually ran. EarlyStopping.stopped_epoch is a
        # 0-based index and stays 0 when training is never stopped early, so it
        # under-reports (or reports 0 for a full-length run).
        num_epochs_trained.append(len(history.history['loss']))
        models.append(model)

    if return_train_r2:
        # build the time window for the training data to predict on: it starts at the
        # first date in the calendar month after the first training month
        # (rolling over into January of the next year after a December start)
        first_ts = pd.Timestamp(np.min(train_df.date.values))
        target_month = first_ts.month % 12 + 1
        target_year = first_ts.year + first_ts.month // 12
        in_target_month = ((train_df.date.dt.year==target_year)
                            & (train_df.date.dt.month==target_month))
        if not in_target_month.any():
            raise ValueError('train_df has no rows in the month following its first month; '
                             'cannot compute the training r2_pred')
        oos_start_date = np.min(train_df[in_target_month].date.values)
        oos_end_date   = np.max(train_df.date.values)

        # fit on the training data for all models to report the r2_pred
        train_yhats = genAutoencoderYhats(models, train_df, rhs_cols, oos_start_date, oos_end_date, number_factor)
        train_ys    = train_df[train_df.date>=oos_start_date][lhs_col].values
        train_r2_pred = 1-(np.mean(np.square(train_ys-train_yhats)))/(np.mean(np.square(train_ys)))
    else:
        train_r2_pred = 0

    return models, np.mean(num_epochs_trained), train_r2_pred

def genAutoencoderYhats(models, in_df, rhs_cols, oos_start_date, oos_end_date, number_factor) -> np.array:
    """Predict returns for the out-of-sample window as beta_hat . lambda_hat.

    For each model, beta_hat is the output of the beta branch on the OOS rows and
    lambda_hat for an OOS date is the row-weighted average of the factor-branch
    outputs over all rows dated from 30 days before `oos_start_date` up to (but
    excluding) that date. Forecasts are averaged across the ensemble.

    Args:
        models: fitted models whose last three layers are, in order, the beta
            output layer, the factor output layer and the dot product.
        in_df: panel with `date` and `rhs_cols`; it must contain the 30 days
            before `oos_start_date`. It is expected to be sorted by date, as the
            callers in this file provide it.
        rhs_cols: input columns ('char*' -> beta side, 'x_*' -> factor side).
        oos_start_date: first datetime to predict (inclusive).
        oos_end_date: last datetime to predict (inclusive).
        number_factor: number of latent factors of the models.

    Returns:
        1-D array of forecasts aligned with the rows of `in_df` that fall in
        [oos_start_date, oos_end_date]; empty if no row falls in that window.

    Raises:
        ValueError: if `in_df` has no rows in the 30-day lookback window.
    """
    # Obtain beta and factor side covariates
    b_covars = [covar for covar in rhs_cols if covar[:4] == 'char']
    x_covars = [covar for covar in rhs_cols if covar[:2] == 'x_']
    assert set(rhs_cols) == (set(b_covars).union(set(x_covars)))

    # Obtain the oos data
    oos_df = in_df[(in_df.date >= oos_start_date) & (in_df.date <= oos_end_date)]
    if oos_df.shape[0] == 0:
        return np.zeros(0)
    oos_b = oos_df[b_covars]
    oos_x = oos_df[x_covars]

    # Rows of the month before the oos window seed the running factor average
    lookback_start = oos_start_date - np.timedelta64(30, 'D')
    lookback_x = in_df[(in_df.date >= lookback_start) & (in_df.date < oos_start_date)][x_covars]
    num_lookback_rows = lookback_x.shape[0]
    if num_lookback_rows == 0:
        raise ValueError('in_df has no rows in the 30 days before oos_start_date; '
                         'cannot form the factor averages')

    # Everything date-related is identical across models, so compute it once:
    # date_codes[r] is the position of row r's date within the sorted oos_dates
    oos_dates, date_codes, rows_per_date = np.unique(
        oos_df.date.values, return_inverse=True, return_counts=True)
    date_codes = date_codes.reshape(-1)
    # number of rows strictly before each oos date (lookback rows + earlier oos dates)
    rows_before_date = num_lookback_rows + np.cumsum(rows_per_date) - rows_per_date

    # Form each model's results
    b_hats = np.zeros((oos_df.shape[0], number_factor, len(models)))
    lambda_hats = np.zeros((oos_df.shape[0], number_factor, len(models)))
    for i, model in enumerate(models):
        # Sub-models mapping each input to its branch output
        beta_layer = model.layers[-3]
        factor_layer = model.layers[-2]
        assert(beta_layer.output.shape[1] == number_factor)
        assert(factor_layer.output.shape[1] == number_factor)
        b_hat_layer = tf.keras.Model(inputs=model.input[0], outputs=beta_layer.output)
        f_hat_layer = tf.keras.Model(inputs=model.input[1], outputs=factor_layer.output)

        # Form the beta hats
        b_hats[:,:,i] = b_hat_layer.predict(oos_b, verbose=0)

        # Estimate this model's f hats; accumulate in float64 because the
        # network may emit float16 under the mixed-precision policy
        lookback_f = f_hat_layer.predict(lookback_x, verbose=0).astype('float64')
        oos_f = f_hat_layer.predict(oos_x, verbose=0).astype('float64')
        # sanity check: the factor inputs are date-level, so the first two rows
        # (same date in a date-sorted panel) must give the same factor estimate
        if lookback_f.shape[0] > 1:
            assert(all(np.isclose(lookback_f[0,:], lookback_f[1,:])))
        if oos_f.shape[0] > 1:
            assert(all(np.isclose(oos_f[0,:], oos_f[1,:])))

        # Sum of f hats per oos date, then the running sum over all earlier rows
        f_sum_per_date = np.zeros((len(oos_dates), number_factor))
        np.add.at(f_sum_per_date, date_codes, oos_f)
        f_sum_earlier_dates = np.vstack([np.zeros((1, number_factor)),
                                         np.cumsum(f_sum_per_date, axis=0)[:-1]])
        f_sum_before_date = lookback_f.sum(axis=0) + f_sum_earlier_dates

        # Time-series average up to each oos date, broadcast back to its rows
        lambda_by_date = f_sum_before_date / rows_before_date[:, None]
        lambda_hats[:, :, i] = lambda_by_date[date_codes]

    # Form model predictions of beta hats times lambda hats where
    #     we take dot product between two factor length vectors for all time periods and models
    #     and then average each model's forecast to return a vector of length of oos dataframe
    yhats = np.mean(np.sum(b_hats * lambda_hats, axis=1), axis=1)

    return yhats


In [193]:
def runCV(df: pd.DataFrame, asset_universe_dict: Dict[str, List],
    val_start_date: str, val_end_date: str, test_start_date: str, lhs_col: str,
    hp_grid: Dict[str, list], periods_in_year: int,
    cv_out_fp: str, arch_name: str) -> List[dict]:
    """Walk-forward cross-validation over a hyperparameter grid.

    The validation period (all dates from `val_start_date` up to, but excluding,
    `test_start_date`) is split into windows that start at each Sunday midnight,
    plus a leading partial window if the period does not start on one. For every
    hyperparameter combination the model is refit before each window and used
    to predict that window.

    Args:
        df: panel with `date`, `asset`, `lhs_col` and the covariate columns.
        asset_universe_dict: maps 'YYYY-MM-01' to the assets tradable that month.
        val_start_date: first validation date.
        val_end_date: only recorded in the results; it is not used to filter data.
        test_start_date: all rows on or after this date are excluded.
        lhs_col: name of the target column.
        hp_grid: maps hyperparameter name to the list of values to try.
        periods_in_year: number of return periods per year, for annualization.
        cv_out_fp: path prefix for the results CSV written after every combination.
        arch_name: architecture label stored in the results and the file name.

    Returns:
        One result dict per hyperparameter combination.

    Raises:
        ValueError: if `df` has no rows in the validation period.
    """
    # Subset to relevant data
    df = df[df.date < test_start_date].copy()

    # Initialize cv result objects
    results_list = []

    # Determine RHS columns
    rhs_cols = [col for col in df.columns if col not in ('date', 'asset', lhs_col)]

    # Determine validation datetimes to loop over and datetimes to refit at
    val_dates = df[df.date >= val_start_date].date
    if val_dates.empty:
        raise ValueError('df has no rows between val_start_date and test_start_date')
    first_val_date = np.min(val_dates.values)
    val_sun_midnights = np.unique(val_dates[(val_dates.dt.hour==0)
                                            & (val_dates.dt.day_of_week==6)].values)

    # If the period does not start exactly on a Sunday midnight, add a leading
    # partial window running up to the first Sunday midnight (or to the end of
    # the period when it contains none)
    val_dts_dict = {}
    if len(val_sun_midnights) == 0:
        val_dts_dict[first_val_date] = np.unique(val_dates.values)
    elif first_val_date < val_sun_midnights[0]:
        val_dts_dict[first_val_date] = np.unique(
            val_dates[val_dates < val_sun_midnights[0]].values)

    # Complete the dictionary with all the sundays as keys as the dates until the next sunday as the values
    for val_sun_midnight in val_sun_midnights:
        next_sun_midnight = val_sun_midnight + np.timedelta64(7, 'D')
        val_dts_dict[val_sun_midnight] = np.unique(
            val_dates[(val_dates >= val_sun_midnight) & (val_dates < next_sun_midnight)].values)

    # Loop over hp combinations
    hp_names = list(hp_grid.keys())
    for hps in itertools.product(*hp_grid.values()):

        # Start the timer
        tic = time.perf_counter()

        # Create hp dictionary and other objects for this iteration
        hps_dict = dict(zip(hp_names, hps))
        hps_results_dict = hps_dict.copy()
        weekly_frames = []

        # Report on progress
        print(hps_dict)

        # Initiate lists for results and start the loop over the val dates to fit and predict
        avg_num_epochs_trained_list = []
        train_r2_pred_list = []
        for val_datetime_start, window_dates in val_dts_dict.items():
            print(val_datetime_start)
            # form end of this window
            val_datetime_end = np.max(window_dates)

            # form appropriate asset universe
            first_day_of_month_for_current_val_dt = np.datetime_as_string(val_datetime_start, unit='M')+'-01'
            asset_universe = asset_universe_dict[first_day_of_month_for_current_val_dt]

            # form relevant date-assets given asset universe and the train and val dataframes
            # (boolean indexing already returns new frames, so no extra copies are taken)
            rel_df = df[(df.date<=val_datetime_end) & (df.asset.isin(asset_universe))]
            train_df = rel_df[rel_df.date<val_datetime_start]
            val_df = rel_df[rel_df.date>=val_datetime_start]

            # fit and predict
            models, avg_num_epochs_trained, train_r2_pred = fitAutoencoder(
                train_df, hps_dict, lhs_col, rhs_cols, val_df, return_train_r2=False)
            yhats = genAutoencoderYhats(models, rel_df, rhs_cols,
                val_datetime_start, val_datetime_end, hps_dict['number_factors'])
            del rel_df, train_df, models
            gc.collect()

            # save the results; frames are collected in a list and concatenated once
            # after the loop instead of re-copying the accumulated frame every week
            avg_num_epochs_trained_list.append(avg_num_epochs_trained)
            train_r2_pred_list.append(train_r2_pred)
            temp_yhats_df = val_df[['date', 'asset', lhs_col]].copy()
            temp_yhats_df['yhats'] = yhats
            weekly_frames.append(temp_yhats_df)

            # output this week r2_pred and return
            val_week_df = temp_yhats_df.sort_values(by=['date', 'asset'], ignore_index=True)
            val_week_df = QuantTools.formPortfolioPositionsQuantileLongShort(val_week_df, 5)
            val_week_df['returns'] = val_week_df.position*val_week_df[lhs_col]
            val_week_y = val_week_df[lhs_col].values
            val_week_yhats = val_week_df['yhats'].values
            val_week_returns = val_week_df[val_week_df.position !=0].groupby('date')['returns'].sum().values
            val_week_r_2_pred = 1-np.mean(np.square(val_week_y - val_week_yhats))/np.mean(np.square(val_week_y))
            print(f'this week r 2 pred: {val_week_r_2_pred}')
            print(f'this week geom avg ret {QuantTools.calcGeomAvg(val_week_returns)}')

        # Stop the timer
        toc = time.perf_counter()

        # For this hp, add metadata to results dict
        hps_results_dict['avg_epochs_trained'] = np.mean(avg_num_epochs_trained_list)
        hps_results_dict['val_start_date'] = val_start_date
        hps_results_dict['val_end_date'] = val_end_date
        hps_results_dict['arch_name'] = arch_name
        hps_results_dict['runtime'] = round((toc - tic)/60, 0)

        # For this hp, add training period statistics
        hps_results_dict['train_r2_pred_min'] = np.min(train_r2_pred_list)
        hps_results_dict['train_r2_pred_mean'] = np.mean(train_r2_pred_list)
        hps_results_dict['train_r2_pred_max'] = np.max(train_r2_pred_list)

        # For this hp, obtain the yhats and ys and positions
        val_y_yhats_df = pd.concat(weekly_frames) if weekly_frames else pd.DataFrame()
        val_y_yhats_df = val_y_yhats_df.dropna()
        val_y_yhats_df = val_y_yhats_df.sort_values(by=['date', 'asset'], ignore_index=True)
        val_y_yhats_pos_df = QuantTools.formPortfolioPositionsQuantileLongShort(val_y_yhats_df, 5)
        val_yhats = val_y_yhats_pos_df.yhats.values
        val_ys    = val_y_yhats_pos_df[lhs_col].values
        val_y_yhats_pos_df['returns'] = val_y_yhats_pos_df.position*val_y_yhats_pos_df[lhs_col]
        returns = val_y_yhats_pos_df[val_y_yhats_pos_df.position != 0].groupby('date')['returns'].sum().values
        assert len(val_yhats) == len(val_ys)

        # For this hp, form validation period statistics
        hps_results_dict['val_mse']       = np.mean(np.square(val_ys-val_yhats))
        hps_results_dict['val_r2_pred']   = 1-np.mean(np.square(val_ys-val_yhats))/np.mean(np.square(val_ys))
        hps_results_dict['val_yhat_min']  = np.min(val_yhats)
        hps_results_dict['val_yhat_q1']   = np.quantile(val_yhats, q=0.25)
        hps_results_dict['val_yhat_q2']   = np.quantile(val_yhats, q=0.5)
        hps_results_dict['val_yhat_mean'] = np.mean(val_yhats)
        hps_results_dict['val_yhat_q3']   = np.quantile(val_yhats, q=0.75)
        hps_results_dict['val_yhat_max']  = np.max(val_yhats)
        hps_results_dict['geom_mean_1h']  = QuantTools.calcGeomAvg(returns)
        hps_results_dict['sharpe_annual'] = QuantTools.calcSharpe(returns, periods_in_year=periods_in_year)
        hps_results_dict['sd_annual']     = QuantTools.calcSD(returns, periods_in_year=periods_in_year)
        hps_results_dict['max_dd']        = QuantTools.calcMaxDrawdown(returns)
        hps_results_dict['avg_turnover']  = QuantTools.calcTSAvgTurnover(val_y_yhats_pos_df)

        # Save results to return
        results_list.append(hps_results_dict)

        # For this hp, save results to csv (all combinations so far, timestamped file)
        cv_df = pd.DataFrame(results_list)
        timestr = time.strftime("%Y%m%d_%H%M%S")
        fp = cv_out_fp + '-' + arch_name + '-' + timestr + '.csv'
        cv_df.to_csv(fp, index=False)

    # Return cv results
    return results_list


In [194]:
def predictTestPeriod(df: pd.DataFrame, asset_universe_dict: Dict[str, list],
    test_start_date: str, lhs_col: str, opt_hps_dict: dict) -> pd.DataFrame:
    """Refit weekly and predict every datetime of the test period.

    The test period (all dates on or after `test_start_date`) is split into
    windows that start at each Sunday midnight, plus a leading partial window if
    the period does not start on one. Before each window the ensemble is refit
    on all earlier rows of that month's asset universe.

    Args:
        df: panel with `date`, `asset`, `lhs_col` and the covariate columns,
            covering both the training history and the test period.
        asset_universe_dict: maps 'YYYY-MM-01' to the assets tradable that month.
        test_start_date: first date of the test period.
        lhs_col: name of the target column.
        opt_hps_dict: hyperparameters passed to fitAutoencoder.

    Returns:
        DataFrame with columns `date`, `asset`, `lhs_col` and `yhats` for every
        test-period row in the asset universe.

    Raises:
        ValueError: if `df` has no rows on or after `test_start_date`.
    """
    # Determine RHS columns
    rhs_cols = [col for col in df.columns if col not in ('date', 'asset', lhs_col)]

    # Determine test period datetimes to loop over and datetimes to refit at
    test_dates = df[df.date >= test_start_date].date
    if test_dates.empty:
        raise ValueError('df has no rows on or after test_start_date')
    first_test_datetime = np.min(test_dates.values)
    test_sun_midnights = np.unique(test_dates[(test_dates.dt.hour==0)
                                              & (test_dates.dt.day_of_week==6)].values)

    # If the period does not start exactly on a Sunday midnight, add a leading
    # partial window running up to the first Sunday midnight (or to the end of
    # the period when it contains none)
    test_dts_dict = {}
    if len(test_sun_midnights) == 0:
        test_dts_dict[first_test_datetime] = np.unique(test_dates.values)
    elif first_test_datetime < test_sun_midnights[0]:
        test_dts_dict[first_test_datetime] = np.unique(
            test_dates[test_dates < test_sun_midnights[0]].values)

    # Complete the dictionary with all the sundays as keys and the dates until the next sunday as the values
    for test_sun_midnight in test_sun_midnights:
        next_sun_midnight = test_sun_midnight + np.timedelta64(7, 'D')
        test_dts_dict[test_sun_midnight] = np.unique(
            test_dates[(test_dates >= test_sun_midnight) & (test_dates < next_sun_midnight)].values)

    # Collect one frame of results per window; concatenated once at the end
    weekly_frames = []

    # Loop over all the datetimes in the test period where we want to refit the model
    for test_datetime_start, window_dates in test_dts_dict.items():
        # Monitor progress
        print('Currently fitting and predicting for the week starting: ')
        print(test_datetime_start)

        # Form end of this window
        test_datetime_end = np.max(window_dates)

        # Form appropriate asset universe
        first_day_of_month_for_current_test_dt = np.datetime_as_string(test_datetime_start, unit='M')+'-01'
        asset_universe = asset_universe_dict[first_day_of_month_for_current_test_dt]

        # Form relevant date-assets given asset universe and the train and oos dataframes
        # (boolean indexing already returns new frames, so no extra copies are taken)
        rel_df   = df[(df.date<=test_datetime_end) & (df.asset.isin(asset_universe))]
        train_df = rel_df[rel_df.date<test_datetime_start]
        oos_df   = rel_df[rel_df.date>=test_datetime_start]

        # Fit and predict
        models, _, _ = fitAutoencoder(train_df, opt_hps_dict, lhs_col, rhs_cols)
        yhats = genAutoencoderYhats(models, rel_df, rhs_cols, test_datetime_start, test_datetime_end, opt_hps_dict['number_factors'])
        del rel_df, train_df, models
        gc.collect()
        temp_yhats_df = oos_df[['date', 'asset', lhs_col]].copy()
        temp_yhats_df['yhats'] = yhats
        weekly_frames.append(temp_yhats_df)

    return pd.concat(weekly_frames) if weekly_frames else pd.DataFrame()


In [None]:
if __name__ == "__main__":
    # Driver: load the panels, build the autoencoder inputs, cross-validate the
    # hyperparameter grid and write test-period forecasts to TEST_OUT_FP.

    # set args
    TRAIN_IN_FP     = '../data/clean/panel_train.pkl'
    TEST_IN_FP      = '../data/clean/panel_test.pkl'
    ASSET_IN_FP     = '../data/clean/asset_universe_dict.pickle'
    CV_OUT_FP       = '../output/high_dim_fm/cv_results'
    TEST_OUT_FP     = '../data/clean/test_yhats_autoencoder.pkl'
    LHS_COL         = 'r_ex_tp1'
    VAL_START_DATE  = '2021-01-01'
    VAL_END_DATE    = '2021-06-30'
    TEST_START_DATE = '2021-07-01'
    NUM_CPUS        = 22
    PERIODS_IN_YEAR = 365*24
    ARCH_NAME       = 'autoencoder-12rhs'
    HP_GRID         = {'number_factors': [1],
        'num_hidden_layers': [1],
        'learning_rates': [5e-5],
        'batch_sizes': [2048],
        'l1_penalties': [1e-4],
        'num_ensemble': [10],
        'early_stopping': [True],
        'epochs': [100]}

    # read in data
    # NOTE: unpickling executes arbitrary code; only point these paths at trusted files
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    all_df = pd.read_pickle(TRAIN_IN_FP)
    test_df = pd.read_pickle(TEST_IN_FP)
    all_df = pd.concat([all_df, test_df])
    del test_df
    gc.collect()

    # drop rows and columns such that data will work for conditional autoencoder (CA)
    all_df = dropRowsColsAndNormalizeForCA(all_df, LHS_COL) # NOTE: ~6 min RUNTIME

    # form the char-sorted portfolios for factor side of CA input
    all_df = formPortfolioReturnCovariates(all_df, LHS_COL) # NOTE: ~12 min runtime

    # run CV
    cv_results_list = runCV(all_df, asset_universe_dict,
        VAL_START_DATE, VAL_END_DATE, TEST_START_DATE,
        LHS_COL, HP_GRID, PERIODS_IN_YEAR, CV_OUT_FP, ARCH_NAME)

    # choose optimal hyperparameter combination based on validation period predictive R^2
    scored_cv_results = [cv_result for cv_result in cv_results_list
                         if not np.isnan(cv_result['val_r2_pred'])]
    if not scored_cv_results:
        raise RuntimeError('cross-validation produced no usable val_r2_pred to select on')
    opt_cv_result = max(scored_cv_results, key=lambda cv_result: cv_result['val_r2_pred'])

    # pull the hyperparameters out by name rather than by position in the result dict;
    # refit without early stopping for the average number of epochs trained (at least one)
    opt_hps_dict = {hp_name: opt_cv_result[hp_name] for hp_name in HP_GRID}
    opt_hps_dict['epochs'] = max(1, int(opt_cv_result['avg_epochs_trained']))
    opt_hps_dict['early_stopping'] = False

    # this was the best in first half 2022
    # NOTE: this pinned configuration replaces the CV-selected one built just above
    opt_hps_dict = {'number_factors': 1,
        'num_hidden_layers': 1,
        'learning_rates': 5e-5,
        'batch_sizes': 2048,
        'l1_penalties': 1e-4,
        'num_ensemble': 10,
        'early_stopping': False,
        'epochs': 38}

    # predict in test period with the optimal model
    test_y_yhats_df = predictTestPeriod(all_df, asset_universe_dict, TEST_START_DATE, LHS_COL, opt_hps_dict)
    test_y_yhats_df.to_pickle(TEST_OUT_FP)

    # TODO: also try a model where missing assets' LHS and RHS are set to -2,
    # giving a balanced panel, so we fit on the matrix of RHS across assets as
    # opposed to individual date-asset rows and the network learns
    # asset-specific parameters


In [16]:
# NOTE:

# Autoencoder parameter count is 2e2 to 3e3.

# Transaction costs: 6.6 bps per hour (i.e. turnover (e.g. 16%) on 41 bps).
# -If ventiles, then we are turning over about 16% based on autoencoder results across 2h 2021 thru 2022.
# -Trading $2k per asset per hour then this is $64k an hour or $46MM per month total volume.
# -Taker fees at that level for $COIN are 16 bps, which on 16% of the portfolio is 2.6 bps.
# -Bid ask spread is something like 50 bps so 25 bps on 16% is 4 bps. 
# -Makes 6.6 bps total per hour.
# -Could add leverage at 2 bps to open and .5 bps every hour so call it 2.5 bps per position per hour, 
#      which would make the results highly tasty.

# Benchmark linear regression in 2021 2h: 5 bps r2pred; 10 bps tertile spread.
# Benchmark single factor: 15 bps.

# For positive r2pred, need:
# -in 2021 1h: validation MSEs of 2e-4 to 5e-4

# For bootstrap statistical significance:
# -need geom return above 1 bps, easy.
# -need the MSE improvement over zero to be 2e-3; that seems basically impossible given the noise level.

In [None]:
# VENTILE RESULTS:

# -2022 2h:
# -- -1.47
# --8.2 bps over a 6.6 bps cost
# --16.8 sharpe after taking out 6.6 bps cost

# money printer go brrr?