In [1]:
from dateutil.relativedelta import relativedelta
from tensorflow.keras.models import Sequential
from keras.callbacks import EarlyStopping
from tensorflow.keras import Model, regularizers
from tensorflow.keras import initializers
from tensorflow.keras.layers import Dense, BatchNormalization, Dot
from joblib import Parallel, delayed
from keras.models import Model
from datetime import datetime
from typing import Dict, List
from tensorflow import keras
import scipy.stats as stats
import tensorflow as tf
import sklearn as sk
import pandas as pd
import numpy as np
import itertools
import random
import pickle
import time
import gc

keras.mixed_precision.set_global_policy("mixed_float16")


2023-06-21 18:56:58.281497: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-21 18:56:58.342297: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3070 Ti, compute capability 8.6


In [2]:
def calcGeomAvg(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float: 
    """ Calculate the geometric average of a vector of simple returns.

    Args:
        returns (np.array): vector of a simple returns at any frequency.
        annualized (bool): whether to annualize the statistic.
        periods_in_year (int): how many periods of the given frequency are in a year.

    Returns:
        (float): scalar geometric average.
    """
    if not isinstance(returns, np.ndarray):
        raise TypeError("Input 'returns' must be a NumPy array")
    if annualized and periods_in_year is None:
        raise ValueError("Input 'periods_in_year' must be provided if 'annualized' is True")
    geom_avg_at_given_freq = np.prod(1 + returns) ** (1 / np.size(returns)) - 1
    return (geom_avg_at_given_freq + 1) ** periods_in_year - 1 if annualized else geom_avg_at_given_freq

def calcTSAvgReturn(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float:
    """ Calculate the time series mean return of a vector of simple returns with option to annualize.

    Args:
        returns (np.array): vector of a simple returns at any frequency.
        annualized (bool): whether to annualize the statistic.
        periods_in_year (int): how many periods of the given frequency are in a year.

    Returns:
        (float): scalar time series mean return.
    """
    mean_ret_at_given_freq = np.mean(returns)
    if annualized == False:
        return mean_ret_at_given_freq
    else:
        mean_ret = periods_in_year*mean_ret_at_given_freq
        if mean_ret < -1:
            return -1.
        else:
            return mean_ret

def calcSD(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float: 
    """ Calculate the standard deviation of a vector of simple returns with option to annualize.

    Args:
        returns (np.array): vector of a simple returns at any frequency.
        annualized (bool): whether to annualize the statistic.
        periods_in_year (int): how many periods of the given frequency are in a year.

    Returns:
        (float): scalar standard deviation.
    """
    sd_at_given_freq = np.std(returns)
    if annualized==False:
        return sd_at_given_freq
    else:
        return np.sqrt(periods_in_year)*sd_at_given_freq

def calcSharpe(returns: np.array,
    periods_in_year: int,
    risk_free_returns: np.array=None) -> float:
    """ Calculate the annual Sharpe Ratio of a vector of simple returns. 

    Args:
        returns (np.array): vector of a simple returns at any frequency.
        periods_in_year (int): how many periods of the given frequency are in a year.
        risk_free_returns (np.array): vector of simple returns of the risk free rate.

    Returns:
        (float): scalar standard deviation.
    """
    if risk_free_returns is not None:
        returns = returns - risk_free_returns
    
    return (calcTSAvgReturn(returns, annualized=True, periods_in_year=periods_in_year) /
            calcSD(returns, annualized=True, periods_in_year=periods_in_year))

def calcMaxDrawdown(returns: np.array) -> float:
    ''' calculate maximum drawdown for a vector of returns of any frequency.
    
    Args:
        returns (np.array): vector of simple returns.
    
    Returns:
        max_drawdown (float): maximum drawdown in simple return units over this period.
    '''
    # calculate the cumulative return as a new vector of the same length
    cumulative_ret=(returns+1).cumprod()

    # for every period, calc the historic maximum value of the portfolio 
    roll_max=pd.Series(cumulative_ret).rolling(len(cumulative_ret), min_periods=1).max()

    # calc drawdown as the current portfolio value divided by the historic max value
    dd=np.min(cumulative_ret/roll_max)
    
    # return simple return of max drawdown
    return dd-1

def formPortfolioPositionsQuantileLongShort(df: pd.DataFrame, quantile: int) -> pd.DataFrame:
    """
    This function takes a pandas DataFrame with columns "date", "asset", and "yhats" and performs
    the specified steps to form a new column "position" containing portfolio allocation percentages
    that sum to 0 within date where we long the top quantile and short the bottom quantile.
    The function then returns the modified DataFrame with the four columns.
    
    Args:
        df (pd.DataFrame): A pandas DataFrame containing columns "date", "asset", and "yhats".
    
    Returns:
        pd.DataFrame: The modified DataFrame with an additional "position" column.
    """
    # Randomly sort the DataFrame
    df = df.sample(frac=1).reset_index(drop=True)

    # Sort the DataFrame by date and yhats
    df = df.sort_values(by=['date', 'yhats'])

    # Add a small random noise to yhats to handle potential duplicated values
    df['yhats_noisy'] = df['yhats'] + np.random.uniform(-1e-10, 1e-10, size=len(df))

    # Calculate quantiles
    quantiles = df.groupby('date')['yhats_noisy'].transform(lambda x: pd.qcut(x, quantile, labels=False))

    # Assign positions
    df['position'] = np.where(quantiles == 0, -1, np.where(quantiles == quantile-1, 1, 0))

    # Divide the positions column by how many assets are in each date
    df['position'] = df.groupby(['date', 'position'])['position'].transform(lambda x: x / x.count())

    # Check that the position column sums to 1 within each date
    position_sums = df.groupby('date')['position'].sum()
    assert all(np.isclose(position_sums, 0)), f"Position column sums do not equal 1 for all dates: {position_sums[np.isclose(position_sums, 0)]}"

    # Drop the noisy yhats column
    df = df.drop(columns=['yhats_noisy'])

    # Sort the DataFrame by date and asset
    df = df.sort_values(by=['date', 'asset'])

    return df

def calcTSAvgTurnover(df: pd.DataFrame) -> float:
    """
    This function takes a pandas DataFrame with columns "date", "asset", and "position" and
    calculates the average turnover, which is defined as the time series average of the percentage
    of assets each date that do not have the same position (-1, 0, 1) as the previous date for that asset.
    
    Args:
        df (pd.DataFrame): A pandas DataFrame containing columns "date", "asset", and "position".
    
    Returns:
        float: The average turnover.
    """
    
    # Sort the DataFrame by date and asset
    df = df.sort_values(by=['date', 'asset'])

    # Shift the position column to get the previous position for each asset
    df['prev_position'] = df.groupby('asset')['position'].shift(1)

    # Calculate the percentage of assets each date that do not have the same position as the previous date
    df['position_changed'] = np.where(df['position'] != df['prev_position'], 1, 0)
    turnover_pct = df.groupby('date')['position_changed'].mean()

    # Calculate the time series average of the percentage of assets with changed positions
    average_turnover = turnover_pct.mean()

    return average_turnover


In [3]:
def dropRowsAndColsForCA(df: pd.DataFrame, lhs_col: str) -> pd.DataFrame:
   # drop 2018-2019 and 2022 data
   # - '18-'19 does not have enough assets
   # - 2022 is oos for now
   df = df[~df.date.dt.year.isin([2018, 2019, 2022])].reset_index(drop=True)

   # Set characteristics of interest
   selected_rhs = ['char_addr_new_log_delta_tm2_tm1',
      'char_alpha_tm7',
      'char_beta_tm7',
      "char_delta_flow_dist_tm1h",
      "char_exchange_prct_circ_supply_t",
      "char_illiq_tm7",
      "char_r_ath_t",
      "char_r_tm14",
      "char_r_tm60",
      "char_rank_cmc_t",
      "char_sent_volume_consumed_tm1",
      "char_vol_tm6h"]

   # Cut to characteristics columns of interest
   df = df[['date', 'asset', lhs_col]+selected_rhs]

   # Note: keep obs to RHS ratio roughly 4e4:1

   # Note: for any macro, take cartesian product with characteristics to make it characteritisc level
   # -or do the reg thing to reduce it down to same dim as number of assets

   return df

In [4]:
def formPortfolioReturnCovariates(df: pd.DataFrame, lhs_col: str) -> pd.DataFrame:
    # Obtain the datetimes of the dataframe
    df = df.sort_values(by = 'date')
    datetimes = np.unique(df.date.values)

    # Form new covariate names
    characteristics = list(df.columns.values)
    characteristics.remove('date')
    characteristics.remove('asset')
    characteristics.remove(lhs_col)
    new_covars = ['x_' + char[5:] for char in characteristics]

    # Loop over all datetimes
    for current_dt in datetimes: 
        # Obtain the datetime's LHS "tomorrow" returns and the covariates
        r_tp1 = df[df.date == current_dt].r_ex_tp1.values
        z_t   = df[df.date == current_dt][characteristics].values
        
        # Calculate the characteristic managed portfolio returns
        design = np.linalg.inv(np.matmul(np.transpose(z_t), z_t))
        x_tp1  = np.matmul(np.matmul(design, np.transpose(z_t)), r_tp1)
        
        # Set the new columns to this week's vector's value
        df.loc[df.date == current_dt, new_covars] = x_tp1

    return df


In [5]:
def buildAutoencoder(b_covars, x_covars, 
    number_hidden_layer, l1_penalty, weight_initializer, bias_initializer, number_factor, learning_rate):
    # Build the betas model from the time t covariates
    model_b = tf.keras.models.Sequential()
    model_b.add(tf.keras.Input(shape=(len(b_covars),)))
    for j in range(number_hidden_layer):
        model_b.add(Dense(16*1/(2**(j)), activation='relu',
                        kernel_regularizer=regularizers.l1(l1=l1_penalty),
                        kernel_initializer=weight_initializer,
                        bias_initializer=bias_initializer))
        model_b.add(BatchNormalization())
    model_b.add(Dense(number_factor, activation='linear',
                    kernel_initializer=weight_initializer,
                    bias_initializer=bias_initializer))

    # Form the x model from time t plus 1 returns
    model_x = tf.keras.models.Sequential()
    model_x.add(tf.keras.Input(shape=(len(x_covars),)))
    model_x.add(Dense(number_factor, activation='linear',
                    kernel_initializer=weight_initializer,
                    bias_initializer=bias_initializer))

    # Form the dot product output for the combination of the two neurals
    mergedOut = Dot(axes=(1,1))([model_b.output, model_x.output])

    # Form the entire model
    model = Model([model_b.input, model_x.input], mergedOut)

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                    loss='mean_squared_error',
                    metrics=['mse'])

    return model

def fitAutoencoder(train_df: pd.DataFrame, 
    hps_dict: dict, lhs_col: str='r_ex_tp1', rhs_cols: list=[], 
    val_df: pd.DataFrame=None, return_train_r2: bool=False) -> list:
    # Obtain beta and factor side covariates
    b_covars = [covar for covar in rhs_cols if covar[:4] == 'char']
    x_covars = [covar for covar in rhs_cols if covar[:2] == 'x_']
    assert set(rhs_cols) == (set(b_covars).union(set(x_covars)))

    # Extract the hyperparameters
    number_hidden_layer = hps_dict['num_hidden_layers']
    number_factor       = hps_dict['number_factors']
    learning_rate       = hps_dict['learning_rates']
    l1_penalty          = hps_dict['l1_penalties']
    batch_size          = hps_dict['batch_sizes']
    number_ensemble     = hps_dict['num_ensemble']
    epoch               = hps_dict['epochs']
    early_stopping      = hps_dict['early_stopping']

    # Loop over the ensembles to build models for each
    models = []
    num_epochs_trained = []
    assert(number_ensemble <= 10), 'whatcha think you got infinite come pew ters'
    for i in range(number_ensemble):
        # Obtain the training input and output data and, if passed, validation data
        train_b = train_df[b_covars]
        train_x = train_df[x_covars]  
        train_y = train_df[[lhs_col]]
        if val_df is not None:
            val_b = val_df[b_covars]
            val_x = val_df[x_covars]  
            val_y = val_df[[lhs_col]]

        # According to which model in the ensemble it is, initialize parameters.
        random.seed(i*42)
        initializer_list = [initializers.HeNormal(seed=i), 
                            initializers.GlorotUniform(seed=i), 
                            initializers.RandomUniform(seed=i)]
        initializer_pair = random.sample(initializer_list, 2)
        weight_initializer = initializer_pair[0]
        bias_initializer   = initializer_pair[1]

        # Build the model
        model = buildAutoencoder(b_covars, x_covars, number_hidden_layer, 
            l1_penalty, weight_initializer, bias_initializer, number_factor, learning_rate)

        # Fit the model
        with tf.device('/GPU:0'):
            if early_stopping == True:
                es = EarlyStopping(monitor='val_mse', mode='min', verbose=0, patience = 5) 
                
                model.fit(x=[train_b, train_x], y=train_y, 
                            batch_size=batch_size,
                            validation_data=([val_b, val_x], val_y), 
                            epochs=epoch, verbose=1, callbacks=[es]) 
                
                num_epochs = es.stopped_epoch
                num_epochs_trained.append(num_epochs)
            else:
                model.fit(x=[train_b, train_x], y=train_y, 
                            batch_size=batch_size,
                            epochs=epoch, verbose=1)

                num_epochs_trained.append(epoch)
        
        models.append(model)

    if return_train_r2:
        # build the time window for the training data to predict on
        first_datetime = np.min(train_df.date.values)
        month = np.datetime64(first_datetime, 'M').astype(int) % 12 + 1
        year = np.datetime64(first_datetime, 'Y').astype(int) + 1970
        oos_start_date = np.min(train_df[(train_df.date.dt.year==year) 
                            & (train_df.date.dt.month==month+1)].date.values)
        oos_end_date   = np.max(train_df.date.values)

        # fit on the training data for all models to report the r2_pred
        train_yhats = genAutoencoderYhats(models, train_df, rhs_cols, oos_start_date, oos_end_date, number_factor)
        train_ys    = train_df[train_df.date>=oos_start_date][lhs_col].values
        train_r2_pred = 1-(np.mean(np.square(train_ys-train_yhats)))/(np.mean(np.square(train_ys)))
    else:
        train_r2_pred = 0
    
    return models, np.mean(num_epochs_trained), train_r2_pred

def genAutoencoderYhats(models, in_df, rhs_cols, oos_start_date, oos_end_date, number_factor) -> np.array:
    # Obtain beta and factor side covariates
    b_covars = [covar for covar in rhs_cols if covar[:4] == 'char']
    x_covars = [covar for covar in rhs_cols if covar[:2] == 'x_']
    assert set(rhs_cols) == (set(b_covars).union(set(x_covars)))

    # Obtain the oos data
    oos_df = in_df[(in_df.date >= oos_start_date) & (in_df.date <= oos_end_date)].copy()
    oos_b  = oos_df[b_covars]

    # Form each model's results
    b_hats = np.zeros((oos_df.shape[0], number_factor, len(models)))
    lambda_hats = np.zeros((oos_df.shape[0], number_factor, len(models)))
    oos_dates = np.unique(oos_df.date.values)
    for i in range(len(models)):
        # Update the model to use
        model = models[i]

        # Form the beta hats
        layer_name = model.layers[-3]._name  
        assert(model.layers[-3].output_shape[1] == number_factor)
        b_hat_layer = Model(inputs=model.input[0],
                            outputs=model.get_layer(layer_name).output)
        b_hat = b_hat_layer.predict(oos_b, verbose=0)
        b_hats[:,:,i] = b_hat

        # Form the sample average of the estimated factors up to each oos date
        # build this model's mapping from input to f hat
        model = models[i]
        layer_name = model.layers[-2]._name 
        assert(model.layers[-2].output_shape[1] == number_factor)
        f_hat_layer = Model(inputs=model.input[1],
                            outputs=model.get_layer(layer_name).output)

        # estimate this model's f hats for dates in month before the oos dates
        x = in_df[(in_df.date >= (oos_start_date-np.timedelta64(30, 'D'))) 
                & (in_df.date < oos_start_date)][x_covars]
        f_hat = f_hat_layer.predict(x, verbose=0).astype('float32')
        assert(all(np.isclose(f_hat[0,:], f_hat[1,:])))
        f_hats = np.sum(f_hat, axis=0)

        # obtain the f hats for the entire oos period
        x = in_df[in_df.date >= oos_start_date][x_covars]
        f_hat = f_hat_layer.predict(x, verbose=0).astype('float32')
        assert(all(np.isclose(f_hat[0,:], f_hat[1,:])))

        # determine the lambda hat for each oos date
        lambda_hat_index_start = 0
        for t in range(len(oos_dates)):
            # update this oos date 
            oos_date = oos_dates[t]

            # update the fhats with the appropriate f_hat values
            # -update start index in here so this is skipped on first run but occurs
            #  on every other before we update the end index two lines below
            if t != 0:
                f_hats += np.sum(f_hat[lambda_hat_index_start:lambda_hat_index_end, :], axis=0)
                lambda_hat_index_start = lambda_hat_index_end

            # determine how many obs are in this oos date
            num_rows_in_oos_dt = in_df[in_df.date==oos_date].shape[0]
            
            # update the end index given the number of oos obs for this date
            lambda_hat_index_end = lambda_hat_index_start + num_rows_in_oos_dt

            # divide by total number of f_hats added together to figure out TS average for this oos_date
            #     save as this time period's and this model's lambda hat
            lambda_hats[lambda_hat_index_start:lambda_hat_index_end, :, i] = (
                np.tile(f_hats / in_df[(in_df.date >= (oos_start_date-np.timedelta64(30, 'D'))) 
                                        & (in_df.date<oos_date)].shape[0], 
                        (num_rows_in_oos_dt, 1)))

    # Form model predictions of beta hats times lambda hats where
    #     we take dot product between two factor length vectors for all time periods and models
    #     and then average each model's forecast to return a vector of length of oos dataframe
    yhats = np.mean(np.sum(b_hats * lambda_hats, axis=1), axis=1)

    return yhats


In [6]:
def runCV(df: pd.DataFrame, asset_universe_dict: Dict[str, List],
    val_start_date: str, val_end_date: str, test_start_date: str, lhs_col: str,
    hp_grid: Dict[str, list], periods_in_year: int, 
    cv_out_fp: str, arch_name: str) -> List[dict]:
    # Subset to relevant data
    df = df[df.date < test_start_date].copy()

    # Initialize cv result objects
    results_list = []

    # Determine RHS columns
    rhs_cols = list(df.columns.values)
    rhs_cols.remove('date')
    rhs_cols.remove('asset')
    rhs_cols.remove(lhs_col)

    # Determine validation datetimes to loop over and datetimes to refit at
    val_dts_dict = {}
    val_datetimes = np.unique(df[df.date>=val_start_date].date.values)
    val_sun_midnights = np.unique(df[(df.date>=val_start_date) 
        & (df.date.dt.hour==0) & (df.date.dt.day_of_week==6)].date.values)

    # Check if first val date is sunday midnight, if not then add the dates
    first_val_date = np.min(df[(df.date==val_start_date)].date.values)
    day_of_week_of_first_val_datetime = (first_val_date.astype('datetime64[D]').view('int64') - 4) % 7
    if day_of_week_of_first_val_datetime != 6:
        val_dts_dict[first_val_date] = np.unique(df[(df.date>=first_val_date) & (df.date<val_sun_midnights[0])].date.values)

    # Complete the dictionary with all the sundays as keys as the dates until the next sunday as the values
    for val_sun_midnight in val_sun_midnights:
        next_sun_midnight = val_sun_midnight + np.timedelta64(7, 'D')
        val_dts_dict[val_sun_midnight] = np.unique(df[(df.date>=val_sun_midnight) 
                                            & (df.date<next_sun_midnight)
                                            & (df.date<test_start_date)].date.values)

    # Loop over hp combinations
    keys = hp_grid.keys()
    values = hp_grid.values()
    hp_combos = list(itertools.product(*values))
    for hps in hp_combos:

        # Start the timer
        tic = time.perf_counter()

        # Create hp dictionary and other objects for this iteration
        hps_dict = dict(zip(keys, hps))
        hps_results_dict = hps_dict.copy()
        val_y_yhats_df = pd.DataFrame()

        # Report on progress
        print(hps_dict)

        # Define function to loop over
        avg_num_epochs_trained_list = []
        train_r2_pred_list = []

        for val_datetime_start in list(val_dts_dict.keys()): 
            print(val_datetime_start)
            # form end of this window
            val_datetime_end = np.max(val_dts_dict[val_datetime_start])

            # form appropriate asset universe
            first_day_of_month_for_current_val_dt = np.datetime_as_string(val_datetime_start, unit='M')+'-01'
            asset_universe = asset_universe_dict[first_day_of_month_for_current_val_dt]

            # form relevant date-assets given asset universe and the train and val dataframes
            rel_df = df[(df.date<=val_datetime_end) & (df.asset.isin(asset_universe))].copy()
            train_df = rel_df[rel_df.date<val_datetime_start].copy()
            val_df = rel_df[(rel_df.date>=val_datetime_start) & (rel_df.date<=val_datetime_end)].copy()

            # fit and predict
            models, avg_num_epochs_trained, train_r2_pred = fitAutoencoder(
                train_df, hps_dict, lhs_col, rhs_cols, val_df, return_train_r2=False)
            yhats = genAutoencoderYhats(models, rel_df, rhs_cols,
                val_datetime_start, val_datetime_end, hps_dict['number_factors'])
            del rel_df, models
            gc.collect()

            # save the results
            avg_num_epochs_trained_list.append(avg_num_epochs_trained)
            train_r2_pred_list.append(train_r2_pred)
            temp_yhats_df = val_df[['date', 'asset', lhs_col]].copy()
            temp_yhats_df['yhats'] = yhats
            val_y_yhats_df = pd.concat([val_y_yhats_df, temp_yhats_df])

            # output this week r2_pred and return
            val_week_df = val_y_yhats_df[(val_y_yhats_df.date>=val_datetime_start) 
                                        & (val_y_yhats_df.date<=val_datetime_end)].copy()
            val_week_df = val_week_df.sort_values(by=['date', 'asset'], ignore_index=True)
            val_week_df = formPortfolioPositionsQuantileLongShort(val_week_df, 5)
            val_week_df['returns'] = val_week_df.position*val_week_df[lhs_col]
            val_week_y = val_week_df[lhs_col].values
            val_week_yhats = val_week_df['yhats'].values
            val_week_returns = val_week_df[val_week_df.position !=0].groupby('date')['returns'].sum().values
            print('this week r 2 pred:')
            print(1-np.mean(val_week_y - val_week_yhats)/np.mean(val_week_y))
            print(f'this week geom avg ret {calcGeomAvg(val_week_returns)}')

        # Stop the timer
        toc = time.perf_counter()

        # For this hp, add metadata to results dict
        hps_results_dict['avg_epochs_trained'] = np.mean(avg_num_epochs_trained_list)
        hps_results_dict['val_start_date'] = val_start_date
        hps_results_dict['val_end_date'] = val_end_date
        hps_results_dict['arch_name'] = arch_name
        hps_results_dict['runtime'] = round((toc - tic)/60, 0) 

        # For this hp, add training period statistics
        hps_results_dict['train_r2_pred_min'] = np.min(train_r2_pred_list)
        hps_results_dict['train_r2_pred_mean'] = np.mean(train_r2_pred_list)
        hps_results_dict['train_r2_pred_max'] = np.max(train_r2_pred_list)

        # For this hp, obtain the yhats and ys and positions
        val_y_yhats_df = val_y_yhats_df.dropna()
        val_y_yhats_df = val_y_yhats_df.sort_values(by=['date', 'asset'], ignore_index=True)
        val_y_yhats_pos_df = formPortfolioPositionsQuantileLongShort(val_y_yhats_df, 5)
        val_yhats = val_y_yhats_pos_df.yhats.values
        val_ys    = val_y_yhats_pos_df[lhs_col].values
        val_y_yhats_pos_df['returns'] = val_y_yhats_pos_df.position*val_y_yhats_pos_df[lhs_col]
        returns = val_y_yhats_pos_df[val_y_yhats_pos_df.position != 0].groupby('date')['returns'].sum().values
        assert len(val_yhats) == len(val_ys)

        # For this hp, form validation period statistics
        hps_results_dict['val_mse']       = np.mean(np.square(val_ys-val_yhats))
        hps_results_dict['val_r2_pred']   = 1-np.mean(np.square(val_ys-val_yhats))/np.mean(np.square(val_ys))
        hps_results_dict['val_yhat_min']  = np.min(val_yhats)
        hps_results_dict['val_yhat_q1']   = np.quantile(val_yhats, q=0.25)
        hps_results_dict['val_yhat_q2']   = np.quantile(val_yhats, q=0.5)
        hps_results_dict['val_yhat_mean'] = np.mean(val_yhats)
        hps_results_dict['val_yhat_q3']   = np.quantile(val_yhats, q=0.75)
        hps_results_dict['val_yhat_max']  = np.max(val_yhats)
        hps_results_dict['geom_mean_1h']  = calcGeomAvg(returns)
        hps_results_dict['sharpe_annual'] = calcSharpe(returns, periods_in_year=periods_in_year)
        hps_results_dict['sd_annual']     = calcSD(returns, periods_in_year=periods_in_year)
        hps_results_dict['max_dd']        = calcMaxDrawdown(returns)
        hps_results_dict['avg_turnover']  = calcTSAvgTurnover(val_y_yhats_pos_df)

        # Save results to return
        results_list.append(hps_results_dict)

        # For this hp, save results to csv
        cv_df = pd.DataFrame(results_list)
        timestr = time.strftime("%Y%m%d_%H%M%S")
        fp = cv_out_fp + '-' + arch_name + '-' + timestr + '.csv'
        cv_df.to_csv(fp, index=False)

    # Return cv results
    return results_list


In [7]:
def predictTestPeriod(df: pd.DataFrame, asset_universe_dict: Dict[str, list],
    test_start_date: str, lhs_col: str, opt_hps_dict: dict) -> pd.DataFrame:
    # Confirm separate df formed
    df = df.copy()

    # Determine RHS columns
    rhs_cols = list(df.columns.values)
    rhs_cols.remove('date')
    rhs_cols.remove('asset')
    rhs_cols.remove(lhs_col)

    # Determine test period datetimes to loop over and datetimes to refit at
    test_dts_dict = {}
    test_sun_midnights = np.unique(df[(df.date>=test_start_date) 
        & (df.date.dt.hour==0) & (df.date.dt.day_of_week==6)].date.values)

    # Check if first test date is sunday midnight, if not then add the dates
    first_test_datetime = np.min(df[(df.date==test_start_date)].date.values)
    day_of_week_of_first_test_datetime = (first_test_datetime.astype('datetime64[D]').view('int64') - 4) % 7
    if day_of_week_of_first_test_datetime != 6:
        test_dts_dict[first_test_datetime] = np.unique(df[(df.date>=first_test_datetime) & (df.date<test_sun_midnights[0])].date.values)

    # Complete the dictionary with all the sundays as keys as the dates until the next sunday as the testues
    for test_sun_midnight in test_sun_midnights:
        next_sun_midnight = test_sun_midnight + np.timedelta64(7, 'D')
        test_dts_dict[test_sun_midnight] = np.unique(df[(df.date>=test_sun_midnight) 
                                            & (df.date<next_sun_midnight)].date.values)
        
    # Create dataframe of results to return
    test_y_yhats_df = pd.DataFrame()

    # Loop over all the datetimes in the test period where we want to refit the model
    for test_datetime_start in list(test_dts_dict.keys()):
        # Monitor progress
        print('Currently fitting and predicting for the week starting: ')
        print(test_datetime_start)

        # Form end of this window
        test_datetime_end = np.max(test_dts_dict[test_datetime_start])

        # Form appropriate asset universe
        first_day_of_month_for_current_test_dt = np.datetime_as_string(test_datetime_start, unit='M')+'-01'
        asset_universe = asset_universe_dict[first_day_of_month_for_current_test_dt]

        # From relevant date-assets given asset universe and the train and oos dataframes
        rel_df   = df[(df.date<=test_datetime_end) & (df.asset.isin(asset_universe))].copy()
        train_df = rel_df[rel_df.date<test_datetime_start].copy()
        oos_df   = rel_df[(rel_df.date>=test_datetime_start) & (rel_df.date<=test_datetime_end)].copy()

        # Fit and predict
        models, _, _ = fitAutoencoder(train_df, opt_hps_dict, lhs_col, rhs_cols)
        yhats = genAutoencoderYhats(models, rel_df, rhs_cols, test_datetime_start, test_datetime_end, opt_hps_dict['number_factors'])
        del rel_df, train_df
        gc.collect()
        temp_yhats_df = oos_df[['date', 'asset', lhs_col]].copy()
        temp_yhats_df['yhats'] = yhats
        test_y_yhats_df = pd.concat([test_y_yhats_df, temp_yhats_df])

    return test_y_yhats_df

In [8]:
if __name__ == "__main__":
    # set args
    IN_FP           = '../data/clean/panel_train.pkl'
    ASSET_IN_FP     = '../data/clean/asset_universe_dict.pickle'
    CV_OUT_FP       = '../output/high_dim_fm/cv_results'
    TEST_OUT_FP     = '../data/clean/test_yhats_autoencoder.pkl'
    LHS_COL         = 'r_ex_tp1'
    VAL_START_DATE  = '2021-01-01'
    VAL_END_DATE    = '2021-06-30'
    TEST_START_DATE = '2021-07-01'
    NUM_CPUS        = 22 
    PERIODS_IN_YEAR = 365*24
    ARCH_NAME       = 'autoencoder-12rhs'
    HP_GRID         = {'number_factors': [1],
        'num_hidden_layers': [1],
        'learning_rates': [5e-4, 1e-4, 5e-5], 
        'batch_sizes': [2048],
        'l1_penalties': [1e-2, 1e-3, 1e-4, 1e-5],
        'num_ensemble': [10],
        'early_stopping': [True],
        'epochs': [100]} 

    # read in data
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    all_df = pd.read_pickle(IN_FP)

    # drop rows and columns such that data will work for conditional autoencoder (CA)
    all_df = dropRowsAndColsForCA(all_df, LHS_COL)

    # form the char-sorted portfolios for factor side of CA input
    all_df = formPortfolioReturnCovariates(all_df, LHS_COL) # NOTE: ~7 min runtime

    # run CV
    cv_results_list = runCV(all_df, asset_universe_dict,
        VAL_START_DATE, VAL_END_DATE, TEST_START_DATE,
        LHS_COL, HP_GRID, PERIODS_IN_YEAR, CV_OUT_FP, ARCH_NAME)

    # choose optimal hyperparameter combination based on validation period predictive R^2
    opt_val_r2_pred = -1e6
    for cv_result in cv_results_list:
        if cv_result['val_r2_pred'] > opt_val_r2_pred:
            opt_val_r2_pred = cv_result['val_r2_pred']
            opt_cv_result = cv_result
    opt_hps_dict = dict(itertools.islice(opt_cv_result.items(), len(HP_GRID)+1))
    opt_hps_dict['epochs'] = int(opt_hps_dict['avg_epochs_trained'])
    opt_hps_dict['early_stopping'] = False
    del opt_hps_dict['avg_epochs_trained']

    # predict in test period with the optimal model
    test_y_yhats_df = predictTestPeriod(all_df, asset_universe_dict, TEST_START_DATE, LHS_COL, opt_hps_dict)
    test_y_yhats_df.to_pickle(TEST_OUT_FP)


{'number_factors': 1, 'num_hidden_layers': 1, 'learning_rates': 5e-05, 'batch_sizes': 2048, 'l1_penalties': 0.01, 'num_ensemble': 10, 'early_stopping': True, 'epochs': 100}
2021-01-01T00:00:00.000000000


2023-06-21 19:03:42.195337: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5281 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070 Ti, pci bus id: 0000:65:00.0, compute capability: 8.6


Epoch 1/100


2023-06-21 19:03:45.360857: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7fe6cc0031f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-06-21 19:03:45.360891: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 3070 Ti, Compute Capability 8.6
2023-06-21 19:03:45.381961: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-06-21 19:03:45.381998: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-06-21 19:03:45.626855: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-06-21 19:03:45.864456: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of 

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 1/100


In [None]:
# TODO CHECK THAT POSITION COLUMN IS CALCULATED PROPERLY

In [9]:
test_y_yhats_df = test_y_yhats_df.dropna()
test_y_yhats_df = test_y_yhats_df.sort_values(by=['date', 'asset'], ignore_index=True)
test_y_yhats_pos_df = formPortfolioPositionsQuantileLongShort(test_y_yhats_df, quantile=5)
test_yhats = test_y_yhats_pos_df.yhats.values
test_ys = test_y_yhats_pos_df[LHS_COL].values
test_y_yhats_pos_df['returns'] = test_y_yhats_pos_df.position*test_y_yhats_pos_df[LHS_COL]
returns = test_y_yhats_pos_df.groupby('date')['returns'].mean().values

In [10]:
1-np.mean(np.square(test_ys-test_yhats))/np.mean(np.square(test_ys))

-0.0434887624472069

In [11]:
np.min(test_yhats)

-0.021607466050075755

In [12]:
np.quantile(test_yhats, q=0.5)

0.0003495896315294781

In [13]:
np.max(test_yhats)

0.26090552840750597

In [14]:
calcGeomAvg(returns)

-3.425029307058125e-06

In [15]:
calcSharpe(returns, periods_in_year=24*365)

-13.900200553915258

In [None]:
# HP GRID FOR FINAL RUN:

# for 1 factor: lr 5e-5, 1e-5, 5e-6
# for factors [2] and learning_rates [1e-4, 5e-5, 1e-5]
# for factors [3] and learning_rates [5e-4, 5e-5, 5e-6]


In [None]:
# NOTES:

# Linear regression with the same RHS can fit in pre 2nd half of 2021 and predit in that period to get 5 bps r2perd and 10 bps tertile diff return

# Need returns above like 5 bps to beat transaction fees. Above 15 bps to beat just best single factor.

# For val period in first half of 2021, should see val_mse's between 2e-4 and 5e-4 and need < 5.2e-4 to be in the green.

# For the first half of 2021 val/test period, need a geom mean above 1.5e-6 to be (bootstrapped) stat diff from zero.
# For the second half of 2021 val/test period, need a geom mean above 5e-7 to be (bootstrapped) stat diff from zero.

# Based on my CV results, standard deviation of my "legit" mse's in first half of 2021 is something like 2e-3 so i need a mse improvement of 4e-3.

In [None]:
# TODO ALSO NEED TO TRY A MODEL WHERE WE SET MISSING ASSETS LHS 
# AND RHS TO -2 AND THEN WE HAVE BALANCED PANEL SO WE FIT ON THE 
# MATRIX OF RHS ACROSS ASSETS AS OPPOSED OT INDIVIDUAL DATE-ASSET 
# AND THEN THE NETWORK LEARNS ASSET-SPECIFIC PARAMETERS

In [None]:
# TODO GO SCOPE OLD NOTEBOOK FOR LATEST NUMBERS