In [1]:
# Add the local quant_tools code directory to sys.path so that `tools.QuantTools` can be imported
import sys
sys.path.insert(0, '/home/adam/Dropbox/2-creations/2-crafts/7-buidl/0-utils/quant_tools/code')

# IMPORT PACKAGES
from typing import List, Tuple
from joblib import Parallel, delayed
from itertools import combinations
from datetime import datetime
from tools import QuantTools
import statsmodels.api as sm
from tqdm import tqdm
import pandas as pd
import numpy as np
import pickle


In [2]:
def subsetAndFormRhs(df: pd.DataFrame, lhs_col: str) -> pd.DataFrame:
    """
    Reduce the panel to the identifier, target and characteristic columns and
    append the two helper columns used throughout the notebook.

    :param df: Panel containing 'date', 'asset', lhs_col and every characteristic listed below.
    :param lhs_col: Column name in df for the left-hand side variable.

    :return: A copy of the selected columns with 'r_ex_tp0' (a copy of 'char_r_tm7')
        and 'mcap' (a copy of 'char_size_t') appended, in that order.
    """
    # Characteristics of interest; the order here is the column order of the result
    selected_chars = [
        'char_circulation_tm7', 'char_circulation_tm30', 'char_circulation_tm90',
        'char_tx_volume_t', 'char_tx_volume_tm7',
        'char_addr_new_tm1', 'char_addr_new_tm7',
        'char_addr_active_tm1', 'char_addr_active_tm7',
        'char_addr_new_log_delta_tm14_tm7',
        'char_age_destroyed_tm7', 'char_age_mean_dollar_t',
        'char_delta_flow_dist_tm7', 'char_delta_holders_dist_tm7',
        'char_prct_supply_in_profit_t',
        'char_exchange_prct_circ_supply_t', 'char_cex_prct_circ_supply_t',
        'char_dex_prct_circ_supply_t', 'char_defi_prct_circ_supply_t',
        'char_traders_prct_circ_supply_t',
        'char_exchange_inflow_tm7', 'char_exchange_outflow_tm7',
        'char_rank_cmc_t', 'char_tradable_t',
        'char_social_volume_tm7', 'char_social_volume_reddit_tm7',
        'char_social_volume_twitter_tm7',
        'char_sent_neg_reddit_tm7', 'char_sent_neg_twitter_tm7',
        'char_sent_pos_reddit_tm7', 'char_sent_pos_twitter_tm7',
        'char_sent_volume_consumed_tm7', 'char_social_dom_avg_tm7',
        'char_dev_activity_tm7', 'char_vc_t',
        'char_r_tm7', 'char_r_tm14', 'char_r_tm14_tm7',
        'char_r_tm30', 'char_r_tm60', 'char_r_tm90',
        'char_r_max_tm7', 'char_r_max_tm30',
        'char_r_ath_t', 'char_r_atl_t',
        'char_trades_t', 'char_trades_sum_tm7', 'char_trades_std_tm7',
        'char_volume_sum_tm7', 'char_volume_std_tm7',
        'char_ask_t', 'char_bid_t', 'char_bidask_t',
        'char_ask_size_t', 'char_bid_size_t',
        'char_illiq_tm7', 'char_turnover_tm7',
        'char_price_t', 'char_size_t', 'char_mvrv_t',
        'char_alpha_tm7', 'char_alpha_tm30',
        'char_beta_tm7', 'char_beta_tm30', 'char_beta_downside_tm30',
        'char_coskew_tm30', 'char_iskew_tm30',
        'char_shortfall5_tm7', 'char_shortfall5_tm90',
        'char_var5_tm7', 'char_var5_tm90',
        'char_vol_tm7', 'char_vol_tm30',
        'char_ivol_tm7', 'char_ivol_tm30',
    ]

    # Keep identifiers + target + characteristics; copy so the caller's frame is untouched
    id_and_target_cols = ['date', 'asset', lhs_col]
    subset_df = df.loc[:, id_and_target_cols + selected_chars].copy()

    # Contemporaneous return and market cap, duplicated under the names used downstream
    subset_df['r_ex_tp0'] = subset_df['char_r_tm7']
    subset_df['mcap'] = subset_df['char_size_t']

    return subset_df


In [3]:
def buildFactorsDf(in_df: pd.DataFrame, lhs_col: str, num_qntls_fctrs: int, random_state=None) -> pd.DataFrame:
    """
    Build one long-short factor return series per characteristic column.

    For every characteristic, assets are ranked within each date, bucketed into
    num_qntls_fctrs quantiles, and the factor is the mcap-weighted 'r_ex_tp0' of
    the top quantile minus that of the bottom quantile.

    :param in_df: DataFrame with 'date', 'asset', lhs_col, 'r_ex_tp0', 'mcap' and
        the characteristic columns; every other column is treated as a characteristic.
    :param lhs_col: Column name in in_df for the left-hand side variable (excluded from the characteristics).
    :param num_qntls_fctrs: Number of quantiles used to bucket assets within a date.
    :param random_state: Optional seed passed to DataFrame.sample for the tie-breaking
        shuffle. The default (None) keeps the shuffle unseeded, so ties within a
        date are broken differently on every call; pass an int for reproducible factors.

    :return: DataFrame with a 'date' column and one 'factor_<name>' column per characteristic.
    :raises ValueError: If one of the required non-characteristic columns is missing.
    :raises KeyError: If the top or the bottom quantile is empty on every date.
    """
    # Build list of rhs variables: char and factor names
    char_cols = list(in_df.columns.values)
    for col in ['date', 'asset', lhs_col, 'r_ex_tp0', 'mcap']:
        char_cols.remove(col)
    # 'char_xyz' -> 'factor_xyz'
    factor_cols = ['factor_'+col[5:] for col in char_cols]

    # Randomly sort all rows of the dataframe,
    #     to handle chars that have repeated values within date
    in_df = in_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Quantile edges on the (0, 1] rank-ratio scale; identical for every characteristic
    quantile_bins = list(np.arange(0, num_qntls_fctrs+1)/num_qntls_fctrs)

    # Form factors
    factor_dfs = []
    for char_col, factor_col in zip(char_cols, factor_cols):
        # subset to relevant characteristic
        temp_df = in_df[['date', 'r_ex_tp0', 'mcap', char_col]]

        # form quantiles by char col
        temp_df = temp_df.sort_values(by=['date', char_col])
        temp_df['rank_within_date'] = temp_df.groupby('date')[char_col].rank(method='first')
        temp_df['rank_ratio'] = temp_df.groupby('date')['rank_within_date'].transform(lambda x: x / x.max())
        temp_df['quant'] = 1+pd.cut(temp_df['rank_ratio'], bins=quantile_bins, labels=False, include_lowest=True)
        temp_df = temp_df.drop(columns=['rank_within_date', 'rank_ratio'])

        # form mcap weighted average return within date-quantiles
        temp_df['weighted_return'] = temp_df['r_ex_tp0'] * temp_df['mcap']
        # Only count the weight of rows that actually contribute a return: sum()
        # skips NaN numerators, so the denominator has to skip the same rows or
        # the average is biased toward zero.
        temp_df['weight'] = temp_df['mcap'].where(temp_df['weighted_return'].notna())
        grouped_df = temp_df.groupby(['date', 'quant'])[['weighted_return', 'weight']].sum().reset_index()
        grouped_df['r_ex_tp0'] = grouped_df['weighted_return'] / grouped_df['weight']
        avg_ret_by_quant_df = grouped_df[['date', 'quant', 'r_ex_tp0']].copy()

        # form top minus bottom mcap weighted contemporaneous return
        pivot_df = avg_ret_by_quant_df.pivot(index='date', columns='quant', values='r_ex_tp0')
        spread = pivot_df[num_qntls_fctrs] - pivot_df[1]
        factor_dfs.append(pd.DataFrame({'date': spread.index, factor_col: spread.to_numpy()}))

    # No characteristics: same empty result the accumulator used to start from
    if not factor_dfs:
        return pd.DataFrame(data={'date': []})

    # combine results, starting from the first factor so the 'date' dtype comes from the data
    factors_df = factor_dfs[0]
    for factor_df in factor_dfs[1:]:
        factors_df = factors_df.merge(factor_df, on='date', how='outer', validate='one_to_one')

    return factors_df


In [4]:
def fitAndPredict(lhs_df: pd.DataFrame, lhs_col: str, factors_df: pd.DataFrame, factors_combo: List[str], test_start_date_str: str) -> pd.DataFrame:
    """
    Fit one time-series OLS per asset on the training window and predict the test window.

    :param lhs_df: DataFrame with 'date', 'asset' and lhs_col columns.
    :param lhs_col: Column name in lhs_df for the left-hand side variable.
    :param factors_df: DataFrame with a 'date' column (one row per date) and the factor columns.
    :param factors_combo: Names of the factor columns in factors_df to regress on.
    :param test_start_date_str: Rows with date before this value are training rows;
        rows on or after it are test rows.

    :return: The test rows of lhs_df ('date', 'asset', lhs_col), sorted by date and
        asset, with a 'yhats' column of predictions. Assets without training rows
        are predicted with the average of the fitted coefficients.
    :raises ValueError: If a test date is missing from factors_df, or if a test asset
        needs the average coefficients but no asset had training rows.
    """
    factors_combo = list(factors_combo)

    # Calc each asset's beta hats
    is_train = lhs_df.date < test_start_date_str
    train_all_df = lhs_df.loc[is_train, ['date', 'asset', lhs_col]]
    asset_beta_hats_dict = {}
    for asset in np.unique(train_all_df.asset.values):
        # form relevant data
        train_lhs_df = train_all_df.loc[train_all_df.asset == asset, ['date', lhs_col]]
        train_df = train_lhs_df.merge(factors_df, on='date', how='left', validate='one_to_one')

        # has_constant='add' forces the intercept column. The default ('skip') adds
        # nothing when a factor column is already constant and non-zero, e.g. for an
        # asset with a single row, which would leave k betas instead of k+1.
        train_rhs = sm.add_constant(train_df[factors_combo], has_constant='add')

        # calc and save beta hats, ordered [intercept, factor_1, ..., factor_k]
        results = sm.OLS(train_df[lhs_col], train_rhs).fit()
        asset_beta_hats_dict[asset] = np.asarray(results.params, dtype=float)

    # Calc avg beta hat in case we need it
    avg_beta_hats = (np.mean(list(asset_beta_hats_dict.values()), axis=0)
                     if asset_beta_hats_dict else None)

    # Form test period rows
    test_df = lhs_df.loc[lhs_df.date >= test_start_date_str, ['date', 'asset', lhs_col]].copy()
    test_df = test_df.sort_values(by=['date', 'asset'], ignore_index=True)

    # Align factor values to the test rows by date instead of relying on row order
    unmatched = ~test_df.date.isin(factors_df.date)
    if unmatched.any():
        raise ValueError(
            f"{int(unmatched.sum())} test rows have dates that are missing from factors_df.")
    test_rhs = test_df[['date']].merge(
        factors_df[['date'] + factors_combo], on='date', how='left', validate='many_to_one')
    factor_values = test_rhs[factors_combo].to_numpy(dtype=float)

    # Calc test yhats
    yhats = np.zeros(len(test_df))
    for asset in np.unique(test_df.asset.values):
        # if we did not have the test asset in training data, take average of beta hats
        asset_beta_hats = asset_beta_hats_dict.get(asset, avg_beta_hats)
        if asset_beta_hats is None:
            raise ValueError(
                f"No training data for any asset; cannot predict test asset {asset!r}.")

        # intercept + factors @ slopes
        asset_rows = (test_df.asset == asset).to_numpy()
        yhats[asset_rows] = asset_beta_hats[0] + factor_values[asset_rows] @ asset_beta_hats[1:]

    test_df['yhats'] = yhats

    return test_df


In [5]:
def determineFirstDayOfMonthsInTestPeriod(df: pd.DataFrame, test_start_date: str) -> List[str]:
    """
    List every month start from the test start date through the last date in the data.

    :param df: DataFrame whose 'date' column bounds the end of the test period.
    :param test_start_date: First day of the first test month, in the format '%Y-%m-%d'.

    :return: Month-start dates as '%Y-%m-%d' strings in chronological order;
        empty when the start date is after the last date in df.
    """
    date_format = '%Y-%m-%d'
    month_start = datetime.strptime(test_start_date, date_format)
    assert month_start.day == 1, "Test start date does not start on first day of a month."

    last_date = np.max(df.date)
    one_month = pd.DateOffset(months=1)

    # Step month by month until we move past the last observed date
    month_starts = []
    while month_start <= last_date:
        month_starts.append(month_start.strftime(date_format))
        month_start = month_start + one_month

    return month_starts


In [6]:
def cvFactorCombos(df: pd.DataFrame, asset_universe_dict: dict, lhs_col: str, test_start_date: str, 
    num_qntls_fctrs: int, num_cpus: int, num_factors: int) -> Tuple[list, float]:
    """
    Compute optimal combination of characteristics with the maximum predicted r-squared value.

    Every combination of num_factors characteristics is evaluated: for each month
    of the evaluation period the factors are rebuilt and the per-asset models
    refit on that month's asset universe, and the out-of-sample predictions are
    pooled across months into one predictive r-squared per combination.

    :param df: Input DataFrame with 'date', 'asset', lhs_col, 'r_ex_tp0', 'mcap' and
        characteristic columns; every other column is treated as a characteristic.
    :param asset_universe_dict: Dictionary of keys of study period months and values of lists of assets.
    :param lhs_col: Column name in df for left-hand side variable.
    :param test_start_date: The test start date in the format '%Y-%m-%d'.
    :param num_qntls_fctrs: Number of quantiles for factors.
    :param num_cpus: Number of CPU cores to use for parallel processing.
    :param num_factors: Number of factors for combinations.

    :return: A tuple containing the optimal characteristic combination (as a list)
        and its r-squared value. Combinations whose r-squared is NaN are never
        preferred over a combination with a finite value.
    """
    # Build list of all the char cols
    char_cols = [col for col in df.columns if col not in ['date', 'asset', lhs_col, 'r_ex_tp0', 'mcap']]

    # Form all combinations of factors
    char_combos = list(combinations(char_cols, num_factors))

    # Determine the first day of the month for all test period months
    test_period_months = determineFirstDayOfMonthsInTestPeriod(df, test_start_date)

    # Parallel loop over all combinations of characteristics to find combo with opt r2_pred
    def findOptCharCombo(char_combo):
        # convert char_combo to a list and form factor names ('char_xyz' -> 'factor_xyz')
        char_combo = list(char_combo)
        factors_combo = ['factor_'+col[5:] for col in char_combo]

        # iterate over all months in the test period to use the appropriate assets
        test_dfs = []
        for test_period_month in test_period_months:
            # form the relevant dataset: this month's universe, all history up to month end
            asset_universe = asset_universe_dict[test_period_month]
            one_month_ahead = datetime.strptime(test_period_month, '%Y-%m-%d') + pd.DateOffset(months=1)
            rel_df = df[(df.asset.isin(asset_universe)) 
                & (df.date < one_month_ahead)][
                ['date', 'asset', lhs_col, 'mcap', 'r_ex_tp0']+char_combo].copy()

            # form factors
            factors_df = buildFactorsDf(rel_df, lhs_col, num_qntls_fctrs)

            # fit and predict
            lhs_df = rel_df[['date', 'asset', lhs_col]].copy()
            test_df = fitAndPredict(lhs_df, lhs_col, factors_df, factors_combo, test_period_month)

            # save results across the test months
            test_dfs.append(test_df)
        
        # aggregate results across test period for this combo
        result_df = pd.concat(test_dfs)

        # calc r2 pred (benchmark is a zero forecast, not the sample mean)
        ys = result_df[lhs_col].values
        yhats = result_df['yhats'].values
        r2_pred = 1-np.mean(np.square(ys-yhats))/np.mean(np.square(ys))

        # return results
        return r2_pred

    # Run loop in parallel
    r2_pred_list = Parallel(n_jobs=num_cpus)(delayed(findOptCharCombo)(char_combo) for char_combo in tqdm(char_combos))

    # Determine optimal r2 pred and return it with associated combination.
    # np.argmax treats NaN as the maximum, so rank NaN scores last; if every
    # score is NaN this still falls back to the first combination.
    r2_preds = np.asarray(r2_pred_list, dtype=float)
    max_index = int(np.argmax(np.where(np.isnan(r2_preds), -np.inf, r2_preds)))
    return list(char_combos[max_index]), r2_pred_list[max_index]


In [7]:
def fitTestPeriod(df: pd.DataFrame, asset_universe_dict: dict, lhs_col: str, test_start_date: str, 
    num_qntls_fctrs: int, num_factors: int, opt_chars_combos_list: List[list]) -> pd.DataFrame:
    """
    Generate test period predictions for each selected characteristic combination.

    :param df: Input DataFrame with 'date', 'asset', lhs_col, 'r_ex_tp0', 'mcap' and characteristic columns.
    :param asset_universe_dict: Dictionary of keys of test period months and values of lists of assets.
    :param lhs_col: Column name in df for left-hand side variable.
    :param test_start_date: The test start date in the format '%Y-%m-%d'.
    :param num_qntls_fctrs: Number of quantiles for factors.
    :param num_factors: Not used by this function; kept so existing callers keep working.
        Each model is labelled by its 1-based position in opt_chars_combos_list.
    :param opt_chars_combos_list: One sequence of characteristic column names per model.

    :return: Test period rows ('date', 'asset', lhs_col, 'mcap') with one
        'yhats_<i>_factors' column per model. Rows are inner-joined, so only
        date-asset pairs predicted by every model are kept.
    """
    # Determine the first day of the month for all test period months
    test_period_months = determineFirstDayOfMonthsInTestPeriod(df, test_start_date)

    # Generate yhats for each model
    test_df = df[df.date >= test_start_date][['date', 'asset', lhs_col, 'mcap']].reset_index(drop=True).copy()
    for model_number, opt_chars_combo in enumerate(opt_chars_combos_list, start=1):
        # accept tuples as well as lists, and form factor names ('char_xyz' -> 'factor_xyz')
        opt_chars_combo = list(opt_chars_combo)
        factors_combo = ['factor_'+col[5:] for col in opt_chars_combo]

        # iterate over all the test period months to gen yhats
        temp_dfs = []
        for test_period_month in test_period_months:
            # form the relevant dataset: this month's universe, all history up to month end
            asset_universe = asset_universe_dict[test_period_month]
            one_month_ahead = datetime.strptime(test_period_month, '%Y-%m-%d') + pd.DateOffset(months=1)
            rel_df = df[(df.asset.isin(asset_universe)) 
                & (df.date < one_month_ahead)][
                ['date', 'asset', lhs_col, 'mcap', 'r_ex_tp0']+opt_chars_combo].copy()

            # form factors
            factors_df = buildFactorsDf(rel_df, lhs_col, num_qntls_fctrs)

            # fit and predict
            lhs_df = rel_df[['date', 'asset', lhs_col]].copy()
            temp_df = fitAndPredict(lhs_df, lhs_col, factors_df, factors_combo, test_period_month)

            # save results across the test months
            temp_dfs.append(temp_df)

        # aggregate results across the test period for this combo
        temp_df = pd.concat(temp_dfs)
        temp_df = temp_df.drop(lhs_col, axis=1)
        temp_df = temp_df.rename(columns={'yhats': 'yhats_'+str(model_number)+'_factors'})

        # merge results for this combo onto the main df
        test_df = test_df.merge(temp_df, on=['date', 'asset'], how='inner', validate='one_to_one')

    return test_df


In [8]:
def reportResults(opt_chars_combos_list: List[list],
    df: pd.DataFrame, lhs_col: str, num_factors: int, 
    num_qntls_prtls: int, periods_in_year: int, model_prefix: str,
    out_fp: str, out_sheet: str, mcap_weighted: bool
    ) -> None:
    """
    Generates and reports portfolio statistics for a given number of factors.

    :param opt_chars_combos_list: Characteristic names used by each model; entry i-1
        is recorded in the 'chars' column of the i-factor model's results.
    :param df: DataFrame containing the data to be analyzed, including one
        'yhats_<i>_factors' column per model and a 'macro_cmkt_tp7' column.
    :param lhs_col: Name of the t+1 returns column.
    :param num_factors: The number of factors to consider.
    :param num_qntls_prtls: Number of quantiles for portfolio.
    :param periods_in_year: Number of periods in a year.
    :param model_prefix: name of the model as a prefix for the results.
    :param out_fp: Filepath for the output Excel file.
    :param out_sheet: Sheet name for the output in the Excel file.
    :param mcap_weighted: Boolean indicating if the results are market capitalization weighted.

    :return: None. The results are saved directly to the Excel file: the sheet is
        replaced when the workbook already exists, otherwise the workbook is created.
    """
    import os  # local import: only needed to check whether the workbook exists

    # Generate results for each model
    results_frames = []
    for num_factor in range(1,1+num_factors):
        # Rename model's yhats and name
        yhat_col = 'yhats_'+str(num_factor)+'_factors'
        df = df.rename(columns={yhat_col: 'yhats'})
        model_name =  model_prefix+str(num_factor)

        # Generate this model's portfolio statistics
        temp_results_df = QuantTools.calcPortfolioStatistics(
            df, lhs_col, 'yhats', 'macro_cmkt_tp7', model_name, 
            num_qntls_prtls, periods_in_year, mcap_weighted
        )

        # Append variables used
        # NOTE: This is different for this notebook
        temp_results_df['chars'] = " ".join(opt_chars_combos_list[num_factor-1])

        # Collect results; concatenated once after the loop
        results_frames.append(temp_results_df)
        
        # Update yhat labels for next iteration
        df = df.rename(columns={'yhats': yhat_col})

    results_df = pd.concat(results_frames) if results_frames else pd.DataFrame()

    # Save the results. Append mode needs an existing workbook, and
    # if_sheet_exists is only accepted in append mode, so pick the mode by
    # whether the file is already there.
    if os.path.exists(out_fp):
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_kwargs = {'mode': 'w'}
    with pd.ExcelWriter(out_fp, engine='openpyxl', **writer_kwargs) as writer: 
        results_df.to_excel(writer, sheet_name=out_sheet)


In [9]:
if __name__ == "__main__":
    # set args
    IN_FP           = '../data/clean/panel_weekly.pkl'
    ASSET_IN_FP     = '../data/clean/asset_universe_dict.pickle'
    OUT_FP          = '../output/low_dim_fm/low_dim_fms.xlsx'
    DF_OUT_FP       = '../data/clean/test_yhats_multivariate.pkl'
    LHS_COL         = 'r_ex_tp7'
    VAL_START_DATE  = '2021-07-01'
    TEST_START_DATE = '2022-07-01'
    PERIODS_IN_YEAR = 52
    NUM_QNTLS_FCTRS = 5
    NUM_QNTLS_PRTLS = 5
    NUM_FACTORS     = 3
    NUM_CPUS        = 18
    # Set to True to rerun the validation search instead of using the stored result;
    # run time is ~30 hours for 3 factors; 30 days for just 4 factors
    RUN_VALIDATION  = False
    
    # read in data
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    all_df = pd.read_pickle(IN_FP)

    # remove unnecessary RHS columns
    df = subsetAndFormRhs(all_df, LHS_COL)

    # determine optimal chars for the 1 to NUM_FACTORS factor models
    if RUN_VALIDATION:
        # validation only sees data before the test period
        val_df = df[df.date<TEST_START_DATE].copy()
        opt_chars_combos_list = []
        opt_r2_pred_list = []
        for i in range(1,1+NUM_FACTORS):
            print(f'Running {i} factors validation.')
            opt_chars_combo, opt_r2_pred = cvFactorCombos(
                val_df, asset_universe_dict, LHS_COL, VAL_START_DATE, 
                NUM_QNTLS_FCTRS, NUM_CPUS, i)
            opt_chars_combos_list.append(opt_chars_combo)
            opt_r2_pred_list.append(opt_r2_pred)
            print(f'Selected {i} factors are: {opt_chars_combo} \n')
    else:
        # manually set from a previous validation run
        opt_chars_combos_list = [['char_mvrv_t'],
            ['char_delta_holders_dist_tm7', 'char_ask_size_t'],
            ['char_sent_neg_twitter_tm7', 'char_mvrv_t', 'char_var5_tm7']]
        opt_r2_pred_list = [-0.13420703668470768, -0.2017464388161141, -0.4618389903200375]

    # generate test period yhats
    test_df = fitTestPeriod(df, asset_universe_dict,
        LHS_COL, TEST_START_DATE, NUM_QNTLS_FCTRS, NUM_FACTORS,
        opt_chars_combos_list)
    
    # save yhats for later analysis
    out_df = test_df.copy()
    out_df = out_df.drop([LHS_COL, 'mcap'], axis=1)
    out_df.to_pickle(DF_OUT_FP)

    # Form cmkt over future horizon for calc 5-1 strat alpha and beta; add to test results
    # Sort by date first: shift(-1) is positional, so the next row must be the next date
    cmkt_df = all_df[['date', 'macro_cmkt_tm7']].drop_duplicates().sort_values('date', ignore_index=True)
    cmkt_df['macro_cmkt_tp7'] = cmkt_df.macro_cmkt_tm7.shift(-1)
    cmkt_df = cmkt_df.drop('macro_cmkt_tm7', axis=1)
    test_df = test_df.merge(cmkt_df, on=['date'], how='left', validate='many_to_one')

    # Report results: once mcap-weighted, once equal-weighted
    reportResults(opt_chars_combos_list,
        test_df, LHS_COL, NUM_FACTORS, NUM_QNTLS_PRTLS, PERIODS_IN_YEAR, 
        'multi_', OUT_FP, 'raw_multi_mcap', True)
    reportResults(opt_chars_combos_list,
        test_df, LHS_COL, NUM_FACTORS, NUM_QNTLS_PRTLS, PERIODS_IN_YEAR, 
        'multi_', OUT_FP, 'raw_multi_equal', False)


In [10]:
# TODO: also add a function that builds the Liu 3-factor model so its yhats can be included in the report produced by this notebook
