In [1]:
# Add the quant_tools code directory to sys.path so that `tools.QuantTools` can be imported below.
import sys
sys.path.insert(0, '/home/adam/Dropbox/2-creations/2-crafts/7-buidl/0-utils/quant_tools/code')

# IMPORT PACKAGES
from typing import List
from sklearn import decomposition
from datetime import datetime
from tools import QuantTools
import statsmodels.api as sm
import pandas as pd
import numpy as np
import pickle


In [2]:
def subsetRhsAndRows(df: pd.DataFrame, lhs_col: str) -> pd.DataFrame:
    """
    Rename the size and lagged-return characteristics and keep only the modelling columns.

    :param df: Panel containing 'date', 'asset', lhs_col, 'char_size_t' and 'char_r_tm7' columns.
    :param lhs_col: The name of the return column to keep as the left-hand-side variable.

    :return: A copy holding 'date', 'asset', lhs_col, 'r_ex_tp0' and 'mcap', in that order.
        The input DataFrame is left untouched.
    """
    # 'char_size_t' -> 'mcap' (used for mcap-weighted averages); 'char_r_tm7' -> 'r_ex_tp0'
    new_names = {'char_size_t': 'mcap', 'char_r_tm7': 'r_ex_tp0'}
    kept_cols = ['date', 'asset', lhs_col, 'r_ex_tp0', 'mcap']

    renamed_df = df.rename(columns=new_names)
    return renamed_df[kept_cols].copy()


In [3]:
def determineFirstDayOfMonthsInTestPeriod(df: pd.DataFrame, test_start_date: str) -> List[str]:
    """
    List the month-start dates from the test start date through the last date in the data.

    :param df: DataFrame whose 'date' column bounds the test period from above.
    :param test_start_date: First day of the test period, formatted '%Y-%m-%d';
        must fall on the first day of a month.

    :return: Month-start dates as '%Y-%m-%d' strings, in chronological order, for every
        month start that is on or before the maximum date in df.
    """
    start = datetime.strptime(test_start_date, '%Y-%m-%d')
    assert start.day == 1, "Test start date does not start on first day of a month."

    last_observed = np.max(df.date)
    one_month = pd.DateOffset(months=1)

    # Walk forward one calendar month at a time until we pass the end of the data
    month_starts = []
    cursor = start
    while cursor <= last_observed:
        month_starts.append(cursor.strftime('%Y-%m-%d'))
        cursor = cursor + one_month

    return month_starts


In [4]:
def buildFactorsWithPCA(in_df: pd.DataFrame, lhs_col: str, num_factor: int) -> pd.DataFrame:
    """
    Constructs factors using Principal Component Analysis (PCA) on a panel of asset returns.

    :param in_df: DataFrame containing the panel data of asset returns. Must have 'date',
        'asset', and lhs_col columns, with at most one row per (date, asset) pair.
    :param lhs_col: The name of the column in in_df holding the returns to decompose.
    :param num_factor: The number of principal components to calc for factors.
    :return: DataFrame with a 'date' column and columns 'pca1', ..., 'pca<num_factor>',
        one row per date in in_df.
    :raises ValueError: If num_factor is not between 1 and the number of assets.
    :raises AssertionError: If some date has no observed return at all, so the
        cross-sectional fill cannot remove its missing values.

    The function reshapes the data into a T (time) x N (assets) matrix, fills missing
    entries with that date's cross-sectional mean, and computes the N x N sample
    covariance matrix of the column-demeaned returns. The loadings are the eigenvectors
    of that covariance matrix belonging to the num_factor largest eigenvalues, and the
    factors are the (non-demeaned) returns projected onto those loadings.
    """
    # Reshape to TxN matrix of returns
    wide_df = in_df.pivot(index='date', columns='asset', values=lhs_col)
    returns_mat = wide_df.to_numpy(dtype=np.float64, na_value=np.nan) # NOTE: T x N

    num_assets = returns_mat.shape[1]
    if not 1 <= num_factor <= num_assets:
        raise ValueError(
            f"num_factor must be between 1 and the number of assets ({num_assets}); "
            f"got {num_factor}.")

    # Fill missing with cross-sectional average (vectorised over all dates at once)
    is_missing = np.isnan(returns_mat)
    if is_missing.any():
        obs_counts = (~is_missing).sum(axis=1, keepdims=True)
        obs_sums = np.where(is_missing, 0.0, returns_mat).sum(axis=1, keepdims=True)
        # A date with no observations yields 0/0 = NaN here and is caught by the assert
        with np.errstate(invalid='ignore', divide='ignore'):
            row_means = obs_sums / obs_counts
        returns_mat = np.where(is_missing, row_means, returns_mat)
    assert not np.isnan(returns_mat).any(), \
        "At least one date has no observed returns to form a cross-sectional average."

    returns_demeaned_mat = returns_mat - returns_mat.mean(axis=0)

    # Form sample covariance matrix
    cov = (np.matmul(np.transpose(returns_demeaned_mat), returns_demeaned_mat)
        / returns_demeaned_mat.shape[0])

    # Form eigenvectors for given number of factors.
    # The loadings must be eigenvectors of cov itself. Fitting a PCA object on cov and
    # taking fit_transform(cov) would instead give the rows of the column-centred cov
    # projected onto its components; each such vector sums to zero across assets, which
    # strips out the market-level component.
    # eigh returns eigenvalues in ascending order, so reverse to put the largest first.
    _, eig_vecs = np.linalg.eigh(cov)
    pcs = eig_vecs[:, ::-1][:, :num_factor] # N x num_factor pc's

    # Eigenvector signs are arbitrary; make each loading vector's largest-magnitude
    # entry positive so the factors are reproducible across runs and platforms.
    anchor_rows = np.abs(pcs).argmax(axis=0)
    signs = np.sign(pcs[anchor_rows, np.arange(num_factor)])
    signs[signs == 0] = 1.0
    pcs = pcs * signs

    # Form the factors
    factors = np.matmul(returns_mat, pcs)
    factors_df = pd.DataFrame(index=wide_df.index,
        data=factors, columns=['pca'+str(i) for i in range(1,1+num_factor)])

    return factors_df.reset_index()


In [5]:
def fitAndPredict(
    lhs_df: pd.DataFrame, lhs_col: str, factors_df: pd.DataFrame, oos_start_date: str
    ) -> pd.DataFrame:
    """
    Fits a multiple linear regression model to the given factors and asset returns, 
        and then uses the fitted model to predict out-of-sample (OOS) returns.

    :param lhs_df: DataFrame containing the left-hand-side (dependent) data, 
        including 'date', 'asset', and the specified lhs_col (return data).
    :param lhs_col: The name of the column in lhs_df containing the t+1 asset returns.
    :param factors_df: DataFrame containing the factor data along with a 'date' column,
        one row per date.
    :param oos_start_date: The start date for the out-of-sample predictions.
    :return: DataFrame containing the out-of-sample predictions ('yhats') 
        along with the corresponding 'date', 'asset', and observed t+1 returns,
        sorted by date then asset.
    :raises ValueError: If an out-of-sample date has no row in factors_df, or if an
        out-of-sample asset needs the average beta hats but there is no training data.

    The function first calculates the beta coefficients (beta hats) for each asset 
        using the data before the oos_start_date. Then, it uses these coefficients 
        to make out-of-sample predictions for each asset for dates on or after the oos_start_date.

    If an asset in the test period does not exist in the training data, 
        the average of the beta hats from the training data is used for prediction.

    Note: The lhs_df and factors_df DataFrames must have a 'date' column,
        and the dates in lhs_df must correspond to the dates in factors_df.
    """

    # Form columns in factor df
    factor_cols = [col for col in factors_df.columns if col != 'date']

    # Subset the training rows once rather than re-filtering the full panel per asset
    train_df = lhs_df.loc[lhs_df.date < oos_start_date, ['date', 'asset', lhs_col]]

    # Calc each asset's beta hats
    asset_beta_hats_dict = {}
    for asset in np.unique(train_df.asset.values):
        # form relevant data
        asset_train_df = train_df.loc[train_df.asset == asset, ['date', lhs_col]].merge(
            factors_df, on='date', how='left', validate='one_to_one')

        # has_constant='add' forces the intercept column. The default ('skip') silently
        # omits it whenever a factor column is constant, which always happens when the
        # asset has a single observation, and would leave the beta vector one short.
        train_rhs = sm.add_constant(asset_train_df[factor_cols], has_constant='add')

        # calc and save beta hats (intercept first, then one per factor)
        results = sm.OLS(asset_train_df[lhs_col], train_rhs).fit()
        asset_beta_hats_dict[asset] = results.params.values

    # Calc avg beta hat in case we need it
    if asset_beta_hats_dict:
        avg_beta_hats = np.mean(list(asset_beta_hats_dict.values()), axis=0)
    else:
        avg_beta_hats = None

    # Calc test yhats
    oos_df = lhs_df.loc[lhs_df.date >= oos_start_date, ['date', 'asset', lhs_col]].copy()
    oos_df = oos_df.sort_values(by=['date', 'asset'], ignore_index=True)
    oos_df['yhats'] = np.zeros(len(oos_df))

    # Every out-of-sample date needs factor realisations, otherwise yhats would be NaN
    has_factors = oos_df.date.isin(factors_df.date)
    if not has_factors.all():
        raise ValueError("Some out-of-sample dates in lhs_df have no row in factors_df.")

    for asset in np.unique(oos_df.asset.values):
        asset_rows = oos_df.asset == asset

        # if we did not have the test asset in training data, take average of beta hats
        asset_beta_hats = asset_beta_hats_dict.get(asset, avg_beta_hats)
        if asset_beta_hats is None:
            raise ValueError(
                "No training data before oos_start_date, so there are no beta hats to average.")

        # form asset test rhs; merging on date keeps each factor row aligned with the
        # matching oos_df row instead of relying on the two frames' row order
        asset_test_rhs = oos_df.loc[asset_rows, ['date']].merge(
            factors_df, on='date', how='left', validate='many_to_one')[factor_cols]
        asset_test_rhs = sm.add_constant(asset_test_rhs, has_constant='add')

        # calc and save asset yhats
        oos_df.loc[asset_rows, 'yhats'] = np.matmul(asset_test_rhs.values, asset_beta_hats)

    return oos_df


In [6]:
def fitAndPredictTestPeriod(
    df: pd.DataFrame, asset_universe_dict: dict, lhs_col: str, test_start_date: str, num_factors: int
    ) -> pd.DataFrame:
    """
    Fits a multiple linear regression model using given number of factors built with PCA
        and predicts asset returns for the test period.

    :param df: DataFrame containing asset information, 
        including 'date', 'asset', lhs_col (return data) and 'r_ex_tp0' columns.
    :param asset_universe_dict: Dictionary mapping each test month's first day
        ('%Y-%m-%d' string) to the assets that are relevant for that month.
    :param lhs_col: The name of the column in df containing the asset returns.
    :param test_start_date: The start date for the test period predictions.
    :param num_factors: The maximum number of factors to consider in the PCA.
    :return: DataFrame with 'date', 'asset', lhs_col and one 'yhats_<k>_factors'
        column for every k from 1 to num_factors.
    :raises ValueError: If the test period contains no months.
    :raises KeyError: If asset_universe_dict has no entry for a test month.

    For every test month the function takes all rows of that month's asset universe
        dated before the start of the following month, builds PCA factors from
        'r_ex_tp0' with buildFactorsWithPCA, and calls fitAndPredict with the month's
        first day as the out-of-sample start. This is repeated for each number of
        factors from 1 to num_factors, and the predictions are merged (inner join on
        date and asset) onto the test-period rows of df.

    Note: This function relies on the determineFirstDayOfMonthsInTestPeriod function 
        to get the first day of the month for all test period months, and on the 
        buildFactorsWithPCA and fitAndPredict functions for factor construction and prediction.
    """

    # Determine the first day of the month for all test period months
    test_period_months = determineFirstDayOfMonthsInTestPeriod(df, test_start_date)
    if not test_period_months:
        raise ValueError("No test period months: df has no dates on or after test_start_date.")

    # Generate yhats for each model.
    # Months form the outer loop so that each month's dataset is subset once and then
    # shared by every factor count, rather than being rebuilt num_factors times.
    factor_counts = range(1,1+num_factors)
    yhat_dfs_by_num_factor = {num_factor: [] for num_factor in factor_counts}
    for test_period_month in test_period_months:
        # form the relevant dataset
        asset_universe = asset_universe_dict[test_period_month]
        one_month_ahead = datetime.strptime(test_period_month, '%Y-%m-%d') + pd.DateOffset(months=1)
        rel_df = df[(df.asset.isin(asset_universe)) 
            & (df.date < one_month_ahead)][['date', 'asset', lhs_col, 'r_ex_tp0']].copy()
        lhs_df = rel_df[['date', 'asset', lhs_col]].copy()

        for num_factor in factor_counts:
            # form factors
            factors_df = buildFactorsWithPCA(rel_df, 'r_ex_tp0', num_factor)

            # fit and predict; save results across the test months
            yhat_dfs_by_num_factor[num_factor].append(
                fitAndPredict(lhs_df, lhs_col, factors_df, test_period_month))

    test_df = df[df.date >= test_start_date][['date', 'asset', lhs_col]].reset_index(drop=True).copy()
    for num_factor, temp_dfs in yhat_dfs_by_num_factor.items():
        # aggregate results across the test period for this combo
        temp_df = pd.concat(temp_dfs)
        temp_df = temp_df.drop(lhs_col, axis=1)
        temp_df = temp_df.rename(columns={'yhats': 'yhats_'+str(num_factor)+'_factors'})

        # merge results for this combo onto the main df
        test_df = test_df.merge(temp_df, on=['date', 'asset'], how='inner', validate='one_to_one')

    return test_df


In [7]:
def reportResults(
    df: pd.DataFrame, lhs_col: str, num_factors: int, 
    num_qntls_prtls: int, periods_in_year: int, model_prefix: str,
    out_fp: str, out_sheet: str, mcap_weighted: bool
    ) -> None:
    """
    Generates and reports portfolio statistics for a given number of factors.

    :param df: DataFrame containing the data to be analyzed, with one
        'yhats_<k>_factors' column for every k from 1 to num_factors.
    :param lhs_col: Name of the t+1 returns column.
    :param num_factors: The number of factors to consider.
    :param num_qntls_prtls: Number of quantiles for portfolio.
    :param periods_in_year: Number of periods in a year.
    :param model_prefix: name of the model as a prefix for the results.
    :param out_fp: Filepath for the output Excel file.
    :param out_sheet: Sheet name for the output in the Excel file.
    :param mcap_weighted: Boolean indicating if the results are market capitalization weighted.

    :return: None. The results are saved directly to the Excel file. If the file already
        exists, only the out_sheet sheet is replaced; otherwise the file is created.
    """
    import os  # local import: only needed to check whether the workbook exists

    # Generate results for each model
    model_results = []
    for num_factor in range(1,1+num_factors):
        # Expose this model's predictions under the generic 'yhats' name on a renamed
        # copy, so the caller's column labels never have to be restored afterwards
        model_df = df.rename(columns={'yhats_'+str(num_factor)+'_factors': 'yhats'})
        model_name = model_prefix+str(num_factor)

        # Generate this model's portfolio statistics
        model_results.append(QuantTools.calcPortfolioStatistics(
            model_df, lhs_col, 'yhats', 'macro_cmkt_tp7', model_name, 
            num_qntls_prtls, periods_in_year, mcap_weighted
        ))

    # Stack once at the end instead of growing a DataFrame inside the loop
    results_df = pd.concat(model_results) if model_results else pd.DataFrame()

    # Save the results. Append mode requires an existing workbook, and if_sheet_exists
    # is only accepted in append mode, so a missing file is written from scratch.
    if os.path.exists(out_fp):
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_kwargs = {'mode': 'w'}
    with pd.ExcelWriter(out_fp, engine='openpyxl', **writer_kwargs) as writer: 
        results_df.to_excel(writer, sheet_name=out_sheet)


In [8]:
if __name__ == "__main__":
    # set args
    IN_FP           = '../data/clean/panel_weekly.pkl'
    ASSET_IN_FP     = '../data/clean/asset_universe_dict.pickle'
    DF_OUT_FP       = '../data/clean/test_yhats_pca.pkl'
    OUT_FP          = '../output/low_dim_fm/low_dim_fms.xlsx'
    LHS_COL         = 'r_ex_tp7'
    VAL_START_DATE  = '2021-07-01'
    TEST_START_DATE = '2022-07-01'
    PERIODS_IN_YEAR = 52
    NUM_QNTLS_PRTLS = 5
    NUM_FACTORS     = 7
    
    # read in data
    # NOTE: unpickling can execute arbitrary code; only load files produced by this
    # project's own data pipeline.
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    all_df = pd.read_pickle(IN_FP)

    # subset data
    df = subsetRhsAndRows(all_df, LHS_COL)

    # generate test period yhats
    test_df = fitAndPredictTestPeriod(
        df, asset_universe_dict, LHS_COL, TEST_START_DATE, NUM_FACTORS)

    # save yhats for later analysis
    out_df = test_df.drop(LHS_COL, axis=1)
    out_df.to_pickle(DF_OUT_FP)

    # Form cmkt over future horizon for calc 5-1 strat alpha and beta; add to test results.
    # Sort by date before shifting: shift(-1) pulls the value from the next ROW, so the
    # rows must be in chronological order regardless of how the panel happens to be ordered.
    cmkt_df = all_df[['date', 'macro_cmkt_tm7']].drop_duplicates()
    cmkt_df = cmkt_df.sort_values(by='date', ignore_index=True)
    cmkt_df['macro_cmkt_tp7'] = cmkt_df.macro_cmkt_tm7.shift(-1)
    cmkt_df = cmkt_df.drop('macro_cmkt_tm7', axis=1)
    test_df = test_df.merge(cmkt_df, on=['date'], how='left', validate='many_to_one')
    
    # Update test results with mcap measure too
    test_df = test_df.merge(
        df[['date', 'asset', 'mcap']], on=['date', 'asset'], how='left', validate='one_to_one')

    # report results for both mcap and equal weighted ports
    reportResults(test_df, LHS_COL, NUM_FACTORS, NUM_QNTLS_PRTLS, PERIODS_IN_YEAR, 
        'pca_', OUT_FP, 'raw_pca_mcap', True)
    reportResults(test_df, LHS_COL, NUM_FACTORS, NUM_QNTLS_PRTLS, PERIODS_IN_YEAR, 
        'pca_', OUT_FP, 'raw_pca_equal', False)
