In [8]:
# Add the local quant_tools code directory to sys.path so that its modules can be imported below
import sys
sys.path.insert(0, '/home/adam/Dropbox/2-creations/2-crafts/7-buidl/0-utils/quant_tools/code')

# IMPORT PACKAGES
from ipca import InstrumentedPCA
from typing import List, Dict
from datetime import datetime
from tools import QuantTools
import statsmodels.api as sm
import pandas as pd
import numpy as np
import pickle


In [9]:
def formIndexCrosswalk(all_df: pd.DataFrame) -> pd.DataFrame:
    """
    Form a crosswalk dataframe from (date, asset) pairs to the integer indices used with the IPCA package.

    Args:
        all_df (pd.DataFrame): Panel containing at least the columns 'date' and 'asset'.

    Returns:
        pd.DataFrame: A copy of the 'date' and 'asset' columns (same rows, same index) with two
        added columns:
            - 'time': 1-based integer code of the date, assigned in sorted date order so that
              comparisons such as ``time < t`` are chronological whatever the row order of
              ``all_df`` is.
            - 'asset_num': 1-based integer code of the asset, in order of first appearance.
        Missing dates or assets receive the code 0 (factorize's -1 sentinel plus one).
    """
    cross_df = all_df[['date', 'asset']].copy()

    # sort=True: codes follow the sorted unique dates instead of the order of first appearance,
    # which would only be chronological if the input happened to be sorted by date already.
    cross_df['time'] = cross_df['date'].factorize(sort=True)[0] + 1

    # Asset codes carry no ordering meaning, so first-appearance order is kept.
    cross_df['asset_num'] = cross_df['asset'].factorize()[0] + 1
    return cross_df
    

In [10]:
def normalizeChars(df: pd.DataFrame, ipca_char_cols: List[str]) -> pd.DataFrame:
    """
    Normalizes the specified columns in the DataFrame using cross-sectional ranking.

    Within each group of rows sharing the same 'time' value, every listed column is
    jittered with tiny uniform noise (to break ties), ranked, divided by the number of
    rows in the group and shifted down by 0.5. With n rows in a group the resulting
    values therefore lie between 1/n - 0.5 and 0.5.

    The jitter is drawn from NumPy's global random state; call ``np.random.seed`` beforehand
    if reproducible tie-breaking is required. The noise magnitude is 1e-10, so it only
    separates ties reliably for values that are not many orders of magnitude larger than that.

    Args:
        df (pd.DataFrame): Input DataFrame; must contain a 'time' column and the columns
            listed in ``ipca_char_cols``. It is not modified.
        ipca_char_cols (List[str]): List of column names to be normalized.

    Returns:
        pd.DataFrame: A copy of ``df`` (same index, same columns) with the specified columns
        normalized. Missing characteristic values stay missing, but their rows still count
        towards the group size used as the divisor.
    """
    out_df = df.copy()

    # Number of rows in each row's 'time' group, aligned to the frame's index.
    group_sizes = out_df['time'].map(out_df['time'].value_counts())

    for col in ipca_char_cols:
        # Tiny random jitter so that tied values receive distinct ranks.
        noise = np.random.uniform(-1e-10, 1e-10, size=len(out_df))
        out_df[col] = out_df[col] + noise

        # Rank within each 'time' group, scale by group size and centre.
        out_df[col] = out_df.groupby('time')[col].rank() / group_sizes - 0.5

    return out_df
    

In [11]:
def fitAndPredict(
    in_df: pd.DataFrame, lhs_col: str, test_int: int, ipca_char_cols: List[str], num_factor: int, num_cpus: int
    ) -> pd.DataFrame:
    """
    Fit an InstrumentedPCA model on all periods before ``test_int`` and predict period ``test_int``.

    :param in_df: Panel with columns 'time', 'asset_num', ``lhs_col`` and every column in ``ipca_char_cols``.
    :param lhs_col: Name of the column holding the variable to predict.
    :param test_int: Period to predict; rows with ``time < test_int`` are used for fitting.
    :param ipca_char_cols: Names of the characteristic columns passed to IPCA.
    :param num_factor: Number of factors for the IPCA model.
    :param num_cpus: Value forwarded to the ``n_jobs`` argument of the fit call.

    :return: One row per out-of-sample observation with columns 'time', 'asset_num', and 'yhats',
             ordered by 'asset_num' within the test period.
    """
    panel_index = ['asset_num', 'time']
    id_cols = ['time', 'asset_num']

    def _indexed_chars(rows: pd.DataFrame) -> pd.DataFrame:
        # Characteristics as float64, indexed by (asset_num, time); duplicate keys raise.
        chars = rows[id_cols + ipca_char_cols].copy()
        chars = chars.set_index(keys=panel_index, verify_integrity=True)
        return chars.astype('float64')

    # Order the panel by period then asset so predictions line up with the returned rows
    ordered_df = in_df.sort_values(by=['time', 'asset_num'], ignore_index=True)
    train_rows = ordered_df[ordered_df.time < test_int].copy()
    oos_rows = ordered_df[ordered_df.time == test_int].copy()

    # Training target as a float64 Series on the panel index
    target = train_rows[id_cols + [lhs_col]].copy()
    target[lhs_col] = target[lhs_col].astype('float64')
    target = target.set_index(keys=panel_index, verify_integrity=True)
    Y_train = target.squeeze()

    # Training and out-of-sample characteristics
    X_train = _indexed_chars(train_rows)
    X_oos = _indexed_chars(oos_rows)

    # Estimate the model on the training window
    model = InstrumentedPCA(n_factors=num_factor, intercept=True)
    model = model.fit(X=X_train, y=Y_train, data_type='panel', n_jobs=num_cpus)

    # Out-of-sample predictions
    predictions = model.predict(X=X_oos, mean_factor=True)

    # Attach predictions positionally to the identifiers of the test-period rows
    oos_df = ordered_df.loc[ordered_df.time == test_int, id_cols].reset_index(drop=True)
    oos_df['yhats'] = predictions

    return oos_df


In [12]:
def fitAndPredictTestPeriod(
    df: pd.DataFrame, cross_df: pd.DataFrame, asset_universe_dict: dict, lhs_col: str, 
    test_start_int: int, num_factors: int, ipca_char_cols: List[str], num_cpus: int
    ) -> pd.DataFrame:
    """
    Generate out-of-sample predictions for every test period and every factor count.

    For each period ``t >= test_start_int`` present in ``df`` the asset universe of that
    period's calendar month is looked up, the panel is restricted to those assets and to
    periods up to and including ``t``, and one model per factor count 1..``num_factors``
    is fit and used to predict period ``t`` via ``fitAndPredict``.

    :param df: Panel with columns 'time', 'asset_num', ``lhs_col`` and the characteristic columns.
    :param cross_df: Crosswalk with columns 'date', 'asset', 'time' and 'asset_num'.
    :param asset_universe_dict: Maps 'YYYY-MM-01' strings to the collection of asset names in
        that month's universe. A missing month raises KeyError.
    :param lhs_col: Name of the column holding the variable to predict.
    :param test_start_int: First period (inclusive) of the test window.
    :param num_factors: Largest factor count; models with 1..num_factors factors are fit.
    :param ipca_char_cols: Names of the characteristic columns.
    :param num_cpus: Forwarded to ``fitAndPredict``.

    :return: DataFrame with 'time', 'asset_num', ``lhs_col`` and one column
        'yhats_<k>_factors' per factor count. Only test rows that received a prediction
        from every model are kept (inner merges).
    :raises ValueError: If models are requested but ``df`` has no period at or after ``test_start_int``.
    """
    # Initialize object for results
    in_test = df.time >= test_start_int
    test_df = df.loc[in_test, ['time', 'asset_num', lhs_col]].reset_index(drop=True)

    # Obtain all test period integers
    test_ints = np.unique(df.loc[in_test, 'time'].values)
    factor_counts = range(1, 1 + num_factors)

    if num_factors >= 1 and len(test_ints) == 0:
        raise ValueError(
            'No periods at or after test_start_int=' + str(test_start_int) + ' to predict.')

    # The universe and the training subset depend only on the test week, not on the number
    # of factors, so they are built once per week and shared by all models for that week.
    yhat_dfs = {num_factor: [] for num_factor in factor_counts}
    for test_int in test_ints:
        # form the relevant asset universe: keyed by the first day of the test week's month
        test_week = np.unique(cross_df[cross_df.time == test_int].date.values)[0]
        first_day_of_month = np.datetime_as_string(np.datetime64(test_week, 'M')) + '-01'
        asset_universe = asset_universe_dict[first_day_of_month]
        asset_universe_ints = list(
            np.unique(cross_df[cross_df.asset.isin(asset_universe)].asset_num.values))

        # form the relevant dataset: universe assets, history up to and including the test week
        rel_df = df[(df.asset_num.isin(asset_universe_ints))
            & (df.time <= test_int)].copy()

        # fit and predict with every model size
        for num_factor in factor_counts:
            yhat_dfs[num_factor].append(
                fitAndPredict(rel_df, lhs_col, test_int, ipca_char_cols, num_factor, num_cpus))

    # aggregate each model's weekly predictions and merge them onto the main df
    for num_factor in factor_counts:
        model_df = pd.concat(yhat_dfs[num_factor])
        model_df = model_df.rename(columns={'yhats': 'yhats_'+str(num_factor)+'_factors'})
        test_df = test_df.merge(model_df, on=['time', 'asset_num'], how='inner', validate='one_to_one')

    return test_df


In [13]:
def reportResults(
    df: pd.DataFrame, lhs_col: str, num_factors: int, 
    num_qntls_prtls: int, periods_in_year: int, model_prefix: str,
    out_fp: str, out_sheet: str, mcap_weighted: bool
    ) -> None:
    """
    Generates and reports portfolio statistics for a given number of factors.

    For each factor count k in 1..num_factors the column 'yhats_<k>_factors' is presented
    to ``QuantTools.calcPortfolioStatistics`` under the name 'yhats', and the per-model
    results are stacked and written to one Excel sheet.

    :param df: DataFrame containing the data to be analyzed; must hold the
        'yhats_<k>_factors' columns and a 'macro_cmkt_tp7' column. It is not modified here.
    :param lhs_col: Name of the t+1 returns column.
    :param num_factors: The number of factors to consider.
    :param num_qntls_prtls: Number of quantiles for portfolio.
    :param periods_in_year: Number of periods in a year.
    :param model_prefix: name of the model as a prefix for the results.
    :param out_fp: Filepath for the output Excel file. If the file exists the sheet is added
        to it (replacing a sheet of the same name); otherwise a new workbook is created.
    :param out_sheet: Sheet name for the output in the Excel file.
    :param mcap_weighted: Boolean indicating if the results are market capitalization weighted.

    :return: None. The results are saved directly to the Excel file.
    """
    import os  # local import: only needed to choose the Excel write mode

    # Generate results for each model
    model_results = []
    for num_factor in range(1, 1+num_factors):
        # Expose this model's yhats under the generic name; the caller's frame is left as is
        model_df = df.rename(columns={'yhats_'+str(num_factor)+'_factors': 'yhats'})
        model_name = model_prefix+str(num_factor)

        # Generate this model's portfolio statistics
        model_results.append(QuantTools.calcPortfolioStatistics(
            model_df, lhs_col, 'yhats', 'macro_cmkt_tp7', model_name, 
            num_qntls_prtls, periods_in_year, mcap_weighted
        ))

    # Stack once instead of growing a frame inside the loop
    results_df = pd.concat(model_results) if model_results else pd.DataFrame()

    # Append mode requires an existing workbook, and if_sheet_exists is only accepted in
    # append mode, so fall back to creating the file on a first run.
    if os.path.exists(out_fp):
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_kwargs = {'mode': 'w'}

    # Save the results
    with pd.ExcelWriter(out_fp, engine='openpyxl', **writer_kwargs) as writer: 
        results_df.to_excel(writer, sheet_name=out_sheet)


In [None]:
if __name__ == "__main__":
    # set args
    IN_FP           = '../data/clean/panel_weekly.pkl'
    ASSET_IN_FP     = '../data/clean/asset_universe_dict.pickle'
    DF_OUT_FP       = '../data/clean/test_yhats_ipca.pkl'
    OUT_FP          = '../output/low_dim_fm/low_dim_fms.xlsx'
    OUT_SHEET       = 'raw_ipca'
    LHS_COL         = 'r_ex_tp7'
    VAL_START_DATE  = '2021-07-01'  # not used in this script
    TEST_START_DATE = '2022-07-01'
    PERIODS_IN_YEAR = 52
    NUM_QNTLS_PRTLS = 5
    NUM_FACTORS     = 5
    IPCA_CHAR_COLS = ['char_addr_new_log_delta_tm2_tm1',
        'char_beta_tm7',
        'char_iskew_tm30',
        'char_r_tm14',
        'char_shortfall5_tm7',
        'char_trades_t']
    NUM_CPUS = 22
    SEED = 42

    # Fix the global NumPy random state so the random tie-breaking in normalizeChars
    # gives the same result on every run.
    np.random.seed(SEED)

    # read in data
    # NOTE: pickle can execute arbitrary code on load; only read files produced by this project.
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    all_df = pd.read_pickle(IN_FP)

    # form index crosswalk
    cross_df = formIndexCrosswalk(all_df)
    all_df = all_df.merge(cross_df, on=['date', 'asset'], how='inner', validate='one_to_one')

    # subset and normalize data
    df = all_df[['time', 'asset_num', LHS_COL]+IPCA_CHAR_COLS].copy()
    df = normalizeChars(df, IPCA_CHAR_COLS)

    # convert test start date to an integer: first panel date on or after TEST_START_DATE
    test_start_week_datetime = np.min(all_df[all_df.date >= TEST_START_DATE].date.values)
    test_start_int = np.min(all_df[all_df.date==test_start_week_datetime].time)

    # generate test period yhats
    test_df = fitAndPredictTestPeriod(
        df, cross_df, asset_universe_dict, LHS_COL, 
        test_start_int, NUM_FACTORS, IPCA_CHAR_COLS, NUM_CPUS)
        
    # form dataframe to use for reporting results: back to (date, asset) keys plus market cap
    t_df = test_df.merge(cross_df, on=['time', 'asset_num'], how='inner', validate='one_to_one')
    t_df = t_df.drop(['time', 'asset_num'], axis=1)
    mcap_df = all_df[['date', 'asset', 'char_size_t']].copy()
    mcap_df = mcap_df.rename(columns={'char_size_t': 'mcap'})
    t_df = mcap_df.merge(t_df, on=['date', 'asset'], how='right', validate='one_to_one')

    # save ipca yhats
    out_df = t_df.drop(['mcap', LHS_COL], axis=1)
    out_df.to_pickle(DF_OUT_FP)

    # Form cmkt over future horizon for calc 5-1 strat alpha and beta; add to test results
    # Sort by date first: shift(-1) takes the value from the next row, which is only the
    # next period's value when the rows are in chronological order.
    cmkt_df = all_df[['date', 'macro_cmkt_tm7']].drop_duplicates().sort_values('date').copy()
    cmkt_df['macro_cmkt_tp7'] = cmkt_df.macro_cmkt_tm7.shift(-1)
    cmkt_df = cmkt_df.drop('macro_cmkt_tm7', axis=1)
    t_df = t_df.merge(cmkt_df, on=['date'], how='left', validate='many_to_one')

    # Report results: market-cap weighted and equal weighted, one sheet each
    reportResults(
        t_df, LHS_COL, NUM_FACTORS, NUM_QNTLS_PRTLS, PERIODS_IN_YEAR, 
        'ipca_', OUT_FP, OUT_SHEET+'_mcap', True)
    reportResults(
        t_df, LHS_COL, NUM_FACTORS, NUM_QNTLS_PRTLS, PERIODS_IN_YEAR, 
        'ipca_', OUT_FP, OUT_SHEET+'_equal', False)
