In [1]:
# Add the local quant_tools code directory to sys.path so that `tools.QuantTools` can be imported.
import sys
sys.path.insert(0, '/home/adam/Dropbox/2-creations/2-crafts/7-buidl/0-utils/quant_tools/code')

# IMPORT PACKAGES
from typing import List, Dict
from tools import QuantTools
import statsmodels.api as sm
import pandas as pd
import numpy as np
import pickle


In [2]:
def subsetToAssetUniverse(df: pd.DataFrame, asset_universe_dict: Dict[str, List[str]]) -> pd.DataFrame:
    """
    Subset a DataFrame based on a dictionary of asset universes.

    For every key of `asset_universe_dict`, the year and month are taken from
    the key; rows dated in that year-month are kept only when their asset is in
    the key's list. Rows dated in a month that no key refers to are kept.

    The caller's DataFrame is not modified: when the "date" column has to be
    converted to datetime, the conversion is done on a new frame.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame. Must contain columns "date" and "asset".
    asset_universe_dict : Dict[str, List[str]]
        A dictionary where keys are dates in 'YYYY-MM-DD' format and values are lists of asset names.

    Returns
    -------
    pd.DataFrame
        The subsetted DataFrame, with a datetime "date" column.

    Raises
    ------
    ValueError
        If `df` lacks a "date" or "asset" column.
    """
    # Check that the required columns are present in the DataFrame
    if not {'date', 'asset'}.issubset(df.columns):
        raise ValueError('Input DataFrame must contain "date" and "asset" columns.')

    # Ensure that the 'date' column is of datetime type. `assign` returns a new
    # frame, so the caller's object keeps its original column untouched.
    if not pd.api.types.is_datetime64_any_dtype(df['date']):
        df = df.assign(date=pd.to_datetime(df['date']))

    # Extract year and month once instead of once per dictionary key
    years = df['date'].dt.year.to_numpy()
    months = df['date'].dt.month.to_numpy()

    # Accumulate the rows to drop across all months, then filter a single time
    drop_mask = np.zeros(len(df), dtype=bool)
    for key, values in asset_universe_dict.items():
        # Extract the year and month from the key
        year, month = key.split('-')[:2]

        # Flag rows which match the year and month but not the assets
        in_month = (years == int(year)) & (months == int(month))
        if in_month.any():
            drop_mask |= in_month & ~df['asset'].isin(values).to_numpy()

    return df.loc[~drop_mask]


In [3]:
def formUniFactor(in_df: pd.DataFrame, rhs_col: str, lhs_col: str, num_quantiles: int=5) -> pd.DataFrame:
    """
    Build a high-minus-low return series from a sort on a single column.

    Within each date, rows are ranked on `rhs_col` and split into
    `num_quantiles` equally sized rank buckets. For every (date, bucket) the
    mcap-weighted average of `lhs_col` is computed, and the result is the top
    bucket's average minus the bottom bucket's average per date.

    Rows with a missing `lhs_col` value contribute to neither the numerator nor
    the denominator of the weighted average. Rows with a missing `rhs_col`
    value get no bucket and are left out entirely.

    Parameters
    ----------
    in_df : pd.DataFrame
        Must contain "date", "asset", "char_mcap_t", `lhs_col` and `rhs_col`.
        It is not modified.
    rhs_col : str
        Column to sort on. Its first five characters are dropped and the rest
        is appended to "factor_" to name the output column
        (presumably the dropped part is a "char_" prefix -- verify against caller).
    lhs_col : str
        Column that is averaged within each bucket.
    num_quantiles : int, default 5
        Number of rank buckets per date.

    Returns
    -------
    pd.DataFrame
        Columns "date" and "factor_<name>", one row per date.
    """
    # Obtain factor name
    factor_col = 'factor_'+rhs_col[5:]

    # Obtain relevant data
    t_df = in_df[['date', 'asset', lhs_col, rhs_col, 'char_mcap_t']].copy()
    t_df = t_df.rename(columns={'char_mcap_t': 'mcap'})

    # Form quantiles: the percentile rank is rank / (number of ranked rows in
    # the date), which lies in (0, 1] and is cut into equal-width buckets.
    t_df['rank_ratio'] = t_df.groupby('date')[rhs_col].rank(method='first', pct=True)
    quantile_bins = list(np.arange(0, num_quantiles+1)/num_quantiles)
    t_df['quant'] = 1+pd.cut(t_df['rank_ratio'], bins=quantile_bins, labels=False, include_lowest=True)
    t_df = t_df.drop(columns=['rank_ratio'])

    # Calculate the mcap-weighted average return for each quantile within each date.
    # `sum()` skips NaN products, so the weight of a row without a return must
    # be blanked as well; otherwise its mcap would inflate the denominator.
    t_df['weighted_return'] = t_df[lhs_col] * t_df['mcap']
    t_df['valid_mcap'] = t_df['mcap'].where(t_df[lhs_col].notna())
    grouped_df = t_df.groupby(['date', 'quant'])[['weighted_return', 'valid_mcap']].sum().reset_index()
    grouped_df[lhs_col] = grouped_df['weighted_return'] / grouped_df['valid_mcap']
    date_quantile_avg_returns_df = grouped_df[['date', 'quant', lhs_col]].copy()

    # Calculate the top-minus-bottom quantile return time series
    diff_date_quantile_avg_returns_df = date_quantile_avg_returns_df.pivot_table(index='date', columns='quant', values=lhs_col)
    hml_df = pd.DataFrame(diff_date_quantile_avg_returns_df[num_quantiles]
        - diff_date_quantile_avg_returns_df[1])

    # Clean up
    hml_df.columns = [factor_col]
    hml_df = hml_df.reset_index()

    return hml_df

In [4]:
def _formatCoefWithStars(coef: float, tstat: float) -> str:
    """Round `coef` to 4 decimals and append significance stars based on |tstat|."""
    # Two-sided normal critical values for the 1%, 5% and 10% levels.
    # The 1% cutoff used to be 2.326, which is the one-sided 1% value and did
    # not match the two-sided 1.96 / 1.645 cutoffs next to it.
    text = str(np.round(coef, 4))
    for cutoff, stars in ((2.576, "***"), (1.96, "**"), (1.645, "*")):
        if np.abs(tstat) >= cutoff:
            return text+stars
    return text


def runContempRegAndReportResults(
    factors_df: pd.DataFrame, models_df: pd.DataFrame, out_fp: str, out_sheet: str) -> None:
    """
    Regress each factor on the three model return series and write a table to Excel.

    `factors_df` and `models_df` are inner-merged one-to-one on "date". Every
    non-date column of `factors_df` is then used as the dependent variable of
    an OLS regression on a constant plus the "multi", "pca" and "ipca" columns,
    fitted with HAC standard errors. For each factor the output holds two rows:
    the coefficients (with significance stars) and the standard errors in
    parentheses, plus the R-squared rounded to two decimals.

    Parameters
    ----------
    factors_df : pd.DataFrame
        Column "date" plus one column per factor to explain.
    models_df : pd.DataFrame
        Columns "date", "multi", "pca" and "ipca".
    out_fp : str
        Path of the Excel workbook. If it exists, the sheet is added to it
        (replacing a sheet of the same name); otherwise the workbook is created.
    out_sheet : str
        Name of the sheet to write.

    Returns
    -------
    None
    """
    import os

    # obtain factors to study as lhs variables
    factor_cols = list(factors_df.columns.values)
    factor_cols.remove('date')

    # Put together the factors and model yhats
    reg_df = factors_df.merge(models_df, on='date', how='inner', validate='one_to_one')

    # The regressors and the lag length are the same for every factor
    X = sm.add_constant(reg_df[['multi', 'pca', 'ipca']].copy())
    T = reg_df.shape[0]
    # NOTE(review): looks like a Newey-West style rule of thumb for the lag length -- confirm
    maxlags = int(4 * (T / 100)**(2/9))

    # Collect one two-row frame per factor and concatenate once at the end
    result_frames = []
    for factor_col in factor_cols:
        # Run OLS
        y = reg_df[factor_col].values
        results = sm.OLS(y, X).fit(cov_type='HAC', cov_kwds={'maxlags': maxlags})

        # Convert to plain arrays so the estimates are read by position;
        # integer keys on a label-indexed Series are deprecated in pandas.
        params = np.asarray(results.params)
        bse = np.asarray(results.bse)
        tvalues = np.asarray(results.tvalues)

        # Row 0 carries the coefficients, row 1 the standard errors
        record = {'uni_factor': [factor_col, factor_col]}
        for i, col in enumerate(['alpha', 'multi', 'pca', 'ipca']):
            coef = _formatCoefWithStars(params[i], tvalues[i])
            se = "("+str(np.round(bse[i], 4))+")"
            record[col] = [coef, se]

        # Obtain and store r2
        r2 = np.round(results.rsquared, 2)
        record['r2'] = [r2, r2]

        result_frames.append(pd.DataFrame(record, index=[0, 1]))

    results_df = pd.concat(result_frames) if result_frames else pd.DataFrame()

    # Save the results. Append mode only works on an existing workbook, and
    # `if_sheet_exists` is only accepted in append mode.
    if os.path.exists(out_fp):
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_kwargs = {'mode': 'w'}
    with pd.ExcelWriter(out_fp, engine='openpyxl', **writer_kwargs) as writer:
        results_df.to_excel(writer, sheet_name=out_sheet)


In [5]:
if __name__ == "__main__":
    # set args
    PANEL_IN_FP     = '../data/clean/panel_weekly.pkl' 
    ASSET_IN_FP     = '../data/clean/asset_universe_dict.pickle'
    IN_MULTI_FP     = '../data/clean/test_yhats_multivariate.pkl'
    IN_PCA_FP       = '../data/clean/test_yhats_pca.pkl'
    IN_IPCA_FP      = '../data/clean/test_yhats_ipca.pkl'
    OUT_FP          = '../output/low_dim_fm/low_dim_fms.xlsx'
    OUT_SHEET       = 'raw_alpha'
    SIG_UNI_COLS    = ['char_r_tm14',
        'char_r_industry_tm30',
        'char_r_industry_tm60',
        'char_beta_tm7',
        'char_iskew_tm30',
        'char_illiq_tm7']
    LHS_COL         = 'r_ex_tp7'

    # read in data
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that were produced by this project.
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    all_df = pd.read_pickle(PANEL_IN_FP)
    multi_df = pd.read_pickle(IN_MULTI_FP)
    pca_df   = pd.read_pickle(IN_PCA_FP)
    ipca_df  = pd.read_pickle(IN_IPCA_FP)
    
    # form uni factors
    char_df = all_df[['date', 'asset', LHS_COL, 'char_mcap_t']+SIG_UNI_COLS].copy()
    char_df = subsetToAssetUniverse(char_df, asset_universe_dict)
    # Seed with the first factor rather than an empty frame, so the merge key
    # never has to be reconciled with the dtype of an empty "date" column.
    factors_df = None
    for rhs_col in SIG_UNI_COLS:
        factor_df = formUniFactor(char_df, rhs_col, LHS_COL)
        if factors_df is None:
            factors_df = factor_df
        else:
            factors_df = factors_df.merge(factor_df, on=['date'], how='outer', validate='one_to_one')

    # select best models based on low dim results and rename
    multi_df = multi_df.rename(columns={'yhats_2_factors': 'yhats_multi'})
    multi_df = multi_df[['date', 'asset', 'yhats_multi']].copy()
    pca_df = pca_df.rename(columns={'yhats_4_factors': 'yhats_pca'})
    pca_df = pca_df[['date', 'asset', 'yhats_pca']].copy()
    ipca_df = ipca_df.rename(columns={'yhats_3_factors': 'yhats_ipca'})
    ipca_df = ipca_df[['date', 'asset', 'yhats_ipca']].copy()

    # put results together
    df = multi_df.merge(pca_df, on=['date', 'asset'], how='inner', validate='one_to_one')
    df = df.merge(ipca_df, on=['date', 'asset'], how='inner', validate='one_to_one')

    # add lhs and mcap to low dim model yhats
    df = df.merge(all_df[['date', 'asset', LHS_COL, 'char_mcap_t']], on=['date', 'asset'], how='inner', validate='one_to_one')
    df = df.rename(columns={'char_mcap_t': 'mcap'})

    # calc low dim returns: per model, weight each row by `prtfl_wght_hml` and
    # sum within date. The date-indexed Series is kept so that every return
    # stays attached to its own date.
    # NOTE(review): the meaning of the positional `5, True` arguments is defined
    # by QuantTools.formPortfolioWeightsByQuantile -- presumably the number of
    # quantiles and a weighting flag; confirm there.
    model_returns = {}
    for model_name in ['multi', 'pca', 'ipca']:
        temp_df = QuantTools.formPortfolioWeightsByQuantile(df, 5, True, 'yhats_'+model_name)
        temp_df['return'] = temp_df.prtfl_wght_hml*temp_df[LHS_COL]
        model_returns[model_name] = temp_df.groupby('date')['return'].sum()

    # Align the three series on their date index. The dates used to come from
    # `multi_df.date.unique()`, which is in order of appearance and may include
    # dates dropped by the inner merges above, so it could be paired with the
    # wrong (date-sorted) groupby sums or fail on a length mismatch.
    models_df = pd.concat(model_returns, axis=1, join='inner').reset_index()

    # Run and report results
    runContempRegAndReportResults(factors_df, models_df, OUT_FP, OUT_SHEET)
    