In [14]:
# Make the local quant_tools package importable: its directory is not on the default module search path, so add it to sys.path.
import sys
sys.path.insert(0, '/home/adam/Dropbox/2-creations/2-crafts/7-buidl/0-utils/quant_tools/code')

# IMPORT PACKAGES
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
from typing import Dict, List, Tuple
from joblib import Parallel, delayed
import pandas_datareader as pdr
import statsmodels.api as sm
from scipy.stats import norm
from tools import QuantTools
import pandas as pd
import numpy as np
import itertools
import pickle
import random
import time


In [15]:
def subsetAndNormalizeChars(df: pd.DataFrame, lhs_col: str, first_year: int,
                            selected_rhs: List[str] = None) -> pd.DataFrame:
    """Subset the panel and rank-normalize the characteristics within each date.

    Within every date, each characteristic is replaced by
    (rank - 1) / (number of non-missing values - 1), which maps the
    cross-section onto [0, 1]. A date with a single non-missing value for a
    characteristic yields 0/0, i.e. NaN, for that cell.

    Args:
        df (pd.DataFrame): panel with a datetime `date` column, an `asset`
            column, `lhs_col`, and the characteristic columns.
        lhs_col (str): name of the LHS column to carry through unchanged.
        first_year (int): rows dated in an earlier calendar year are dropped.
        selected_rhs (List[str]): characteristic columns to keep and
            normalize. Defaults to the standard list of characteristics below.

    Returns:
        pd.DataFrame: new frame with columns date, asset, lhs_col and the
            normalized characteristics. The input frame is not modified.

    Raises:
        AssertionError: if the normalized values do not span exactly [0, 1].
    """
    # Set characteristics of interest
    if selected_rhs is None:
        selected_rhs = ['char_tx_volume_tm7',
            'char_addr_active_tm7',
            'char_age_destroyed_tm7',
            'char_delta_flow_dist_tm7',
            'char_delta_holders_dist_tm7',
            'char_prct_supply_in_profit_t',
            'char_exchange_inflow_tm7',
            'char_exchange_outflow_tm7',
            'char_num_pairs_t',
            'char_social_volume_tm7',
            'char_social_volume_reddit_tm7',
            'char_social_volume_twitter_tm7',
            'char_sent_pos_reddit_tm7',
            'char_sent_pos_twitter_tm7',
            'char_sent_neg_reddit_tm7',
            'char_sent_neg_twitter_tm7',
            'char_r_tm7',
            'char_r_tm14',
            'char_r_tm90',
            'char_r_tm90_tm30',
            'char_r_ath_t',
            'char_r_atl_t',
            'char_r_industry_tm30',
            'char_r_industry_tm60',
            'char_trades_sum_tm7',
            'char_volume_sum_tm7',
            'char_spread_bps_t',
            'char_ask_size_t',
            'char_bid_size_t',
            'char_illiq_tm7',
            'char_turnover_tm7',
            'char_size_t',
            'char_alpha_tm7',
            'char_alpha_tm30',
            'char_beta_tm7',
            'char_beta_tm30',
            'char_coskew_tm30',
            'char_iskew_tm30',
            'char_shortfall5_tm7',
            'char_var5_tm7',
            'char_vol_tm7',
            'char_vol_tm30',
            'char_vol_tm90']
    else:
        selected_rhs = list(selected_rhs)

    # Drop unneeded rows (to reduce T) and columns in one step. The explicit
    # copy makes the column assignments below write to an independent frame
    # rather than to a slice of the caller's data.
    keep_rows = df.date.dt.year >= first_year
    df = df.loc[keep_rows, ['date', 'asset', lhs_col] + selected_rhs].copy()

    # Normalize characteristics to be between 0 and 1.
    for col in selected_rhs:
        col_by_date = df.groupby('date')[col]
        df[col] = (col_by_date.rank() - 1) / (col_by_date.transform('count') - 1)
    assert 0 == df[selected_rhs].min().min()
    assert 1 == df[selected_rhs].max().max()

    return df


In [16]:
def runLasso(Y: np.ndarray, X: np.ndarray, penalty: float) -> np.ndarray:
    ''' Fits a scikit-learn Lasso of Y on X and returns its coefficients.

    Args:
        Y (np.ndarray): LHS variable.
        X (np.ndarray): RHS variables with rows of obs and columns of covars.
        penalty (float): value passed to the estimator as `alpha`.

    Returns:
        coef (np.ndarray): the fitted estimator's `coef_` attribute.
    '''
    fitted_model = Lasso(alpha=penalty).fit(X, Y)
    return fitted_model.coef_

def calcPenaltyBCCH(Y: np.ndarray, X: np.ndarray, c: float) -> float:
    ''' Selects the Lasso penalty parameter with the closed-form rule of
        Belloni, Chen, Chernozhukov, Hansen 2012 ECMA.

    A pilot penalty is computed from the moments of X*Y, a pilot lasso is run
    with it, and the final penalty is recomputed from the moments of X times
    the pilot residuals.

    Args:
        Y (np.ndarray): LHS variable with rows of obs and a single column.
        X (np.ndarray): RHS variables with rows of obs and columns of covars.
        c (float):      scalar constant from theory; usually ~1.

    Returns:
        penalty (float): BCCH penalty parameter.
    '''
    # Bickel Ritov Tsybakov constant parameter selection
    a = 0.1
    N, p = X.shape[0], X.shape[1]

    def maxMoment(scale: np.ndarray) -> float:
        # largest root-mean-square of X_j * scale across the columns of X
        return np.max(np.mean((X**2)*(scale**2), axis =0)**0.5)

    def penaltyFrom(max_moment: float) -> float:
        return 2*c*norm.ppf(1-a/(2*p))*max_moment/np.sqrt(N)

    # pilot fit using the moments of X*Y
    pilot_coefs = runLasso(Y, X, penaltyFrom(maxMoment(Y)))

    # final penalty uses the pilot residuals in place of Y
    pilot_residuals = Y - (X @ pilot_coefs).reshape(-1,1)
    return penaltyFrom(maxMoment(pilot_residuals))

def runOLS(Y: np.ndarray, X: np.ndarray) -> np.ndarray:
    ''' Runs OLS of Y on X to return fitted coefficients.

    If X does not have full column rank, the trailing half of its columns is
    dropped (at most twice) before fitting, so the returned vector can be
    shorter than the number of columns passed in. The leading columns are
    always the ones kept.

    Args:
        Y (np.ndarray): LHS variable with rows of obs; 1-D or a single column.
        X (np.ndarray): RHS--assumes contains constant--with rows of obs and
                        columns of covars.

    Returns:
        beta_hat (np.ndarray): vector of fitted coefficients for the retained
                               columns, shaped like Y along the second axis.

    Raises:
        ValueError: if X is still rank deficient after two rounds of dropping
                    columns, or no further column can be dropped.
    '''
    drops_remaining = 2

    # X'X is invertible only when X has full *column* rank, so compare the rank
    # against the column count (not min(X.shape), which lets wide X through).
    while np.linalg.matrix_rank(X) < X.shape[1]:
        num_cols_to_drop = X.shape[1] // 2

        # With a single column there is nothing to drop; slicing with ":-0"
        # would silently return an empty matrix.
        if drops_remaining == 0 or num_cols_to_drop == 0:
            raise ValueError(
                f"Matrix is still singular after dropping columns. (shape={X.shape})")

        X = X[:, :-num_cols_to_drop]
        drops_remaining -= 1

    # Solve the normal equations directly rather than forming an explicit inverse.
    return np.linalg.solve(X.T @ X, X.T @ Y)

def runDoubleSelectionLasso(Y: np.ndarray, D: np.ndarray, X: np.ndarray, c: float,
    selected_prct_upper: float=0.3, selected_prct_lower: float=0.05) -> float:
    ''' Runs Double Selection Lasso from Belloni et al (2014).

    Two lassos (Y on [D, X] and D on X) select controls from X; the union of
    the selected controls is then used in an OLS of Y on D and those controls.
    The penalty constant is rescaled by 10% per iteration until the share of
    selected controls lies inside [selected_prct_lower, selected_prct_upper].
    Because the share is initialized to 1, the constant is rescaled once
    before the first fit, so `c` itself is never used unscaled.

    Args:
        Y (np.ndarray): LHS variable with rows of obs and single column.
        D (np.ndarray): RHS target variable with rows of obs and single column.
        X (np.ndarray): RHS controls with rows of obs and p cols of characteristics.
        c (float): scalar constant from theory; usually ~1.
        selected_prct_upper (float): upper bound on share of columns selected.
        selected_prct_lower (float): lower bound on share of columns selected.

    Returns:
        alpha_hat (float): estimated target coefficient.
    '''
    # The treatment is always the first regressor of the Y-lasso; this stack
    # does not change across iterations, so build it once.
    DX = np.hstack((D, X))
    num_controls = X.shape[1]

    # initialize a percent selected outside range
    selected_prct_cols = 1

    # NOTE(review): there is no cap on the number of iterations; if no value of
    # c lands the selected share inside the band this loop does not terminate.
    while ((selected_prct_cols > selected_prct_upper)
        or (selected_prct_cols < selected_prct_lower)):
        # update scalar constant
        if selected_prct_cols > selected_prct_upper:
            c = 1.1*c
        else:
            c = 0.9*c

        # lasso of Y on D and X to select elements of X, I_1_hat
        beta_hat_1 = runLasso(Y, DX, penalty=calcPenaltyBCCH(Y, DX, c=c))

        # lasso of D on X to select elements of X, I_2_hat
        beta_hat_2 = runLasso(D, X, penalty=calcPenaltyBCCH(D, X, c=c))

        # form union of I_1_hat and I_2_hat
        i_1_hat = set(np.nonzero(beta_hat_1)[0] - 1) # NOTE: subtracting 1 as we added the treatment var to RHS
        i_1_hat.discard(-1)                          # remove treatment variable if it was included
        i_2_hat = set(np.nonzero(beta_hat_2)[0])

        # Sort so the column order is deterministic: runOLS drops trailing
        # columns when rank deficient, so the order decides what gets dropped.
        i_hat = sorted(i_1_hat | i_2_hat)

        # update percent of columns that were selected
        selected_prct_cols = len(i_hat) / num_controls

    # OLS of Y on D plus included Xs
    X_sel    = X[:,i_hat]
    beta_hat = runOLS(Y, np.hstack((D, X_sel)))

    # return target parameter on D (first regressor)
    alpha_hat = beta_hat[0,0]
    return alpha_hat

def fitBaiPCA(
    matrix: np.ndarray, T: int, k: int, p: int
    ) -> Tuple[np.ndarray, np.ndarray]:
    ''' Extracts k factors and their loadings from `matrix` by eigendecomposition.

    Args:
        matrix (np.ndarray): data with T rows (its Gram matrix over rows is decomposed).
        T (int): scaling constant for factors and loadings; number of rows of matrix.
        k (int): number of factors to keep.
        p (int): second scaling constant for the Gram matrix.

    Returns:
        factors (np.ndarray): sqrt(T) times the k eigenvectors with the largest
                              eigenvalues, largest first.
        loadings (np.ndarray): matrix.T @ factors / T.
    '''
    # Scaled, symmetric, positive semi-definite matrix to decompose
    gram = (1 / (T * p)) * (matrix @ matrix.T)

    # eigh sorts eigenvalues ascending, so the top k eigenvectors are the last
    # k columns; flip them so the largest comes first
    _, vecs = np.linalg.eigh(gram)
    factors = np.sqrt(T) * np.fliplr(vecs[:, -k:])
    loadings = (matrix.T @ factors) / T

    # Confirm factors are scaled appropriately: F'F/T has diagonal summing to k
    normalization = (factors.T @ factors) / T
    assert np.isclose(k, np.abs(np.diagonal(normalization)).sum())

    return factors, loadings

def softThresholdRows(matrix, sparse_prct=0.2):
    """ Zero every row of `matrix` whose ell_1 norm does not exceed the
    (1 - sparse_prct) quantile of the row norms; other rows are kept as is. """
    row_norms = np.abs(matrix).sum(axis=1)
    cutoff = np.quantile(row_norms, 1-sparse_prct)

    # 0/1 indicator per row, broadcast across the columns
    keep_row = 1*(row_norms > cutoff)
    return matrix*keep_row[:, np.newaxis]

def fitDSLFM(R: List[np.ndarray], Z: List[np.ndarray],
    k: int, p: int, c: float, num_cpus: int, sparse_prct: float=0.2) -> tuple:
    ''' Fits the DSLFM: double-selection-lasso coefficients per period and
        characteristic, followed by a PCA decomposition of the coefficient matrix.

    Args:
        R (List[np.ndarray]): per-period LHS arrays; one entry per time period.
        Z (List[np.ndarray]): per-period characteristic matrices with p columns.
        k (int): number of factors.
        p (int): number of characteristics.
        c (float): constant passed to the double selection lasso.
        num_cpus (int): CPUs available; must be at least 4.
        sparse_prct (float): share passed to softThresholdRows.

    Returns:
        tuple: (Gamma_beta_hat, factors_hat, Gamma_beta_check, factors_v_hat),
            i.e. loadings and factors from the PCA of C_hat, the row-thresholded
            loadings, and the factors from the PCA of the column-demeaned C_hat.
    '''
    # Split the CPUs: 4 workers per characteristic, the rest across characteristics
    assert num_cpus >= 4
    inner_workers = 4
    outer_workers = int(num_cpus / 4)

    # Number of time periods
    T = len(R)

    def estimateCharacteristic(j):
        # every column other than j serves as a control
        controls = [col for col in range(p) if col != j]

        def estimatePeriod(t):
            # target coefficient c_{t,j} of characteristic j in period t
            chars_t = Z[t]
            return runDoubleSelectionLasso(
                R[t].reshape(-1,1), chars_t[:,j].reshape(-1,1), chars_t[:,controls], c)

        return Parallel(n_jobs=inner_workers)(delayed(estimatePeriod)(t) for t in range(T))

    # C hat: one row per period, one column per characteristic
    coefs_by_char = Parallel(n_jobs=outer_workers)(
        delayed(estimateCharacteristic)(j) for j in range(p))
    C_hat = np.array(coefs_by_char).T

    # PCA of C_hat gives the factors and the loadings Gamma_beta
    factors_hat, Gamma_beta_hat = fitBaiPCA(C_hat, T, k, p)

    # PCA of the column-demeaned C_hat gives the factors V; its loadings are unused
    factors_v_hat, _ = fitBaiPCA(C_hat - np.mean(C_hat, axis=0), T, k, p)

    # Threshold the rows of Gamma_beta_hat
    Gamma_beta_check = softThresholdRows(Gamma_beta_hat, sparse_prct)

    return Gamma_beta_hat, factors_hat, Gamma_beta_check, factors_v_hat

def predictDSLFM(df: pd.DataFrame, lhs_col: str, char_cols: List[str], oos_date: str, window: int,
    gamma_beta_check: np.ndarray, factors_hat: np.ndarray) -> pd.DataFrame:
    ''' Forms predictions for one date from thresholded loadings and recent factors.

    The prediction is z @ gamma_beta_check @ lambda_hat, where z holds the
    characteristics on `oos_date` and lambda_hat is the column mean of the
    last `window` rows of `factors_hat`.

    Args:
        df (pd.DataFrame): panel with `date`, `asset`, `lhs_col` and `char_cols`.
        lhs_col (str): name of the LHS column carried into the output.
        char_cols (List[str]): characteristic columns, ordered like the rows
            of `gamma_beta_check`.
        oos_date (str): date to predict for; compared for equality with df.date.
        window (int): number of trailing factor rows to average.
        gamma_beta_check (np.ndarray): loadings, one row per characteristic.
        factors_hat (np.ndarray): factors, one row per period.

    Returns:
        pd.DataFrame: columns date, asset, lhs_col and `yhats` for the rows of
            `df` on `oos_date`. The input frame is not modified.

    Raises:
        AssertionError: if `window` exceeds the number of factor rows.
    '''
    # Confirm parameters are valid
    assert (window <= len(factors_hat)) # rolling avg window is less than length of factors

    # Select the prediction date once and reuse it for inputs and output
    oos_rows = df[df.date == oos_date]

    # Build rhs
    z = oos_rows[char_cols].values
    lambda_hat = factors_hat[-window:,:].mean(axis=0)

    # Predict
    yhats = z @ gamma_beta_check @ lambda_hat

    # Build output data; copy so the new column is written to an independent
    # frame rather than to a slice of df
    out_df = oos_rows[['date', 'asset', lhs_col]].copy()
    out_df['yhats'] = yhats

    return out_df

In [17]:
def cvDSLFM(df: pd.DataFrame, lhs_col: str, val_start_date: str, test_start_date: str,
    num_factors: int, asset_universe_dict: Dict[str, str], num_cpus: int,
    periods_in_year: int, num_qntls_prtls: int, cv_out_fp: str) -> List[dict]:
    ''' Cross-validates the DSLFM over a fixed hyperparameter grid.

    For every grid point and every validation date, the model is fit on all
    earlier dates (restricted to that month's asset universe) and used to
    predict the validation date. Validation statistics are collected per grid
    point, and the accumulated results are written to a timestamped CSV after
    each grid point.

    Args:
        df (pd.DataFrame): panel with `date`, `asset`, `lhs_col`; every other
            column is treated as a characteristic.
        lhs_col (str): name of the LHS column.
        val_start_date (str): first date of the validation period.
        test_start_date (str): first date of the test period; rows on or after
            it are dropped.
        num_factors (int): number of factors to fit.
        asset_universe_dict (Dict[str, str]): maps 'YYYY-MM-01' to the asset
            universe for that month.
        num_cpus (int): CPUs passed to fitDSLFM.
        periods_in_year (int): periods per year for annualized statistics.
        num_qntls_prtls (int): number of quantile portfolios.
        cv_out_fp (str): path prefix of the CSV files written per grid point.

    Returns:
        List[dict]: one dict per grid point with its hyperparameters and
            validation statistics.
    '''
    # Obtain char col names and set p
    cols = list(df.columns.values)
    for col in ['date', 'asset', lhs_col]:
        cols.remove(col)
    char_cols = cols.copy()
    p = len(char_cols)

    # Remove unneeded data
    df = df[df.date < test_start_date].copy()

    # Initialize cv results object
    cv_results_lst = []

    # Determine val datetimes to loop over
    val_dates = list(df[df.date >= val_start_date].date.unique())

    # Form hps to iterate over
    hp_grid = {'C': [0.05],
        'window': [20, 25, 30, 35],
        'st': [0.05, 0.1, 0.15, 0.2, 0.25]}

    # Loop over hp combinations
    # NOTE(review): the fit below is repeated for every grid point even though,
    # from the calls here, 'window' is only used by predictDSLFM -- consider
    # caching fits per (C, val_date) if runtime matters.
    keys = list(hp_grid.keys())
    hp_combos = list(itertools.product(*hp_grid.values()))
    for hps in hp_combos:
        # Start the timer
        tic = time.perf_counter()

        # Create hp dictionary and other objects for this iteration
        hps_dict = dict(zip(keys, hps))
        hps_results_dict = hps_dict.copy()
        val_frames = []

        # Report on progress
        print(hps_dict)

        # Loop over val dates
        for val_date in val_dates:
            # Monitor progress
            print(val_date)

            # Obtain all train dates before this val date
            train_datetimes = list(df[df.date < val_date].date.unique())

            # Form appropriate asset universe. pd.Timestamp accepts both
            # numpy datetime64 and pandas Timestamp values, whichever
            # Series.unique() returns in the installed pandas version.
            first_day_of_month_for_current_val_dt = pd.Timestamp(val_date).strftime('%Y-%m')+'-01'
            asset_universe = asset_universe_dict[first_day_of_month_for_current_val_dt]

            # Subset to asset universe
            val_df = df[df.asset.isin(asset_universe)].copy()

            # Form necessary matrices of data to fit and predict on
            R = []
            Z = []
            for train_date in train_datetimes:
                train_rows = val_df[val_df.date==train_date]
                R.append(train_rows[lhs_col].values)
                Z.append(train_rows[char_cols].values)

            # Fit DSLFM
            (gamma_beta_hat, factors_hat, gamma_beta_check, factors_v_hat) = fitDSLFM(
                R, Z, num_factors, p, hps_dict['C'], num_cpus, hps_dict['st'])

            # Predict DSLFM
            temp_y_yhats_df = predictDSLFM(val_df, lhs_col, char_cols, val_date, hps_dict['window'], gamma_beta_check, factors_hat)

            # Save this week's results
            val_frames.append(temp_y_yhats_df)

        # Stop the timer after this hp grid point is completed
        toc = time.perf_counter()

        # Combine the per-date frames once instead of growing a frame in the loop
        val_y_yhats_df = pd.concat(val_frames) if val_frames else pd.DataFrame()

        # Obtain validation period results
        val_yhats      = val_y_yhats_df.yhats.values
        val_ys         = val_y_yhats_df[lhs_col].values
        val_y_yhats_pos_df = QuantTools.formPortfolioWeightsByQuantile(val_y_yhats_df, num_qntls_prtls)
        val_y_yhats_pos_df['returns'] = val_y_yhats_pos_df.prtfl_wght_hml*val_y_yhats_pos_df[lhs_col]
        returns = (val_y_yhats_pos_df.groupby('date')['returns'].sum().values)

        # Add results to dict
        hps_results_dict['train-start_year']    = np.min(df.date.dt.year)
        hps_results_dict['num_factors']    = num_factors
        hps_results_dict['runtime']        = round((toc - tic)/60, 0)
        hps_results_dict['val_mse']        = QuantTools.calcMSE(val_ys, val_yhats)
        hps_results_dict['val_r2_pred']    = QuantTools.calcR2Pred(val_ys, val_yhats)
        hps_results_dict['geom_mean']      = QuantTools.calcGeomAvg(returns)
        hps_results_dict['sharpe_annual']  = QuantTools.calcSharpe(returns, periods_in_year=periods_in_year)
        hps_results_dict['sd_annual']      = QuantTools.calcSD(returns, periods_in_year=periods_in_year)

        # Save results to return
        cv_results_lst.append(hps_results_dict)

        # For this hp, save results to csv
        cv_df = pd.DataFrame(cv_results_lst)
        timestr = time.strftime("%Y%m%d_%H%M%S")
        fp = cv_out_fp + '-' + timestr + '.csv'
        cv_df.to_csv(fp, index=False)

    return cv_results_lst
    

In [18]:
def selectOptHps(cv_results_lst: List[dict]) -> dict:
    ''' Returns the result dict with the highest `sharpe_annual`.

    Ties are resolved in favor of the earliest entry. Entries whose Sharpe
    ratio is NaN are never selected.

    Args:
        cv_results_lst (List[dict]): result dicts, each with a 'sharpe_annual' key.

    Returns:
        opt_hps_dict (dict): the entry (same object) with the highest Sharpe ratio.

    Raises:
        ValueError: if the list is empty or no entry has a non-NaN Sharpe ratio.
    '''
    opt_hps_dict = None
    for hps_dict in cv_results_lst:
        sharpe = hps_dict['sharpe_annual']

        # NaN compares unequal to itself; skip it
        if sharpe != sharpe:
            continue

        # No numeric sentinel: the first valid entry is always a candidate, so
        # strongly negative Sharpe ratios are still handled
        if opt_hps_dict is None or sharpe > opt_hps_dict['sharpe_annual']:
            opt_hps_dict = hps_dict

    if opt_hps_dict is None:
        raise ValueError("No cross-validation result with a valid sharpe_annual to select from.")

    return opt_hps_dict

In [19]:
def predictOOS(df: pd.DataFrame, lhs_col: str, test_start_date: str, num_factors: int,
    asset_universe_dict: dict, num_cpus: int, opt_hps_dict: dict) -> pd.DataFrame:
    ''' Produces out-of-sample predictions for every date in the test period.

    For each test date the model is refit on all earlier dates (restricted to
    that month's asset universe) using the selected hyperparameters, then used
    to predict the test date.

    Args:
        df (pd.DataFrame): panel with `date`, `asset`, `lhs_col`; every other
            column is treated as a characteristic.
        lhs_col (str): name of the LHS column.
        test_start_date (str): first date of the test period.
        num_factors (int): number of factors to fit.
        asset_universe_dict (dict): maps 'YYYY-MM-01' to the asset universe
            for that month.
        num_cpus (int): CPUs passed to fitDSLFM.
        opt_hps_dict (dict): selected hyperparameters; reads 'C', 'window' and,
            when present, 'st'.

    Returns:
        pd.DataFrame: stacked per-date outputs of predictDSLFM with a fresh index.
    '''
    # Initialize objects for results
    test_frames = []

    # Obtain char col names
    cols = list(df.columns.values)
    for col in ['date', 'asset', lhs_col]:
        cols.remove(col)
    char_cols = cols.copy()

    # Set p
    p = len(char_cols)

    # Use the cross-validated thresholding share, as cvDSLFM does when fitting;
    # fall back to fitDSLFM's default when the dict carries no 'st' entry
    sparse_prct = opt_hps_dict.get('st', 0.2)

    # Determine test dates to loop over
    test_dates = list(df[df.date >= test_start_date].date.unique())

    # Loop over test dates
    for test_date in test_dates:
        # Monitor progress
        print(test_date)

        # Obtain all train dates before this test date
        train_datetimes = list(df[df.date < test_date].date.unique())

        # Form appropriate asset universe. pd.Timestamp accepts both numpy
        # datetime64 and pandas Timestamp values, whichever Series.unique()
        # returns in the installed pandas version.
        first_day_of_month_for_current_val_dt = pd.Timestamp(test_date).strftime('%Y-%m')+'-01'
        asset_universe = asset_universe_dict[first_day_of_month_for_current_val_dt]

        # Subset to asset universe
        oos_df = df[df.asset.isin(asset_universe)].copy()

        # Form necessary matrices of data to fit and predict on
        R = []
        Z = []
        for train_date in train_datetimes:
            train_rows = oos_df[oos_df.date==train_date]
            R.append(train_rows[lhs_col].values)
            Z.append(train_rows[char_cols].values)

        # Fit DSLFM
        (gamma_beta_hat, factors_hat, gamma_beta_check, factors_v_hat) = fitDSLFM(
            R, Z, num_factors, p, opt_hps_dict['C'], num_cpus, sparse_prct)

        # Predict DSLFM
        temp_y_yhats_df = predictDSLFM(oos_df, lhs_col, char_cols, test_date, opt_hps_dict['window'], gamma_beta_check, factors_hat)

        # Save this week's results
        test_frames.append(temp_y_yhats_df)

    # Combine once instead of growing a frame inside the loop
    test_df = pd.concat(test_frames) if test_frames else pd.DataFrame()

    return test_df.reset_index(drop=True)

In [20]:
def reportOOSResults(
    df: pd.DataFrame, lhs_col: str,
    num_factors: int, num_qntls_prtls: int, periods_in_year: int, model_prefix: str,
    out_fp: str, out_sheet: str, mcap_weighted: bool
    ) -> None:
    ''' Computes portfolio statistics per factor count and writes them to Excel.

    Args:
        df (pd.DataFrame): predictions with one 'yhats_<n>_factors' column per
            factor count, plus the columns QuantTools.calcPortfolioStatistics reads.
        lhs_col (str): name of the LHS column.
        num_factors (int): models with 1..num_factors factors are reported.
        num_qntls_prtls (int): number of quantile portfolios.
        periods_in_year (int): periods per year for annualized statistics.
        model_prefix (str): prefix of the model name; the factor count is appended.
        out_fp (str): path of the Excel workbook.
        out_sheet (str): sheet to write; replaced if it already exists.
        mcap_weighted (bool): passed through to calcPortfolioStatistics.
    '''
    import os

    # Generate results for each model
    results_frames = []
    for num_factor in range(1,1+num_factors):
        # Expose this model's predictions under the generic 'yhats' name
        yhat_col = 'yhats_'+str(num_factor)+'_factors'
        df = df.rename(columns={yhat_col: 'yhats'})
        model_name =  model_prefix+str(num_factor)

        # Generate this model's portfolio statistics
        results_frames.append(QuantTools.calcPortfolioStatistics(
            df, lhs_col, 'yhats', 'macro_cmkt_tp7', model_name,
            num_qntls_prtls, periods_in_year, mcap_weighted
        ))

        # Restore the label for the next iteration
        df = df.rename(columns={'yhats': yhat_col})

    # Combine once instead of growing a frame inside the loop
    results_df = pd.concat(results_frames) if results_frames else pd.DataFrame()

    # Save the results. Append mode needs an existing workbook, and
    # if_sheet_exists is only accepted in append mode, so create the file on
    # first use.
    if os.path.exists(out_fp):
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_kwargs = {'mode': 'w'}
    with pd.ExcelWriter(out_fp, engine='openpyxl', **writer_kwargs) as writer:
        results_df.to_excel(writer, sheet_name=out_sheet)


In [21]:
def genBootstrapCharImportResults(df: pd.DataFrame,
    lhs_col: str, asset_universe_dict: dict, opt_hps_dict: dict,
    out_fp: str, num_cpus: int, num_bs_samps: int=200) -> None:
    ''' Bootstraps characteristic importance and writes it to Excel.

    Dates are resampled with replacement, the DSLFM is refit on each sample,
    and a characteristic's importance in a sample is the sum of squares of its
    row of gamma_beta_hat. The mean and standard deviation across samples are
    reported, with significance stars based on their ratio.

    Args:
        df (pd.DataFrame): panel with `date`, `asset`, `lhs_col`; every other
            column is treated as a characteristic.
        lhs_col (str): name of the LHS column.
        asset_universe_dict (dict): asset universes by month; the '2022-12-01'
            entry is used.
        opt_hps_dict (dict): selected hyperparameters; reads 'num_factors' and 'C'.
        out_fp (str): path of the Excel workbook; sheet 'raw_char_imp' is written.
        num_cpus (int): CPUs passed to fitDSLFM.
        num_bs_samps (int): number of bootstrap samples.
    '''
    import os

    # Obtain hps
    num_factors = opt_hps_dict['num_factors']
    C = opt_hps_dict['C']

    # Obtain char col names
    cols = list(df.columns.values)
    for col in ['date', 'asset', lhs_col]:
        cols.remove(col)
    char_cols = cols.copy()

    # Prepare data
    p = len(char_cols)
    asset_universe = asset_universe_dict['2022-12-01']
    df = df[df.asset.isin(asset_universe)]
    train_datetimes = list(df.date.unique())
    R = []
    Z = []
    for train_date in train_datetimes:
        date_rows = df[df.date==train_date]
        R.append(date_rows[lhs_col].values)
        Z.append(date_rows[char_cols].values)

    # gen given number of bootstrap samples
    gamma_beta_hats = []
    for bs in range(num_bs_samps):
        # Randomly sample what dates to fit on. A local generator seeded with
        # the sample number gives the same draws as seeding the global one,
        # without changing global random state.
        rng = np.random.RandomState(bs)
        bs_weeks_indices = rng.randint(0, len(train_datetimes), len(train_datetimes))
        R_bs = [R[i] for i in bs_weeks_indices]
        Z_bs = [Z[i] for i in bs_weeks_indices]

        # Fit DSLFM
        (gamma_beta_hat, factors_hat, gamma_beta_check, factors_v_hat) = fitDSLFM(
            R_bs, Z_bs, num_factors, p, C, num_cpus)

        # Save results
        gamma_beta_hats.append(gamma_beta_hat)

    # calc point est: one row per bootstrap sample, one column per characteristic
    summed_chars_gamma_beta_hats = [np.sum(np.square(gamma_beta_hat), axis=1)
                                for gamma_beta_hat in gamma_beta_hats]
    point_ests_per_bs = np.vstack(summed_chars_gamma_beta_hats)
    assert (num_bs_samps == point_ests_per_bs.shape[0])
    estimates_arr = np.mean(point_ests_per_bs, axis=0)

    # calc se
    ses_arr = np.std(point_ests_per_bs, axis=0)

    # format estimate for output
    t_stat_arr = estimates_arr / ses_arr
    estimates_list = []
    for estimate, t_stat in zip(estimates_arr, t_stat_arr):
        abs_t_stat = np.abs(t_stat)
        if abs_t_stat > 2.576:
            stars = "***"
        elif abs_t_stat > 1.96:
            stars = "**"
        elif abs_t_stat > 1.645:
            stars = "*"
        else:
            stars = ""
        estimates_list.append(str(estimate)+stars)

    # form results df
    results_df = pd.DataFrame(data={'char': char_cols})
    results_df['est'] = estimates_list
    results_df['se'] = ses_arr

    # sort by the numeric estimate, largest first; sorting the formatted 'est'
    # strings would order them lexicographically
    numeric_order = np.argsort(-estimates_arr, kind='stable')
    results_df = results_df.iloc[numeric_order]

    # save results. Append mode needs an existing workbook, and if_sheet_exists
    # is only accepted in append mode, so create the file on first use.
    if os.path.exists(out_fp):
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_kwargs = {'mode': 'w'}
    with pd.ExcelWriter(out_fp, engine='openpyxl', **writer_kwargs) as writer:
        results_df.to_excel(writer, sheet_name='raw_char_imp')


In [22]:
def obtainExpInflationDf(week_dts: np.ndarray, inf_col: str='EXPINF10YR') -> pd.DataFrame:
    ''' Downloads a series from FRED and returns its period-over-period percent
        change on the given dates.

    Args:
        week_dts (np.ndarray): dates to report on.
        inf_col (str): FRED series name; also the name of the output column.

    Returns:
        pd.DataFrame: columns `date` and `inf_col`, one row per date in
            `week_dts` (sorted ascending); missing changes are set to 0.
    '''
    # download from 2017 onward and name the date column like the panel does
    ei_df = (pdr.DataReader(inf_col, 'fred', '2017-01-01')
             .reset_index()
             .rename(columns={'DATE': 'date'}))

    # convert to weekly rate from annual
    ei_df[inf_col] = (1+ei_df[inf_col]/100)**(1/52)-1

    # add the requested dates, order by date, and interpolate to fill missing
    requested_df = pd.DataFrame(data={'date': week_dts})
    ei_df = (ei_df.merge(requested_df, on='date', how='outer', validate='one_to_one')
             .sort_values(by='date', ascending=True))
    ei_df[inf_col] = ei_df[inf_col].interpolate(method='linear')

    # keep only the requested dates
    ei_df = ei_df[ei_df.date.isin(list(week_dts))]

    # convert to pct change, filling the undefined entries with 0
    return ei_df.set_index('date').pct_change().fillna(0).reset_index()

In [23]:
def calcAsympVar(Z: list, Z_bar: np.ndarray, ob_factor: np.ndarray, factors_v_hat: np.ndarray,
    gamma_beta_hat: np.ndarray, gamma_hat: np.ndarray, eta_hat: np.ndarray, 
    T: int, k: int, p: int):
    ''' Combines fitted quantities into the variance term `sigma_g_2`.

    Args:
        Z (list): per-period arrays; Z[t] is indexed as rows by p columns.
        Z_bar (np.ndarray): matrix used to form B; B is divided by its row count.
        ob_factor (np.ndarray): observed factor series; reshaped to a column.
        factors_v_hat (np.ndarray): factor matrix, indexed by period on axis 0.
        gamma_beta_hat (np.ndarray): loading matrix, indexed by characteristic on axis 0.
        gamma_hat (np.ndarray): reshaped to a column vector.
        eta_hat (np.ndarray): reshaped to a column vector.
        T (int): number of periods looped over.
        k (int): dimension of the k-by-k working matrices.
        p (int): number of characteristics looped over.

    Returns:
        sigma_g_2: sum of four quadratic forms in gamma_hat and eta_hat; with
            column-vector inputs each term is 1-by-1.
    '''
    # Reshape
    gamma_hat = gamma_hat.reshape(-1, 1)
    eta_hat   = eta_hat.reshape(-1, 1)
    
    # Calculate the residuals from the time series OLS
    residuals = ob_factor.reshape(-1,1) - np.matmul(factors_v_hat, eta_hat)

    # Calculate the Z_t_j_jprime scalar
    # Entry [t, j, :] is the column means of Z[t] times the mean of column j,
    # divided by the number of rows of Z[t]; the whole array is then divided by T.
    Z_tjjp = np.zeros((T,p,p))
    for t in range(T):
        for j in range(p):
            zitj_bar = np.mean(Z[t][:,j])
            Z_tjjp[t, j, :] = (Z[t]*zitj_bar).mean(axis=0)
            Z_tjjp[t, j, :] *= (Z[t].shape[0])**(-1)

    Z_tjjp *= (T)**(-1)

    # Calculate the Pi_t scalar
    # NOTE(review): gamma_beta_hat[j,:] is 1-D when gamma_beta_hat is 2-D, so the
    # `@` below is an inner product (a scalar) and `.T` has no effect; Pi[t] is
    # then filled with that one value in every entry -- confirm an outer
    # product was not intended.
    Pi = np.zeros((T,k,k))
    for t in range(T):
        Pi_t = 0
        for j in range(p):
            for jp in range(p):
                gamma_beta_jp = gamma_beta_hat[jp,:]
                gamma_beta_j  = gamma_beta_hat[j,:]
                Pi_t += gamma_beta_jp @ gamma_beta_j.T * Z_tjjp[t,j,jp]
        Pi[t] = Pi_t

    # Calc the components of asymp matrix
    Phi_11 = T**(-1) * (factors_v_hat.T @ residuals) @ (residuals.T @ factors_v_hat)

    # NOTE(review): `@` and `*` share precedence and evaluate left to right, and
    # factors_v_hat[t,:] is 1-D when factors_v_hat is 2-D, so each term below is
    # a length-k vector that broadcasts across the rows of the k-by-k
    # accumulator -- confirm this matches the intended formula.
    Phi_22 = np.zeros((k,k))
    for t in range(T):
        for tp in range(T):
            Phi_22 += Pi[t,:,:] @ factors_v_hat[t,:] * factors_v_hat[tp,:].T @ Pi[tp,:,:].T
    Phi_22 *= T**(-1)

    # NOTE(review): same left-to-right evaluation and broadcasting as Phi_22.
    Phi_12 = np.zeros((k,k))
    for t in range(T):
        for tp in range(T):
            Phi_12 += factors_v_hat[t,:] * residuals[t] * factors_v_hat[tp,:].T @ Pi[tp,:,:].T
    Phi_12 *= T**(-1)

    # Calc design matrices
    A = (factors_v_hat.T @ factors_v_hat) / T

    B = (gamma_beta_hat.T @ Z_bar.T @ Z_bar @ gamma_beta_hat) / Z_bar.shape[0]

    # Calculate the target variance
    # (these reshapes repeat the ones at the top of the function)
    gamma_hat = gamma_hat.reshape(-1, 1)
    eta_hat   = eta_hat.reshape(-1, 1)
    sigma_g_2 = (gamma_hat.T @ np.linalg.inv(A) @ Phi_11 @ np.linalg.inv(A.T) @ gamma_hat
                    + eta_hat.T @ np.linalg.inv(B) @ Phi_22 @ np.linalg.inv(B.T) @ eta_hat
                    + gamma_hat.T @ np.linalg.inv(A) @ Phi_12 @ np.linalg.inv(B.T) @ eta_hat
                    + eta_hat.T @ np.linalg.inv(B) @ Phi_12.T @ np.linalg.inv(A.T) @ gamma_hat)

    return sigma_g_2

In [24]:
def infRiskPremium(df: pd.DataFrame, asset_universe_dict: dict,
    lhs_col: str, inf_col: str, opt_hps_dict: dict, num_cpus: int) -> None:
    ''' Estimates the risk premium of the observed inflation factor and prints
        the estimate, its standard error, and the sample sizes.

    Args:
        df (pd.DataFrame): panel with `date`, `asset`, `lhs_col`; every other
            column is treated as a characteristic.
        asset_universe_dict (dict): asset universes by month; the '2022-12-01'
            entry is used for fitting.
        lhs_col (str): name of the LHS column.
        inf_col (str): name of the observed factor series.
        opt_hps_dict (dict): selected hyperparameters; reads 'num_factors',
            'C' and 'st'.
        num_cpus (int): CPUs passed to fitDSLFM.
    '''
    # Observable factor on the panel's dates
    week_dts = df.date.unique()
    ob_df = obtainExpInflationDf(week_dts, inf_col)

    # Characteristics are all columns other than date, asset and the LHS
    char_cols = list(df.columns.values)
    for non_char_col in ('date', 'asset', lhs_col):
        char_cols.remove(non_char_col)

    # Set params
    T = len(week_dts)
    p = len(char_cols)
    k = opt_hps_dict['num_factors']
    C = opt_hps_dict['C']
    soft_threshold_prct = opt_hps_dict['st']

    # Z bar: per-asset time average of each characteristic, one row per asset
    Z_bar = np.vstack([
        np.mean(df[df.asset==asset][char_cols].values, axis=0)
        for asset in list(df.asset.unique())
    ])

    # Subset to asset universe
    train_df = df[df.asset.isin(asset_universe_dict['2022-12-01'])].copy()

    # Per-date LHS vectors and characteristic matrices to fit on
    R, Z = [], []
    for train_date in list(week_dts):
        date_rows = train_df[train_df.date==train_date]
        R.append(date_rows[lhs_col].values)
        Z.append(date_rows[char_cols].values)

    # Fit DSLFM; the thresholded loadings and demeaned-PCA factors are not used here
    gamma_beta_hat, factors_hat, _, _ = fitDSLFM(
        R, Z, k, p, C, num_cpus, soft_threshold_prct)

    # Estimate risk premium from the regression of the factor on demeaned factors
    G = ob_df[inf_col].values
    gamma_hat = factors_hat.mean(axis=0)
    demeaned_factors = factors_hat-gamma_hat
    eta_hat = runOLS(G, demeaned_factors)
    gamma_g_hat = np.dot(eta_hat, gamma_hat)
    gamma_g_var = calcAsympVar(Z, Z_bar, G, demeaned_factors, gamma_beta_hat, gamma_hat, eta_hat, T, k, p)

    # Output results
    print(f"Inflation risk premium: {gamma_g_hat}")
    print(f"Standard error: {np.sqrt(gamma_g_var[0][0])}")
    print(f"Number of time periods: {len(ob_df)}")
    print(f"Number of assets: {Z_bar.shape[0]}")


In [27]:
if __name__ == "__main__":
    # Run configuration: input/output locations, sample split, and model sizes
    IN_FP = '../data/clean/panel_weekly.pkl'
    ASSET_IN_FP = '../data/clean/asset_universe_dict.pickle'
    CV_OUT_FP = '../output/high_dim_fm/dslfm_cv'
    OUT_FP = '../output/high_dim_fm/dslfm.xlsx'
    LHS_COL = 'r_ex_tp7'
    VAL_START_DATE = '2022-01-01'
    TEST_START_DATE = '2022-07-01'
    PERIODS_IN_YEAR = 52
    NUM_QNTLS_PRTLS = 5
    NUM_FACTORS = 5
    NUM_CPUS = 20

    # Load the asset universes and the weekly panel.
    # NOTE: unpickling executes code embedded in the file; only load files you trust.
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    all_df = pd.read_pickle(IN_FP)

    # Panel restricted to 2020 onward with rank-normalized characteristics
    df = subsetAndNormalizeChars(df=all_df, lhs_col=LHS_COL, first_year=2020)

    # Market return per date, led by one row so it lines up with the LHS horizon
    cmkt_df = (all_df[['date', 'macro_cmkt_tm7']]
               .drop_duplicates()
               .assign(macro_cmkt_tp7=lambda d: d.macro_cmkt_tm7.shift(-1))
               .drop('macro_cmkt_tm7', axis=1))

    # Market caps over the test period
    mcap_df = (all_df.loc[all_df.date >= TEST_START_DATE, ['date', 'asset', 'char_mcap_t']]
               .rename(columns={'char_mcap_t': 'mcap'}))

    # Auxiliary reporting data
    # NOTE(review): fillna applies to every column of aux_df, including mcap --
    # confirm that only the missing market return was meant to be filled.
    aux_df = (mcap_df
              .merge(cmkt_df, on=['date'], how='left', validate='many_to_one')
              .fillna(0.003)) # NOTE: cmkt return last week of 2022

    # Cross-validate and predict out of sample for each number of factors
    yhats_df = pd.DataFrame()
    opt_hps_dicts = []
    for num_factors in range(1, NUM_FACTORS+1):
        # cv dslfm for optimal penalty param
        cv_results_lst = cvDSLFM(
            df=df, lhs_col=LHS_COL, val_start_date=VAL_START_DATE,
            test_start_date=TEST_START_DATE, num_factors=num_factors,
            asset_universe_dict=asset_universe_dict, num_cpus=NUM_CPUS,
            periods_in_year=PERIODS_IN_YEAR, num_qntls_prtls=NUM_QNTLS_PRTLS,
            cv_out_fp=CV_OUT_FP)

        # select optimal hp point
        opt_hps_dict = selectOptHps(cv_results_lst)
        opt_hps_dicts.append(opt_hps_dict)

        # gen oos results
        test_df = predictOOS(
            df=df, lhs_col=LHS_COL, test_start_date=TEST_START_DATE,
            num_factors=num_factors, asset_universe_dict=asset_universe_dict,
            num_cpus=NUM_CPUS, opt_hps_dict=opt_hps_dict)

        # label this model's predictions and add them to the combined frame
        test_df = test_df.rename(columns={'yhats': f'yhats_{num_factors}_factors'})
        if num_factors > 1:
            test_df = test_df.drop(LHS_COL, axis=1)
            yhats_df = yhats_df.merge(test_df, on=['date', 'asset'], how='inner', validate='one_to_one')
        else:
            yhats_df = test_df.copy()

    # add aux data to yhats
    yhats_df = yhats_df.merge(aux_df, on=['date', 'asset'], how='inner', validate='one_to_one')

    # Report oos results, market-cap weighted and then equal weighted
    reportOOSResults(
        df=yhats_df, lhs_col=LHS_COL,
        num_factors=NUM_FACTORS, num_qntls_prtls=NUM_QNTLS_PRTLS, periods_in_year=PERIODS_IN_YEAR,
        model_prefix='dslfm_mcap_', out_fp=OUT_FP, out_sheet='raw_oos_mcap', mcap_weighted=True)
    reportOOSResults(
        df=yhats_df, lhs_col=LHS_COL,
        num_factors=NUM_FACTORS, num_qntls_prtls=NUM_QNTLS_PRTLS, periods_in_year=PERIODS_IN_YEAR,
        model_prefix='dslfm_equal_', out_fp=OUT_FP, out_sheet='raw_oos_equal', mcap_weighted=False)

    # Obtain optimal results across factors
    opt_hps_dict = selectOptHps(opt_hps_dicts)

    # Gen char import
    genBootstrapCharImportResults(
        df=df, lhs_col=LHS_COL, asset_universe_dict=asset_universe_dict,
        opt_hps_dict=opt_hps_dict, out_fp=OUT_FP, num_cpus=NUM_CPUS) # NOTE: 1 HOUR RUNTIME

    # Calculate inflation risk premium
    infRiskPremium(
        df=df, asset_universe_dict=asset_universe_dict, lhs_col=LHS_COL,
        inf_col='EXPINF10YR', opt_hps_dict=opt_hps_dict, num_cpus=NUM_CPUS)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Inflation risk premium: 0.00013824213421103075
Standard error: 9.675459918701172e-08
Number of time periods: 156
Number of assets: 210
