In [6]:
# Data Extraction
import pandas as pd 
import numpy as np
portfolios = pd.read_excel('./../homework/data/factor_pricing_data.xlsx', sheet_name = 'portfolios (excess returns)').set_index('Date')
factors = pd.read_excel('./../homework/data/factor_pricing_data.xlsx', sheet_name = 'factors (excess returns)').set_index('Date')
portfolios.head()

Unnamed: 0_level_0,Agric,Food,Soda,Beer,Smoke,Toys,Fun,Books,Hshld,Clths,...,Boxes,Trans,Whlsl,Rtail,Meals,Banks,Insur,RlEst,Fin,Other
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1980-01-31,-0.0076,0.0285,0.0084,0.1009,-0.0143,0.1002,0.0362,0.0323,0.0048,0.0059,...,0.0158,0.0875,0.0465,-0.0126,0.043,-0.0283,0.0258,0.0768,0.0308,0.0669
1980-02-29,0.0105,-0.0608,-0.0966,-0.0322,-0.0569,-0.0323,-0.0521,-0.08,-0.0555,-0.0167,...,-0.0079,-0.0541,-0.0346,-0.0639,-0.0652,-0.0854,-0.0959,-0.0347,-0.0282,-0.0274
1980-03-31,-0.2224,-0.1119,-0.0167,-0.1469,-0.0193,-0.1271,-0.0826,-0.1237,-0.0566,-0.0668,...,-0.0819,-0.1509,-0.1098,-0.0906,-0.1449,-0.056,-0.088,-0.2451,-0.1254,-0.1726
1980-04-30,0.0449,0.0766,0.0232,0.0321,0.083,-0.0529,0.0783,0.0153,0.0304,0.0115,...,0.042,-0.0103,-0.0312,0.0353,0.0542,0.0728,0.053,0.0977,0.0447,0.0769
1980-05-31,0.0632,0.0793,0.0457,0.0863,0.0815,0.0509,0.0324,0.0886,0.056,0.0098,...,0.0564,0.1063,0.1142,0.0877,0.1134,0.0578,0.0557,0.0915,0.0844,0.0685


In [7]:
def tangency_weights(returns,dropna=True,scale_cov=1, name = 'Tangency'):
    if dropna:
        returns = returns.dropna()

    covmat_full = returns.cov()
    covmat_diag = np.diag(np.diag(covmat_full))
    covmat = scale_cov * covmat_full + (1-scale_cov) * covmat_diag

    weights = np.linalg.solve(covmat,returns.mean())
    weights = weights / weights.sum()
    tangency_weights_df = pd.DataFrame(weights, index=returns.columns, columns=[f'{name} Weights'])

    return tangency_weights_df
tangency_weights_df = tangency_weights(portfolios)
tangency_weights_df.head()

Unnamed: 0,Tangency Weights
Agric,0.032607
Food,0.068627
Soda,0.099568
Beer,0.211894
Smoke,0.295367


In [43]:
tangency_weights_df = tangency_weights(factors)
tangency_weights_df

Unnamed: 0,Tangency Weights
MKT,0.20976
SMB,0.077337
HML,-0.042142
RMW,0.313263
CMA,0.338982
UMD,0.102798


Tangency weights for the different factors in the model. 
- CMA has high weights while HML is virtually 0 -> means not very important 

#### NOTE: the factors with the highest annualized sharpe ratios = MKT, RMW, then CMA, UMD

- In the above model, CMA has the highest weights but yet not the highest sharpe ratio and yet MKT with the highest SR does not have the highest weights. This goes back to the fact that during MV optimization, not just about mean returns but also on the covariance

In [37]:
# compare with their metrics

annualization_factor = 12
def portfolio_metrics_basic(returns: pd.DataFrame, annualization_factor: float) -> pd.DataFrame: 
    '''
    Returns Annualized ER, Vol, SR, 
    '''
    
    returns_df = pd.DataFrame(index = returns.columns) 
    mu = returns.mean() * annualization_factor
    vol = returns.std() * np.sqrt(annualization_factor)
    sr = mu / vol

    
    returns_df['Annnualized Return'] = mu.values 
    returns_df['Annnualized Vol'] = vol.values 
    returns_df['Annnualized SR'] = sr.values
    
    return returns_df

portfolio_metrics_basic(factors,annualization_factor)



Unnamed: 0,Annnualized Return,Annnualized Vol,Annnualized SR
MKT,0.086277,0.156904,0.549872
SMB,0.008319,0.101873,0.081665
HML,0.025809,0.109999,0.234629
RMW,0.047096,0.083213,0.565962
CMA,0.029537,0.073084,0.404148
UMD,0.062709,0.154564,0.405714


# Time Series Regression on EAch Model

eg. AQR Model: 
- regress the expected returns from the portfolio against the factors in the model

For any pricing model (time series test), 
- alpha should be statistically 0, i.e. small. since based on CAPM, all the risk premium / excess returns of any asset should be assocaited with teh risk of the price of factors
- if alpha is not 0, means there is ex ess returns not associated with the pricing errors
- MAE should also ideally be 0 as the pricing model should be able to capture all systematic risk.
- eg. if MAE = 2.3%, this means that on average, assets have taken 2.3% excess returns uncorrelated with any of the pricing factors

#### Add intercept for the factor models -> since alpha should ideally be small and statistically insignificant
- factors = FF3, FF5, AQR etc
- dependent variable is the industry / portfolio etc

In [61]:
def factor_model(portfolio: pd.DataFrame, factors_used: list, factors_data: pd.DataFrame, name: str) -> pd.DataFrame: 
    
    summary = pd.DataFrame(0, index=portfolios.columns, columns = [f'{name} Alpha', f'{name} R Square'])

    for asset in portfolios.columns:
        asset_ret = portfolios[asset]
        
        x = factors_data[factors_used]
        model = sm.OLS(asset_ret, x).fit()
        summary.loc[asset, f'{name} Alpha'] = model.params[0]
        summary.loc[asset, f'{name} R Square'] = model.rsquared

    
    return summary 
    
summary = factor_model(portfolios, list(aqr.columns), aqr, 'AQR')     

  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} R Square'] = model.rsquared
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.

In [62]:
summary.head()

Unnamed: 0,AQR Alpha,AQR R Square
Agric,0.000643,0.339175
Food,0.000579,0.464221
Soda,0.001436,0.308354
Beer,0.001422,0.420762
Smoke,0.002942,0.273489


In [64]:
# for each of the model 

from collections import defaultdict
from itertools import product
import statsmodels.api as sm

capm = sm.add_constant(factors['MKT'])
aqr = sm.add_constant(factors[['MKT', 'HML', 'RMW', 'UMD']])
ff3 = sm.add_constant(factors[['MKT', 'SMB', 'HML']])
ff5 = sm.add_constant(factors[['MKT', 'SMB', 'HML', 'RMW', 'CMA']])


col_names = product(['CAPM', 'AQR', 'FF3', 'FF5'], ['Alpha', 'R Square'])
col_names = list(map(lambda x: ' '.join(x), col_names))

summary = pd.DataFrame(0, index=portfolios.columns, columns=col_names)

models = [*zip(['CAPM', 'AQR', 'FF3', 'FF5'], [capm, aqr, ff3, ff5])]


for asset in portfolios.columns:
    asset_ret = portfolios[asset]
    for name, X in models:
        model = sm.OLS(asset_ret, X).fit()
        summary.loc[asset, f'{name} Alpha'] = model.params[0]
        summary.loc[asset, f'{name} R Square'] = model.rsquared
        



  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} R Square'] = model.rsquared
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} R Square'] = model.rsquared
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} R Square'] = model.rsquared
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} R Square'] = model.rsquared
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = model.params[0]
  summary.loc[asset, f'{name} Alpha'] = 

In [65]:
summary.columns

Index(['CAPM Alpha', 'CAPM R Square', 'AQR Alpha', 'AQR R Square', 'FF3 Alpha',
       'FF3 R Square', 'FF5 Alpha', 'FF5 R Square'],
      dtype='object')

### Metrics for Factor Models

#### MAE

In [66]:
def calc_mae(data, annualization_factor): 
    return data.loc[:,data.columns.str.contains('Alpha')].abs().mean().to_frame('MAE') * annualization_factor 

calc_mae(summary, annualization_factor)

Unnamed: 0,MAE
CAPM Alpha,0.020402
AQR Alpha,0.022993
FF3 Alpha,0.023984
FF5 Alpha,0.031367


- A high MAE of 2% annualized means that they do not follow the pricing model.
- in fact, all the models perform worse than CAPM -> suggeting that the only important factor is MKT. However, AQR performed second best which indicates that maybe momentum factor should be added

#### R Square

In [67]:
summary.columns.str.contains('R Square')

array([False,  True, False,  True, False,  True, False,  True])

In [70]:
def calc_r_2(data, annualization_factor): 
    return data.loc[:,data.columns.str.contains('R Square')].mean().to_frame('R^2') 

calc_r_2(summary, annualization_factor)

Unnamed: 0,R^2
CAPM R Square,0.526107
AQR R Square,0.574935
FF3 R Square,0.571484
FF5 R Square,0.595951


- only explain about 50 - 60% of variation. low explainability of excess returns by decomposition on different factor models

# Cross Sectional Regression on the Models - issue with the cross sectional regression function 

In [83]:
def calc_pricing_regression(rets, factors, annualization_factor, intercept=True):
    # summary to store the betas and model statistics 
    
    if intercept:
        factors = sm.add_constant(factors)
    summary = {f'{k} Beta': [] for k in factors.columns[1:]}
    summary['Alpha']  = []
    summary['R Square']    = []
    for asset in rets.columns:
        model = sm.OLS(rets[asset], factors).fit()
        for k, _ in summary.items():
            try:
                summary[k].append(model.params[k[:-5]])
            except:
                pass
        summary['R Square'].append(model.rsquared)
        summary['Alpha'].append(model.params[0] * annualization_factor)
            
    return pd.DataFrame(summary, index=rets.columns)

def calc_cross_sectional_regression(mean_rets, betas, intercept=False):
    # dependent variable = mean returns of the portfolios 
    # independent variables = betas obtained from the t-s regressions (these betas are from each regression of the aset against the factors)
    if intercept:
        betas = sm.add_constant(betas)
    model = sm.OLS(mean_rets, betas).fit()
    params = model.params.to_frame('Cross Sectional Regression')
    params.loc['R Square'] = model.rsquared
    params.loc['MAE'] = model.resid.abs().mean() 
    return params

frames = []

ts_premia = portfolios.mean() * annualization_factor

for name, X in zip(['CAPM', 'AQR', 'FF3', 'FF5'], [capm, aqr, ff3, ff5]):
    '''
    1. T-s regression of each factor model between the portfolios and the factors 
    2. X sectional regression by regressing the averae portfolio returns (ts_premia) on the betas from the Ts regressions which are the factors 
    '''
    pricing_regr = calc_pricing_regression(portfolios, X, annualization_factor, intercept=False)
    res = calc_cross_sectional_regression(ts_premia, pricing_regr.iloc[:, :-2], intercept=False)
    print(res)
    res = res.rename({'Cross Sectional Regression': f'{name} CS'}, axis=1)
    frames.append(res)

summary_cs = pd.concat(frames, axis=1).fillna('')

# Move R^2 and MAE to the top
summary_cs = pd.concat([summary_cs.loc[['R Square', 'MAE']], summary_cs.drop(['R Square', 'MAE'])])
summary_cs_betas = summary_cs[summary_cs.index.str.contains('Beta')].copy()
summary_cs_betas.loc[:, 'Time Series'] = factors.mean().values * 12
summary_cs_betas

  summary['Alpha'].append(model.params[0] * annualization_factor)
  summary['Alpha'].append(model.params[0] * annualization_factor)
  summary['Alpha'].append(model.params[0] * annualization_factor)
  summary['Alpha'].append(model.params[0] * annualization_factor)


          Cross Sectional Regression
MKT Beta                    0.086096
R Square                    0.913169
MAE                         0.020384
          Cross Sectional Regression
MKT Beta                    0.089125
HML Beta                   -0.038711
RMW Beta                    0.043748
UMD Beta                    0.060747
R Square                    0.953710
MAE                         0.016456
          Cross Sectional Regression
MKT Beta                    0.102214
SMB Beta                   -0.064485
HML Beta                   -0.015204
R Square                    0.957895
MAE                         0.015113
          Cross Sectional Regression
MKT Beta                    0.096486
SMB Beta                   -0.056991
HML Beta                   -0.030994
RMW Beta                    0.033605
CMA Beta                   -0.011559
R Square                    0.969085
MAE                         0.013021


Unnamed: 0,CAPM CS,AQR CS,FF3 CS,FF5 CS,Time Series
MKT Beta,0.086096,0.089125,0.102214,0.096486,0.086277
HML Beta,,-0.038711,-0.015204,-0.030994,0.008319
RMW Beta,,0.043748,,0.033605,0.025809
UMD Beta,,0.060747,,,0.047096
SMB Beta,,,-0.064485,-0.056991,0.029537
CMA Beta,,,,-0.011559,0.062709


In [87]:
summary_cs

Unnamed: 0,CAPM CS,AQR CS,FF3 CS,FF5 CS
R Square,0.913169,0.95371,0.957895,0.969085
MAE,0.020384,0.016456,0.015113,0.013021
MKT Beta,0.086096,0.089125,0.102214,0.096486
HML Beta,,-0.038711,-0.015204,-0.030994
RMW Beta,,0.043748,,0.033605
UMD Beta,,0.060747,,
SMB Beta,,,-0.064485,-0.056991
CMA Beta,,,,-0.011559


- compare the ebtas for each of the factor model
- eg. HML beta, SMB Beta, CMA beta -> alot of the estimated premia are negative 

In [91]:
# for TS Regressions
calc_mae(summary, annualization_factor)

Unnamed: 0,MAE
CAPM Alpha,0.020402
AQR Alpha,0.022993
FF3 Alpha,0.023984
FF5 Alpha,0.031367


In [92]:
# for CS Regressions 
summary_cs.loc[['MAE']].T


Unnamed: 0,MAE
CAPM CS,0.020384
AQR CS,0.016456
FF3 CS,0.015113
FF5 CS,0.013021


#### Implications of Cross-Sectional MAE being Lower:

##### Focus on Cross-Sectional Analysis: The model is likely better suited for applications that rely on understanding the differences in average returns across assets, such as asset pricing or portfolio construction based on risk premiums.
##### Less Reliable for Time-Series Forecasting: If you're looking to use this model for predicting future returns or understanding time-varying dynamics (e.g., tactical asset allocation), the higher time-series MAE suggests it may be less effective. It might not capture ##### market timing or short-term fluctuations as well.
##### Factor Effectiveness: This outcome can indicate that the factors in the model are stronger explanatory variables in a cross-sectional context (explaining why some assets have higher or lower average returns) than in a time-series context (explaining fluctuations within each asset's return over time).
##### Example Context:
For instance, the Fama-French 5-Factor (FF5) model often performs well in cross-sectional tests because it captures broad asset characteristics (like size, value, and profitability) that distinguish returns across different stocks. However, it might not track each stock’s return changes over time as accurately, because those factors don't capture all time-specific risks or behaviors affecting returns day-to-day or month-to-month.