# Cross-sectional regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats

# Load the hml dataset (first CSV column becomes a parsed date index),
# round every value to 6 decimals and drop rows containing missing values.
hml_raw = pd.read_csv('../data/hml.csv', index_col=0, parse_dates=True)
hml = hml_raw.round(6).dropna()

# Preview: columns at positions 1-3, restricted to January 2005
sample = hml.iloc[:, 1:4].copy()
sample.loc["20050101":"20050131"]

Unnamed: 0,alpha001,alpha002,alpha003
2005-01-03,-0.002228,0.000484,0.003297
2005-01-04,-0.000182,0.001923,0.002211
2005-01-05,0.004365,0.002734,-0.00745
2005-01-06,0.000442,0.001133,-0.002207
2005-01-07,-0.00201,-0.002004,0.002415
2005-01-10,0.001498,-0.00207,9e-05
2005-01-11,0.001511,-0.00097,-0.003027
2005-01-12,-0.002956,-0.001916,-0.002593
2005-01-13,-0.001336,-0.000933,-0.000841
2005-01-14,0.000305,0.002421,-0.001603


## alphas exploration

### T test for alphas

Check whether the mean of each alpha is significantly different from zero using a one-sample t-test (5% significance level).

In [10]:
# One-sample t-test (H0: mean == 0) for every alpha column of `hml`.
# 'AAPL' and 'eqw' are not alphas, so they are skipped and their rows stay NaN.
alphas_t_test_results = pd.DataFrame(index=hml.columns, columns=['t-stat', 'p-value', '<=0.05'])
for col in hml.columns:
    if col in ('AAPL', 'eqw'):
        continue
    t_stat, p_value = stats.ttest_1samp(hml[col], 0)
    alphas_t_test_results.loc[col, 't-stat'] = t_stat  # type: ignore
    alphas_t_test_results.loc[col, 'p-value'] = p_value  # type: ignore

# Decide significance on the *unrounded* p-values: rounding first would turn
# e.g. p = 0.0503 into 0.05 and wrongly flag it as significant.
# Skipped columns have NaN p-values, which compare as False.
is_significant = alphas_t_test_results['p-value'].astype(float) <= 0.05

# Round only for presentation, then attach the flag computed above.
alphas_t_test_results = alphas_t_test_results.astype(float).round(3)
alphas_t_test_results['<=0.05'] = is_significant

# Number of significant alphas, followed by the significant rows ordered by p-value
significant_results = alphas_t_test_results[alphas_t_test_results['<=0.05']]
print(significant_results.shape[0])
significant_results.sort_values('p-value', ascending=True)

28


Unnamed: 0,t-stat,p-value,<=0.05
alpha006,-4.888,0.0,True
alpha051,-12.422,0.0,True
alpha049,-14.29,0.0,True
alpha046,-10.558,0.0,True
alpha042,-5.46,0.0,True
alpha040,-4.786,0.0,True
alpha038,-20.212,0.0,True
alpha035,5.078,0.0,True
alpha034,-15.396,0.0,True
alpha033,-26.1,0.0,True


In [12]:
# Labels of the alphas whose t-test was flagged significant
significant_alpha_names = alphas_t_test_results.loc[alphas_t_test_results['<=0.05']].index

# Persist the labels, one per line
with open("../data/significant_alphas_list.txt", "w") as f:
    f.writelines(f"{alpha}\n" for alpha in significant_alpha_names)

# Keep only the significant alpha columns of the dataset
significant_alphas = hml[significant_alpha_names]


### Correlation matrix

Check the pairwise correlations between the significant alphas.

In [4]:
# Pairwise correlation matrix rendered as a heat-map; axis=None applies one
# colour scale across the whole matrix rather than per row/column.
(
    significant_alphas
    .corr()
    .style
    .background_gradient(cmap='coolwarm', axis=None)
)  # type: ignore

Unnamed: 0,alpha006,alpha008,alpha009,alpha010,alpha012,alpha013,alpha014,alpha016,alpha017,alpha018,alpha020,alpha022,alpha025,alpha028,alpha030,alpha033,alpha034,alpha035,alpha038,alpha040,alpha042,alpha046,alpha047,alpha049,alpha051,alpha055,alpha060,alpha101
alpha006,1.0,0.020835,0.023014,0.022877,0.033809,0.041663,0.065161,0.080432,0.050501,0.002522,0.047033,0.015191,0.068367,0.007938,0.00706,0.012104,0.003689,0.006634,-0.007234,0.021363,-0.018786,-0.023667,0.003067,0.021085,0.008419,-0.014054,0.020449,-0.041293
alpha008,0.020835,1.0,0.03414,0.040921,0.022419,-0.009308,-0.025904,-0.018806,0.063815,0.019251,0.009523,0.016093,0.052633,-0.033322,0.021902,0.016207,-0.007283,-0.03861,0.004962,0.013515,0.022602,0.005622,-0.014212,0.014866,0.024191,-0.017741,0.017316,0.018538
alpha009,0.023014,0.03414,1.0,0.231815,0.062989,0.023091,0.004632,0.043103,0.002698,0.020074,0.051238,-0.015472,-0.022325,0.040438,-0.015855,0.025268,0.017398,-0.034786,0.007446,-0.002974,0.015822,0.034933,-0.018137,0.066981,0.043343,0.024047,0.002717,-0.0158
alpha010,0.022877,0.040921,0.231815,1.0,0.030796,-0.000208,-0.003515,0.02885,0.018818,0.021041,0.001707,-0.026327,-0.015866,0.024914,0.012023,0.023961,0.008076,-0.027261,0.022991,0.052011,0.005275,0.033547,-0.004496,0.081145,0.04043,-0.000229,0.029319,-0.019869
alpha012,0.033809,0.022419,0.062989,0.030796,1.0,0.053619,-0.001033,0.022349,0.038402,0.022494,0.022445,0.000635,0.016885,-0.003641,0.023749,0.020706,0.001618,0.011584,0.004569,-0.006898,0.004016,-0.020964,0.01697,0.033851,0.045933,0.00826,0.019428,0.030243
alpha013,0.041663,-0.009308,0.023091,-0.000208,0.053619,1.0,-0.003361,0.036213,0.021227,-0.021453,0.001553,0.006596,0.039208,0.018904,0.005738,0.005854,0.025141,-0.021418,0.019282,0.017618,0.017472,0.006303,-0.010788,-0.024868,-0.002968,-0.012912,-0.040612,0.017413
alpha014,0.065161,-0.025904,0.004632,-0.003515,-0.001033,-0.003361,1.0,0.018869,0.038636,0.024114,0.033714,-0.018003,0.042749,-0.021594,-0.024393,0.012873,-0.026399,0.035032,0.022115,0.030571,0.023604,0.004945,-0.037127,0.005975,0.027372,0.010509,0.003045,0.003355
alpha016,0.080432,-0.018806,0.043103,0.02885,0.022349,0.036213,0.018869,1.0,0.064076,0.027259,0.013589,0.020653,0.013907,-0.027745,0.025847,0.048609,-0.002893,0.009313,0.063528,0.003122,0.023284,-0.000803,0.002777,0.026374,0.032289,0.009756,0.019137,0.025314
alpha017,0.050501,0.063815,0.002698,0.018818,0.038402,0.021227,0.038636,0.064076,1.0,0.069349,0.032925,0.037079,0.062411,0.015632,0.006865,0.064605,0.008643,-0.003317,0.051411,0.022591,0.055769,0.02387,0.02302,0.028731,0.054173,0.01145,0.030896,0.023556
alpha018,0.002522,0.019251,0.020074,0.021041,0.022494,-0.021453,0.024114,0.027259,0.069349,1.0,0.035076,0.015814,0.046108,0.016746,0.016736,0.049877,-0.006272,0.022292,0.028595,0.00534,0.058525,0.005001,0.047338,0.017496,0.035379,0.015596,0.061881,-0.009115


In [None]:
# Average and maximum absolute pairwise correlation between the significant alphas.
corr = significant_alphas.corr().abs()  # type: ignore

# Blank out the diagonal (self-correlation == 1) so it does not inflate the
# statistics. A boolean mask is used instead of np.fill_diagonal(corr.values, ...)
# because `.values` is not guaranteed to be a writable view of the frame
# (it is read-only under pandas Copy-on-Write).
off_diagonal = ~np.eye(len(corr), dtype=bool)
corr = corr.where(off_diagonal)

# Mean of the per-column means; NaNs on the diagonal are ignored
avg_corr = corr.mean().mean()
avg_corr = avg_corr.round(3)  # type: ignore

# (average abs correlation, max abs correlation)
float(avg_corr), float(corr.max().max())

(0.024, 0.2318147327820299)

### Regression of AAPL on the significant alphas and ewp

A multiple linear regression (OLS) of the AAPL stock on the significant alphas and the equally weighted portfolio (ewp), to see whether these regressors are significant.

In [6]:
# Dependent variable (AAPL) and the equally weighted portfolio regressor
aapl = hml['AAPL'].copy()
ewp = hml['eqw'].copy()

# Regressors: significant alphas plus eqw, rows sorted by index, with an intercept
regressors = pd.concat([significant_alphas, ewp], axis=1, join='inner')  # type: ignore
X = sm.add_constant(regressors.sort_index())

# Ordinary least squares fit and its summary table
model = sm.OLS(aapl, X).fit()
model.summary()


0,1,2,3
Dep. Variable:,AAPL,R-squared:,0.336
Model:,OLS,Adj. R-squared:,0.332
Method:,Least Squares,F-statistic:,91.73
Date:,"Tue, 22 Apr 2025",Prob (F-statistic):,0.0
Time:,09:04:52,Log-Likelihood:,14119.0
No. Observations:,5285,AIC:,-28180.0
Df Residuals:,5255,BIC:,-27980.0
Df Model:,29,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0008,0.000,2.748,0.006,0.000,0.001
alpha006,-0.0728,0.087,-0.841,0.400,-0.243,0.097
alpha008,0.1266,0.092,1.374,0.169,-0.054,0.307
alpha009,0.1151,0.122,0.944,0.345,-0.124,0.354
alpha010,0.0801,0.114,0.703,0.482,-0.143,0.304
alpha012,0.0085,0.099,0.086,0.932,-0.186,0.203
alpha013,0.0241,0.090,0.269,0.788,-0.152,0.200
alpha014,-0.0798,0.098,-0.816,0.414,-0.272,0.112
alpha016,-0.0994,0.090,-1.110,0.267,-0.275,0.076

0,1,2,3
Omnibus:,890.202,Durbin-Watson:,1.926
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10527.698
Skew:,0.436,Prob(JB):,0.0
Kurtosis:,9.859,Cond. No.,617.0


## Rolling t-test for alphas

Each month end, we will check if the alphas are significantly different from zero using a t-test.

In [7]:
# Monthly one-sample t-tests (H0: mean == 0) for every alpha column.
# Results are indexed by business month end; one row per month, one column per alpha.
month_ends = pd.date_range(start=hml.index[0], end=hml.index[-1], freq='BME').to_list()
alphas_returns = hml.drop(columns=['eqw', 'AAPL']).copy()

rolling_result_p = pd.DataFrame(index=month_ends, columns=alphas_returns.columns)
rolling_result_bool = pd.DataFrame(index=month_ends, columns=alphas_returns.columns)

previous_month_end = None
for month_end in month_ends:
    # Window is (previous_month_end, month_end]: the previous month-end day is
    # excluded so that no observation is counted in two consecutive months.
    # The first month has no previous month end and runs from the start of the data.
    in_window = alphas_returns.index <= month_end
    if previous_month_end is not None:
        in_window &= alphas_returns.index > previous_month_end
    month_data = alphas_returns.loc[in_window]
    previous_month_end = month_end

    # axis=0 tests every column in a single call
    _, p_values = stats.ttest_1samp(month_data.to_numpy(), 0, axis=0)

    # Threshold the raw p-values; round only the stored copy so that rounding
    # cannot push a value such as 0.05003 down to 0.05 and flip the flag.
    rolling_result_bool.loc[month_end, :] = p_values <= 0.05
    rolling_result_p.loc[month_end, :] = np.round(p_values, 4)

In [8]:
# Save the monthly p-value table for later use
rolling_result_p.to_csv(path_or_buf='../data/rolling_result_p.csv')