# Part 1: Returns, Risk, and Factors – Replication

module import

In [1]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
from scipy.stats import f

according to teacher's email, we can use monthly data instead of daily data to do the regress in the sake of convenience.

In each datasheet, the meaning of conlumn names in the sheet can be found at the according txt file, for example, the meaning of "STK_MKT_FIVEFACMONTH.xlsx" can be found in "STK_MKT_FIVEFACMONTH\[DES\]\[xlsx\].txt".

In [2]:
#  check the current working directory
os.getcwd()

'/mnt/e/MyPythonProject/investment_project'

# 1.1 Import the factor_data

In [3]:
factor_data = pd.read_excel(
    'source_data/5-Factor Model Index (Monthly)/STK_MKT_FIVEFACMONTH.xlsx')  # warning means no error, let's just ignore and continue!

  warn("Workbook contains no default style, apply openpyxl's default")


clean the data: **factor_data**, during which we delete the unnecessary rows and columns, and rename the columns.

according to the definition of Portfolios, we only use the ones marked as 1.

note: Portfolios [Portfolio Type] - 1 represents 2\*3 portfolio division method; 2 represents 2\*2 portfolio division method; 3 represents 2\*2\*2\*2 portfolio division method.

In [4]:
factor_data = factor_data[
    ['MarkettypeID', 'TradingMonth', 'Portfolios', 'RiskPremium2', 'SMB2', 'HML2', 'RMW2', 'CMA2']]
factor_data.columns = ['MarkettypeID', 'date', 'portfolios', 'risk_premium', 'smb', 'hml', 'rmw', 'cma']
factor_data = factor_data.iloc[2:, :]

In [5]:
factor_data = factor_data[factor_data['portfolios'] == 1]
factor_data = factor_data[
    factor_data['MarkettypeID'] == "P9706"]  # P9706: SSE-SZSE A share market (excluding STAR Market, ChiNext)
factor_data.reset_index(drop=True, inplace=True)  # reset the index to 0,1,2,...

In [6]:
factor_data.head(5)  # check the first 5 rows

Unnamed: 0,MarkettypeID,date,portfolios,risk_premium,smb,hml,rmw,cma
0,P9706,2000-01,1,0.135225,-0.005175,-0.104151,0.042289,-0.076779
1,P9706,2000-02,1,0.113951,0.032327,-0.002393,-0.011365,0.03855
2,P9706,2000-03,1,0.058133,0.069624,0.01608,-0.051447,0.065904
3,P9706,2000-04,1,0.015578,-0.010779,0.02358,-0.022984,0.00972
4,P9706,2000-05,1,0.027197,0.025777,0.025355,-0.00662,-0.005409


In [7]:
factor_data.tail(5)  # check the last 5 rows

Unnamed: 0,MarkettypeID,date,portfolios,risk_premium,smb,hml,rmw,cma
284,P9706,2023-09,1,-0.002334,0.0091,0.015482,-0.00497,-0.003365
285,P9706,2023-10,1,-0.02938,0.026882,-0.009619,-0.016592,-0.003727
286,P9706,2023-11,1,-0.00157,0.052495,-0.008304,-0.021795,0.019977
287,P9706,2023-12,1,-0.015435,0.005892,0.002136,0.019856,-0.014981
288,P9706,2024-01,1,-0.057066,-0.108162,0.108981,-0.003561,-0.007933


# 1.2 Import the return data of each stock

In [8]:
stock_return = pd.read_excel(
    'source_data/Monthly Stock Price Returns/TRD_Mnth.xlsx')  # this is time-consuming, taking me about 41 seconds to load the data

  warn("Workbook contains no default style, apply openpyxl's default")


In [9]:
stock_return = stock_return[['Stkcd', 'Trdmnt',
                             'Mretnd']]  # calculate the return of each stock, which is the monthly return without cash dividend reinvested of each stock
stock_return = stock_return.iloc[2:, :]

In [10]:
stock_return.head(5)  # check the first 5 rows

Unnamed: 0,Stkcd,Trdmnt,Mretnd
2,1,2000-01,0.061891
3,1,2000-02,-0.011333
4,1,2000-03,0.002729
5,1,2000-04,0.037017
6,1,2000-05,-0.055118


In [11]:
stock_return.shape  # check the shape of the data

(730499, 3)

# calculate the market risk premium

note that whatever method we use, the calculation of market risk premium is always the same

In [12]:
# import and clean the market return data
mkt_return = pd.read_excel('source_data/万得全A.xlsx')  # load the market return data
mkt_return = mkt_return.iloc[3:, :]
mkt_return.columns = ['date', 'mkt_index']
mkt_return['date'] = mkt_return['date'].apply(lambda x: x.strftime('%Y-%m-%d'))
mkt_return['date'] = pd.to_datetime(mkt_return['date'])
mkt_return.reset_index(drop=True, inplace=True)
mkt_return.head(5)

Unnamed: 0,date,mkt_index
0,1994-12-30,379.3402
1,1995-01-03,375.9952
2,1995-01-04,385.7705
3,1995-01-05,381.6151
4,1995-01-06,379.3633


In [13]:
mkt_return['daily_mkt_return'] = mkt_return['mkt_index'].pct_change()  # get the market index return
start_time = pd.to_datetime("2000-01-01")
start_time = np.array(start_time, dtype=np.datetime64)
end_time = np.array(pd.to_datetime("2024-01-01"), dtype=np.datetime64)
mkt_return = mkt_return[
    (mkt_return['date'] >= start_time) & (mkt_return['date'] < end_time)]  # get data in target time range
mkt_return.head(5)

  mkt_return['daily_mkt_return'] = mkt_return['mkt_index'].pct_change()  # get the market index return


Unnamed: 0,date,mkt_index,daily_mkt_return
1227,2000-01-04,1031.0334,0.031033
1228,2000-01-05,1034.4385,0.003303
1229,2000-01-06,1076.512,0.040673
1230,2000-01-07,1117.7781,0.038333
1231,2000-01-10,1141.0309,0.020803


In [14]:
# import and clean the risk-free rate data
rf = pd.read_excel('source_data/Risk-Free Rate/TRD_Nrrate.xlsx')  # load the risk-free rate data
rf = rf.iloc[2:, :]
rf = rf[['Clsdt', 'Nrrdaydt','Nrrdata']]
rf.columns = ['date', 'daily_rf','rf']
rf['date'] = pd.to_datetime(rf['date'])
rf['daily_rf'] = rf['daily_rf'] / 100  # change the unit of the risk-free rate to decimal
rf['rf'] = rf['rf'] / 100  # change the unit of the risk-free rate to decimal
rf.head(5)

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,date,daily_rf,rf
2,2000-01-01,6.1e-05,0.0225
3,2000-01-02,6.1e-05,0.0225
4,2000-01-03,6.1e-05,0.0225
5,2000-01-04,6.1e-05,0.0225
6,2000-01-05,6.1e-05,0.0225


In [15]:
rf['month'] = rf['date'].dt.to_period('M')
rf_monthly = rf.groupby('month').agg({'daily_rf': [lambda x: (1 + x).prod() - 1], 'rf': 'last'})
rf_monthly.reset_index(inplace=True)
rf_monthly.columns = ['month', 'monthly_rf','rf']
rf_monthly.head(5)

Unnamed: 0,month,monthly_rf,rf
0,2000-01,0.001893,0.0225
1,2000-02,0.001771,0.0225
2,2000-03,0.001893,0.0225
3,2000-04,0.001832,0.0225
4,2000-05,0.001893,0.0225


In [16]:
mkt_risk_premium = mkt_return.merge(rf, left_on='date', right_on='date',
                                    how='left')  # merge the market return data with the risk-free rate data
mkt_risk_premium['daily_mkt_risk_premium'] = mkt_risk_premium['daily_mkt_return'] - mkt_risk_premium[
    'rf']  # calculate the market risk premium
mkt_risk_premium.head(5)  # so we get the market risk premium factor in target time range

Unnamed: 0,date,mkt_index,daily_mkt_return,daily_rf,rf,month,daily_mkt_risk_premium
0,2000-01-04,1031.0334,0.031033,6.1e-05,0.0225,2000-01,0.008533
1,2000-01-05,1034.4385,0.003303,6.1e-05,0.0225,2000-01,-0.019197
2,2000-01-06,1076.512,0.040673,6.1e-05,0.0225,2000-01,0.018173
3,2000-01-07,1117.7781,0.038333,6.1e-05,0.0225,2000-01,0.015833
4,2000-01-10,1141.0309,0.020803,6.1e-05,0.0225,2000-01,-0.001697


In [17]:
# for we will use monthly data to replicate Fama-French 5-factor model, we need to calculate the monthly market risk premium
mkt_risk_premium['year_month'] = mkt_risk_premium['date'].dt.to_period('M')
mkt_risk_premium = mkt_risk_premium.groupby('year_month').agg(
    {'daily_mkt_risk_premium': [lambda x: (1 + x).prod() - 1]})
mkt_risk_premium.reset_index(inplace=True)
mkt_risk_premium.columns = ['month', 'mkt_risk_premium']

In [18]:
mkt_risk_premium.head(5)

Unnamed: 0,month,mkt_risk_premium
0,2000-01,-0.244185
1,2000-02,-0.14394
2,2000-03,-0.373996
3,2000-04,-0.35724
4,2000-05,-0.316809


# 1.3 solve the problem of point 3 in part 1

## 1.3.1 use variance to stand for the risk

the requirement in the pdf is "Report summary statistics of stock risk and returns“, so we just use the variance of the stock return as the risk of the stock.

In [19]:
stock_return_stat = stock_return.groupby('Stkcd').agg({'Mretnd': ['mean', 'var', 'min', 'max', 'std',
                                                                  lambda x: x.quantile(.25), lambda x: x.quantile(.5),
                                                                  lambda x: x.quantile(.75), 'skew', lambda
                                                                      x: x.kurt()]})  # calculate the mean and variance of the stock return
stock_return_stat.columns = ['mean', 'var', 'min', 'max', 'std', '25% quantile', '50% quantile', '75% quantile', 'skew',
                             'kurtosis']  # rename the columns

In [20]:
stock_return_stat.head(5)

Unnamed: 0_level_0,mean,var,min,max,std,25% quantile,50% quantile,75% quantile,skew,kurtosis
Stkcd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.009431,0.011582,-0.305195,0.37447,0.10762,-0.059481,0.004307,0.060023,0.47665,1.167765
2,0.018041,0.01589,-0.296407,0.621102,0.126057,-0.056359,0.004945,0.07886,1.34035,4.50727
3,-0.014095,0.028871,-0.475,0.281609,0.169914,-0.090354,-0.006667,0.089521,-0.728428,1.150957
4,0.013136,0.022394,-0.389978,0.783523,0.149648,-0.079136,0.00867,0.096106,0.640154,2.904307
5,0.010153,0.047893,-0.422857,2.858537,0.218845,-0.072539,-0.011299,0.05364,8.346999,105.749978


In [21]:
stock_return_stat.apply(
    'mean')  # calculate the mean of each column, which is the summary statistics of the whole market

mean            0.004765
var             0.028576
min            -0.333545
max             0.703519
std             0.153934
25% quantile   -0.078741
50% quantile   -0.008728
75% quantile    0.068607
skew            1.211271
kurtosis        7.506872
dtype: object

In [22]:
stock_return_stat.to_excel(r'output/part1/1.3 stock_return_stat.xlsx')  # save the result to an Excel file

variable "stock_return_stat" is the summary statistics of stock risk and returns: Stkcd means "stock code".

## 1.3.2 use the factor exposure to stand for the risk

if we use the according factor exposure to stand for the risk, denote $r_{it}$ as the return of stock i at time t, we should run the regression $$r_{it} = \alpha_i + \beta_{i1}f_{1t} + \beta_{i2}f_{2t} + \beta_{i3}f_{3t} + \beta_{i4}f_{4t} + \beta_{i5}f_{5t} + \epsilon_{it}, t=1,2,3,...,T$$ where $f_{1t}, f_{2t}, f_{3t}, f_{4t}, f_{5t}$ are the factors, $\beta_{i1}, \beta_{i2}, \beta_{i3}, \beta_{i4}, \beta_{i5}$ are the factor exposures, $\alpha_i$ is the alpha, and $\epsilon_{it}$ is the error term.

So we can get $\beta_{i1}, \beta_{i2}, \beta_{i3}, \beta_{i4}, \beta_{i5}$ as the risk of the stock. and finally we can have a table of the risk of each stock.

In [23]:
# regress each stock's return on the factors
tmp_return = stock_return.merge(factor_data, left_on='Trdmnt', right_on='date',
                                how='left')  # merge the stock return data with the factor data
tmp_return.head(5)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,MarkettypeID,date,portfolios,risk_premium,smb,hml,rmw,cma
0,1,2000-01,0.061891,P9706,2000-01,1,0.135225,-0.005175,-0.104151,0.042289,-0.076779
1,1,2000-02,-0.011333,P9706,2000-02,1,0.113951,0.032327,-0.002393,-0.011365,0.03855
2,1,2000-03,0.002729,P9706,2000-03,1,0.058133,0.069624,0.01608,-0.051447,0.065904
3,1,2000-04,0.037017,P9706,2000-04,1,0.015578,-0.010779,0.02358,-0.022984,0.00972
4,1,2000-05,-0.055118,P9706,2000-05,1,0.027197,0.025777,0.025355,-0.00662,-0.005409


In [24]:
tmp_return[tmp_return['Stkcd'] == '000001'].tail(5)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,MarkettypeID,date,portfolios,risk_premium,smb,hml,rmw,cma
282,1,2023-09,0.006289,P9706,2023-09,1,-0.002334,0.0091,0.015482,-0.00497,-0.003365
283,1,2023-10,-0.066071,P9706,2023-10,1,-0.02938,0.026882,-0.009619,-0.016592,-0.003727
284,1,2023-11,-0.07457,P9706,2023-11,1,-0.00157,0.052495,-0.008304,-0.021795,0.019977
285,1,2023-12,-0.029959,P9706,2023-12,1,-0.015435,0.005892,0.002136,0.019856,-0.014981
286,1,2024-01,0.007455,P9706,2024-01,1,-0.057066,-0.108162,0.108981,-0.003561,-0.007933


In [25]:
tmp_return[tmp_return['Stkcd'] == '000002'].tail(
    5)  # we can find that the result is the same as the one above, so our merge is correct

Unnamed: 0,Stkcd,Trdmnt,Mretnd,MarkettypeID,date,portfolios,risk_premium,smb,hml,rmw,cma
565,2,2023-09,-0.041056,P9706,2023-09,1,-0.002334,0.0091,0.015482,-0.00497,-0.003365
566,2,2023-10,-0.133792,P9706,2023-10,1,-0.02938,0.026882,-0.009619,-0.016592,-0.003727
567,2,2023-11,0.009709,P9706,2023-11,1,-0.00157,0.052495,-0.008304,-0.021795,0.019977
568,2,2023-12,-0.085664,P9706,2023-12,1,-0.015435,0.005892,0.002136,0.019856,-0.014981
569,2,2024-01,-0.082218,P9706,2024-01,1,-0.057066,-0.108162,0.108981,-0.003561,-0.007933


In [26]:
# tmp_return.drop(columns=['date', 'rf', 'month'], inplace=True)
tmp_return.drop(columns=['MarkettypeID', 'date', 'portfolios'], inplace=True)
tmp_return.head(10)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,risk_premium,smb,hml,rmw,cma
0,1,2000-01,0.061891,0.135225,-0.005175,-0.104151,0.042289,-0.076779
1,1,2000-02,-0.011333,0.113951,0.032327,-0.002393,-0.011365,0.03855
2,1,2000-03,0.002729,0.058133,0.069624,0.01608,-0.051447,0.065904
3,1,2000-04,0.037017,0.015578,-0.010779,0.02358,-0.022984,0.00972
4,1,2000-05,-0.055118,0.027197,0.025777,0.025355,-0.00662,-0.005409
5,1,2000-06,0.007222,0.022145,-0.026737,0.032628,0.002911,0.002796
6,1,2000-07,0.02096,0.044277,0.017068,0.03052,-0.002764,0.045901
7,1,2000-08,-0.041059,-0.008596,0.042098,-0.017277,-0.032858,0.013269
8,1,2000-09,-0.044507,-0.049773,0.028192,-0.034215,-0.003422,0.011796
9,1,2000-10,0.034788,0.023503,0.032889,0.013666,0.001559,0.00692


In [27]:
# tmp_return['Trdmnt'] = (tmp_return['Trdmnt']).astype(str)
tmp_return['Trdmnt'] = pd.to_datetime(tmp_return['Trdmnt']).dt.to_period('M')
tmp_return.shape

(730499, 8)

In [28]:
# add rf to tmp_return, this is a little time-consuming (1 min)
tmp_return['rf'] = tmp_return['Trdmnt'].apply(lambda x: rf_monthly[rf_monthly['month'] == x]['rf'].values[0])
tmp_return.shape

(730499, 9)

In [29]:
tmp_return['excess_return'] = tmp_return['Mretnd'] - tmp_return['rf']  # calculate the excess return
tmp_return.head(10)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,risk_premium,smb,hml,rmw,cma,rf,excess_return
0,1,2000-01,0.061891,0.135225,-0.005175,-0.104151,0.042289,-0.076779,0.0225,0.039391
1,1,2000-02,-0.011333,0.113951,0.032327,-0.002393,-0.011365,0.03855,0.0225,-0.033833
2,1,2000-03,0.002729,0.058133,0.069624,0.01608,-0.051447,0.065904,0.0225,-0.019771
3,1,2000-04,0.037017,0.015578,-0.010779,0.02358,-0.022984,0.00972,0.0225,0.014517
4,1,2000-05,-0.055118,0.027197,0.025777,0.025355,-0.00662,-0.005409,0.0225,-0.077618
5,1,2000-06,0.007222,0.022145,-0.026737,0.032628,0.002911,0.002796,0.0225,-0.015278
6,1,2000-07,0.02096,0.044277,0.017068,0.03052,-0.002764,0.045901,0.0225,-0.00154
7,1,2000-08,-0.041059,-0.008596,0.042098,-0.017277,-0.032858,0.013269,0.0225,-0.063559
8,1,2000-09,-0.044507,-0.049773,0.028192,-0.034215,-0.003422,0.011796,0.0225,-0.067007
9,1,2000-10,0.034788,0.023503,0.032889,0.013666,0.001559,0.00692,0.0225,0.012288


In [30]:
tmp_return.head(5)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,risk_premium,smb,hml,rmw,cma,rf,excess_return
0,1,2000-01,0.061891,0.135225,-0.005175,-0.104151,0.042289,-0.076779,0.0225,0.039391
1,1,2000-02,-0.011333,0.113951,0.032327,-0.002393,-0.011365,0.03855,0.0225,-0.033833
2,1,2000-03,0.002729,0.058133,0.069624,0.01608,-0.051447,0.065904,0.0225,-0.019771
3,1,2000-04,0.037017,0.015578,-0.010779,0.02358,-0.022984,0.00972,0.0225,0.014517
4,1,2000-05,-0.055118,0.027197,0.025777,0.025355,-0.00662,-0.005409,0.0225,-0.077618


In [31]:
# this cell costs me 6 seconds to run
def regress(data):
    X = data[['risk_premium', 'smb', 'hml', 'rmw', 'cma']]
    X = sm.add_constant(X)
    y = data['excess_return']
    model = sm.OLS(np.array(y.astype(float)), X.astype(float))
    result = model.fit()
    params_and_tvalues = np.append(result.params, result.tvalues)
    return params_and_tvalues


betas = (tmp_return.groupby('Stkcd').apply(regress)).apply(
    pd.Series)  # calculate the correlation between the stock return and the risk premium
betas.columns = ['const', 'risk_premium', 'smb', 'hml', 'rmw', 'cma', 't-const', 't-risk_premium', 't-smb', 't-hml',
                 't-rmw', 't-cma']

  betas = (tmp_return.groupby('Stkcd').apply(regress)).apply(


In [32]:
betas.head(5)

Unnamed: 0_level_0,const,risk_premium,smb,hml,rmw,cma,t-const,t-risk_premium,t-smb,t-hml,t-rmw,t-cma
Stkcd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,-0.014261,1.109029,-0.686645,0.184607,-0.5584,-0.314337,-3.410943,18.556582,-4.333584,1.118235,-2.127088,-1.143171
2,-0.008569,1.079687,-0.196118,0.194538,0.570821,-0.411982,-1.380179,11.949055,-0.83176,0.805877,1.478631,-1.024524
3,-0.083139,2.075144,-0.118815,-0.067786,-0.596165,1.560055,-2.436266,3.711818,-0.082617,-0.05661,-0.295534,0.825199
4,-0.02303,0.927457,1.465291,-0.042892,-0.486143,-0.772547,-3.420062,9.724815,5.775238,-0.163517,-1.166218,-1.775932
5,-0.021164,1.141539,0.912234,-1.561096,-0.30218,0.866282,-1.900396,7.26888,2.180282,-3.394862,-0.439964,1.182638


In [33]:
betas.to_excel(r'output/part1/1.3 betas.xlsx')  # save the result to an Excel file

variable "betas" is the risk of each stock: Stkcd means "stock code", const means the alpha, risk_premium means the risk premium, smb means the size factor, hml means the value factor, rmw means the profitability factor, cma means the investment factor.

# 1.4 replicate Fama-French 5-factor model

There're altogather 3 methods in the paper to construct the 5-factor model, we will construct sequentially.

No matter what method we use, the formula is always the same, 

$$R_{it}-R_{Ft}=a_i+b_i(R_{Mt}-R_{Ft})+s_iSMB_t+h_i HML_t+r_iRMW_t+c_iCMA_t+e_{it}$$

where $R_{it}$ is the return of stock i at time t, $R_{Ft}$ is the risk-free rate at time t, $R_{Mt}$ is the market return at time t, $SMB_t$ is the size factor at time t, $HML_t$ is the value factor at time t, $RMW_t$ is the profitability factor at time t, $CMA_t$ is the investment factor at time t, $a_i$ is the alpha of stock i, $b_i$ is the beta of the market return, $s_i$ is the beta of the size factor, $h_i$ is the beta of the value factor, $r_i$ is the beta of the profitability factor, $c_i$ is the beta of the investment factor, $e_{it}$ is the error term.

the construction process of the 5-factor model is as follows:(from fama-french 5-factor model.pdf, see it in [here](papers/Fama-French A five-factor asset pricing model.pdf)

![ff5.png](img/ff5.png)

Similarly, we need to calculate the monthly stock price returns from daily data.

In [34]:
"""
import daily stock price return data, and calculate the monthly stock price returns
because this step is really, really time-consuming, (about 13 min on i7-13700K with PCIE 4.0 SSD)
there's altogether over 14 million rows in the daily stock price return data, 
we only run this cell once, and then save the result to an Excel file in ./output/monthly_stock_return.xlsx 
"""
# daily_stock_return = pd.read_excel('source_data/Daily Stock Price Returns 2000-2004/TRD_Dalyr.xlsx')  # load the daily stock price return data
# daily_stock_return = daily_stock_return.iloc[2:,:]
# tmp1 = pd.read_excel('source_data/Daily Stock Price Returns 2000-2004/TRD_Dalyr1.xlsx')  
# tmp1 = tmp1.iloc[2:,:]
# tmp2 = pd.read_excel('source_data/Daily Stock Price Returns 2005-2009/TRD_Dalyr.xlsx')
# tmp2 = tmp2.iloc[2:,:]
# tmp3 = pd.read_excel('source_data/Daily Stock Price Returns 2005-2009/TRD_Dalyr1.xlsx')
# tmp3 = tmp3.iloc[2:,:]
# tmp4 = pd.read_excel('source_data/Daily Stock Price Returns 2010-2014/TRD_Dalyr.xlsx')
# tmp4 = tmp4.iloc[2:,:]
# tmp5 = pd.read_excel('source_data/Daily Stock Price Returns 2010-2014/TRD_Dalyr1.xlsx')
# tmp5 = tmp5.iloc[2:,:]
# tmp6 = pd.read_excel('source_data/Daily Stock Price Returns 2010-2014/TRD_Dalyr2.xlsx')
# tmp6 = tmp6.iloc[2:,:]
# tmp7 = pd.read_excel('source_data/Daily Stock Price Returns 2015-2019/TRD_Dalyr.xlsx')
# tmp7 = tmp7.iloc[2:,:]
# tmp8 = pd.read_excel('source_data/Daily Stock Price Returns 2015-2019/TRD_Dalyr1.xlsx')
# tmp8 = tmp8.iloc[2:,:]
# tmp9 = pd.read_excel('source_data/Daily Stock Price Returns 2015-2019/TRD_Dalyr2.xlsx')
# tmp9 = tmp9.iloc[2:,:]
# tmp10 = pd.read_excel('source_data/Daily Stock Price Returns 2015-2019/TRD_Dalyr3.xlsx')
# tmp10 = tmp10.iloc[2:,:]
# tmp11 = pd.read_excel('source_data/Daily Stock Price Returns 2020-2024/TRD_Dalyr.xlsx')
# tmp11 = tmp11.iloc[2:,:]
# tmp12 = pd.read_excel('source_data/Daily Stock Price Returns 2020-2024/TRD_Dalyr1.xlsx')
# tmp12 = tmp12.iloc[2:,:]
# tmp13 = pd.read_excel('source_data/Daily Stock Price Returns 2020-2024/TRD_Dalyr2.xlsx')
# tmp13 = tmp13.iloc[2:,:]
# tmp14 = pd.read_excel('source_data/Daily Stock Price Returns 2020-2024/TRD_Dalyr3.xlsx')
# tmp14 = tmp14.iloc[2:,:]
# tmp15 = pd.read_excel('source_data/Daily Stock Price Returns 2020-2024/TRD_Dalyr4.xlsx')
# tmp15 = tmp15.iloc[2:,:]
# 
# daily_stock_return = pd.concat([daily_stock_return, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15], axis=0)  # concatenate the two dataframes
# 
# daily_stock_return.reset_index(drop=True, inplace=True)
# daily_stock_return.to_feather(r'temp/daily_stock_return.feather')  # save the result to a feather file, which is much more quickly than csv or xlsx
# 
# del tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15  # delete the temporary dataframes

"\nimport daily stock price return data, and calculate the monthly stock price returns\nbecause this step is really, really time-consuming, (about 13 min on i7-13700K with PCIE 4.0 SSD)\nthere's altogether over 14 million rows in the daily stock price return data, \nwe only run this cell once, and then save the result to an Excel file in ./output/monthly_stock_return.xlsx \n"

In [35]:
daily_stock_return = pd.read_feather(r'temp/daily_stock_return.feather')  # load the monthly stock price return data

In [36]:
daily_stock_return = daily_stock_return[['Stkcd', 'Trddt', 'Dsmvtll', 'Dretwd']]
daily_stock_return.columns = ['Stkcd', 'date', 'market_value', 'daily_stock_return']
daily_stock_return['date'] = pd.to_datetime(daily_stock_return['date'])

In [37]:
daily_stock_return.head(20)

Unnamed: 0,Stkcd,date,market_value,daily_stock_return
0,1,2000-01-04,28383283.31,0.048138
1,1,2000-01-05,28026358.48,-0.012575
2,1,2000-01-06,29143688.39,0.039867
3,1,2000-01-07,30323092.18,0.040469
4,1,2000-01-10,31254200.43,0.030706
5,1,2000-01-11,29485094.75,-0.056604
6,1,2000-01-12,28305690.96,-0.04
7,1,2000-01-13,28150506.25,-0.005482
8,1,2000-01-14,27747026.0,-0.014333
9,1,2000-01-17,28057395.42,0.011186


## 1.4.1 Method 1: Construct the 5-factor model using 2$\times$3 portfolio division method

In [38]:
daily_stock_return['month'] = daily_stock_return['date'].dt.to_period('M')
daily_stock_return = daily_stock_return[['Stkcd', 'date', 'month', 'daily_stock_return', 'market_value']]

In [39]:
# calculate monthly return for each stock
"""
this cell is a little time-consuming, taking me about 30 seconds to run with i7-13700K, so I just run it once and save the result to an Excel file in ./temp/monthly_stock_return.feather
"""
# monthly_stock_return = daily_stock_return.groupby(['Stkcd', 'month']).agg({'daily_stock_return': lambda x: (1+x).prod()-1, 'market_value':'last'})  # calculate the monthly return for each stock
# monthly_stock_return.columns = ['monthly_stock_return', 'market_value']
# monthly_stock_return.reset_index(inplace=True)
# monthly_stock_return.to_feather(r'temp/monthly_stock_return.feather')  # save the result to a feather file

'\nthis cell is a little time-consuming, taking me about 30 seconds to run with i7-13700K, so I just run it once and save the result to an Excel file in ./temp/monthly_stock_return.feather\n'

In [40]:
monthly_stock_return = pd.read_feather(r'temp/monthly_stock_return.feather')  # load the monthly stock price return data

### now we will give a tag on S on B, according to the market value of the stock.

In [41]:
def small_or_big(x):
    return pd.qcut(x, 2, labels=['S', 'B'])


monthly_stock_return['S or B'] = monthly_stock_return.groupby("month")['market_value'].transform(
    small_or_big)  # divide the stocks into 2 groups according to the market value
monthly_stock_return['market_value'] = monthly_stock_return[
                                           'market_value'] * 1000  # change the unit of the market value to yuan
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,S or B
0,1,2000-01,0.061891,28755730000.0,B
1,1,2000-02,-0.011333,28429840000.0,B
2,1,2000-03,0.00273,28507430000.0,B
3,1,2000-04,0.037016,29562690000.0,B
4,1,2000-05,-0.055116,27933250000.0,B
5,1,2000-06,0.00722,28134990000.0,B
6,1,2000-07,0.020961,28724690000.0,B
7,1,2000-08,-0.041059,27545290000.0,B
8,1,2000-09,-0.044508,26319330000.0,B
9,1,2000-10,0.034786,27234920000.0,B


### now we will give a tag on H, N or L, according to the book-to-market ratio of the stock.

In [42]:
# balance_sheet = pd.read_excel('source_data/Balance Sheet/FS_Combas.xlsx', skiprows=1)  # load the balance sheet data, using 28s
# balance_sheet.head(5)
# balance_sheet.to_csv(r'source_data/Balance Sheet/balance_sheet.csv')  # save the result to a csv file for quicker loading

In [43]:
balance_sheet = pd.read_csv(r'source_data/Balance Sheet/balance_sheet.csv')  # load the balance sheet data
balance_sheet = balance_sheet[balance_sheet['Statement Type'] == "A"]
balance_sheet.head(5)

  balance_sheet = pd.read_csv(r'source_data/Balance Sheet/balance_sheet.csv')  # load the balance sheet data


Unnamed: 0.1,Unnamed: 0,Stock Code,Stock Short Name,Ending Date of Statistics,Statement Type,Total Current Assets,Total Assets,Total Shareholders’ Equity
1,1,1,SFZA,2000-01-01,A,,43912394151,1141603885
2,2,1,SFZA,2000-06-30,A,,49732336516,3078512556
3,3,1,SFZA,2000-12-31,A,,67227499769,4738883655
4,4,1,SFZA,2001-01-01,A,,66006167607,3517551493
5,5,1,SFZA,2001-06-30,A,,85181426762,4961824149


In [44]:
balance_sheet = balance_sheet[['Stock Code', 'Ending Date of Statistics', 'Total Assets', 'Total Shareholders’ Equity']]
balance_sheet.columns = ['Stkcd', 'date', 'total_assets', 'total_shareholders_equity']
balance_sheet = balance_sheet.iloc[1:, :]
balance_sheet.head(5)

Unnamed: 0,Stkcd,date,total_assets,total_shareholders_equity
2,1,2000-06-30,49732336516,3078512556
3,1,2000-12-31,67227499769,4738883655
4,1,2001-01-01,66006167607,3517551493
5,1,2001-06-30,85181426762,4961824149
6,1,2001-12-31,120126983351,3627668792


In [45]:
balance_sheet['date'] = pd.to_datetime(balance_sheet['date'])
balance_sheet['month'] = balance_sheet['date'].dt.to_period('M')
balance_sheet = balance_sheet[['Stkcd', 'month', 'total_assets', 'total_shareholders_equity']]
balance_sheet.head(5)

Unnamed: 0,Stkcd,month,total_assets,total_shareholders_equity
2,1,2000-06,49732336516,3078512556
3,1,2000-12,67227499769,4738883655
4,1,2001-01,66006167607,3517551493
5,1,2001-06,85181426762,4961824149
6,1,2001-12,120126983351,3627668792


In [46]:
monthly_stock_return = monthly_stock_return.merge(balance_sheet, left_on=['Stkcd', 'month'],
                                                  right_on=['Stkcd', 'month'], how='left')
monthly_stock_return['total_assets'] = monthly_stock_return['total_assets'].ffill().astype(
    np.float64)  # ffill means use the value of the last row to fill the missing value
monthly_stock_return['total_shareholders_equity'] = monthly_stock_return['total_shareholders_equity'].ffill().astype(
    np.float64)
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,S or B,total_assets,total_shareholders_equity
0,1,2000-01,0.061891,28755730000.0,B,,
1,1,2000-02,-0.011333,28429840000.0,B,,
2,1,2000-03,0.00273,28507430000.0,B,,
3,1,2000-04,0.037016,29562690000.0,B,,
4,1,2000-05,-0.055116,27933250000.0,B,,
5,1,2000-06,0.00722,28134990000.0,B,49732340000.0,3078513000.0
6,1,2000-07,0.020961,28724690000.0,B,49732340000.0,3078513000.0
7,1,2000-08,-0.041059,27545290000.0,B,49732340000.0,3078513000.0
8,1,2000-09,-0.044508,26319330000.0,B,49732340000.0,3078513000.0
9,1,2000-10,0.034786,27234920000.0,B,49732340000.0,3078513000.0


In [47]:
monthly_stock_return['BM ratio'] = monthly_stock_return['total_shareholders_equity'] / monthly_stock_return[
    'market_value']  # calculate the book-to-market ratio
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,S or B,total_assets,total_shareholders_equity,BM ratio
0,1,2000-01,0.061891,28755730000.0,B,,,
1,1,2000-02,-0.011333,28429840000.0,B,,,
2,1,2000-03,0.00273,28507430000.0,B,,,
3,1,2000-04,0.037016,29562690000.0,B,,,
4,1,2000-05,-0.055116,27933250000.0,B,,,
5,1,2000-06,0.00722,28134990000.0,B,49732340000.0,3078513000.0,0.109419
6,1,2000-07,0.020961,28724690000.0,B,49732340000.0,3078513000.0,0.107173
7,1,2000-08,-0.041059,27545290000.0,B,49732340000.0,3078513000.0,0.111762
8,1,2000-09,-0.044508,26319330000.0,B,49732340000.0,3078513000.0,0.116968
9,1,2000-10,0.034786,27234920000.0,B,49732340000.0,3078513000.0,0.113036


In [48]:
def H_N_or_L(x):
    return pd.qcut(x, q=[0, .3, .7, 1], labels=['H', 'N', 'L'])


monthly_stock_return['H, N or L'] = monthly_stock_return.groupby("month")['BM ratio'].transform(
    H_N_or_L)  # divide the stocks into 3 groups according to the book-to-market ratio

In [49]:
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,S or B,total_assets,total_shareholders_equity,BM ratio,"H, N or L"
0,1,2000-01,0.061891,28755730000.0,B,,,,
1,1,2000-02,-0.011333,28429840000.0,B,,,,
2,1,2000-03,0.00273,28507430000.0,B,,,,
3,1,2000-04,0.037016,29562690000.0,B,,,,
4,1,2000-05,-0.055116,27933250000.0,B,,,,
5,1,2000-06,0.00722,28134990000.0,B,49732340000.0,3078513000.0,0.109419,H
6,1,2000-07,0.020961,28724690000.0,B,49732340000.0,3078513000.0,0.107173,H
7,1,2000-08,-0.041059,27545290000.0,B,49732340000.0,3078513000.0,0.111762,H
8,1,2000-09,-0.044508,26319330000.0,B,49732340000.0,3078513000.0,0.116968,H
9,1,2000-10,0.034786,27234920000.0,B,49732340000.0,3078513000.0,0.113036,H


### now we can tag and calculate $SMB_{B/M}$

In [50]:
monthly_stock_return['SMB_{B/M}'] = monthly_stock_return['S or B'].astype(str) + monthly_stock_return[
    'H, N or L'].astype(str)  # combine the two tags
monthly_stock_return['SMB_{B/M}'] = monthly_stock_return['SMB_{B/M}'].astype('category')
monthly_stock_return = monthly_stock_return[
    ['Stkcd', 'month', 'monthly_stock_return', 'market_value', 'total_assets', 'total_shareholders_equity', 'BM ratio',
     'S or B', 'H, N or L', 'SMB_{B/M}']]
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,S or B,"H, N or L",SMB_{B/M}
0,1,2000-01,0.061891,28755730000.0,,,,B,,Bnan
1,1,2000-02,-0.011333,28429840000.0,,,,B,,Bnan
2,1,2000-03,0.00273,28507430000.0,,,,B,,Bnan
3,1,2000-04,0.037016,29562690000.0,,,,B,,Bnan
4,1,2000-05,-0.055116,27933250000.0,,,,B,,Bnan
5,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,B,H,BH
6,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,B,H,BH
7,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,B,H,BH
8,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,B,H,BH
9,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,B,H,BH


In [51]:
# get weighted retrun of each SMB_{B/M} group
source_group = monthly_stock_return.groupby(['month', 'SMB_{B/M}'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_return'], weights=x['market_value']), include_groups=False).unstack()
source_group.fillna(0, inplace=True)
source_group.reset_index(inplace=True)
source_group.set_index('month', inplace=True)
source_group = source_group.rename_axis('SMB_{B/M}').rename_axis('month', axis=1)
source_group['SMB_{B/M}'] = (source_group['SH'] + source_group['SN'] + source_group['SL']) / 3 - (
        source_group['BH'] + source_group['BN'] + source_group['BL']) / 3
source_group.head(10)

month,BH,BL,BN,Bnan,SH,SL,SN,SMB_{B/M}
SMB_{B/M},Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-01,0.211263,0.0,0.206242,0.061891,0.115403,0.094549,0.096189,-0.037121
2000-02,0.199451,0.0,0.133699,-0.011333,0.12414,0.106186,0.096888,-0.001979
2000-03,0.083746,0.0,0.108042,0.00273,0.127617,0.125207,0.10897,0.056669
2000-04,0.055935,0.0,0.051127,0.037016,0.001406,0.024647,0.026939,-0.018023
2000-05,0.032373,2.551433,0.065737,-0.055116,0.038882,0.053875,0.074114,-0.827558
2000-06,0.064659,0.0,0.067178,0.0,0.015243,0.056787,0.165477,0.035223
2000-07,0.046614,0.762136,0.119292,0.0,0.060891,0.107996,0.058015,-0.233713
2000-08,-0.009001,1.128686,0.005443,0.0,0.01323,0.050118,0.018193,-0.347862
2000-09,-0.040867,-0.037499,-0.03553,0.0,-0.028955,0.007756,-0.033494,0.019734
2000-10,0.02232,0.249042,0.031048,0.0,0.039126,0.062941,0.049018,-0.050441


### now we can tag and calculate $SMB_{OP}$

OP: operating profitabily, which can be measured by ROE, so we need net income and stockholders' equity to calculate ROE.
reference: [ff5](https://bigquant.com/wiki/doc/yinzi-moxing-hiQfGBEwbG)

In [52]:
income_statement = pd.read_excel(
    'source_data/Income Statement/FS_Comins.xlsx')  # load the income statement data, a little time-consuming, 23s
income_statement = income_statement[income_statement['Typrep'] == 'A']
income_statement = income_statement.iloc[2:, :]
income_statement = income_statement[['Stkcd', 'Accper', 'B002000000']]
income_statement.columns = ['Stkcd', 'date', 'net_income']
income_statement['date'] = pd.to_datetime(income_statement['date'])
income_statement['month'] = income_statement['date'].dt.to_period('M')
income_statement = income_statement[['Stkcd', 'month', 'net_income']]
income_statement.head(5)

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,Stkcd,month,net_income
4,1,2000-12,506551785
5,1,2001-01,462975563
6,1,2001-06,223211685
7,1,2001-12,402360428
8,1,2002-01,402360428


In [53]:
balance_sheet.head(5)

Unnamed: 0,Stkcd,month,total_assets,total_shareholders_equity
2,1,2000-06,49732336516,3078512556
3,1,2000-12,67227499769,4738883655
4,1,2001-01,66006167607,3517551493
5,1,2001-06,85181426762,4961824149
6,1,2001-12,120126983351,3627668792


In [54]:
# merge balance sheet and income statement data
balance_sheet = balance_sheet.merge(income_statement, on=['Stkcd', 'month'], how='left')
balance_sheet['net_income'] = balance_sheet['net_income'].ffill().astype(np.float64)
balance_sheet['total_shareholders_equity'] = balance_sheet['total_shareholders_equity'].astype(np.float64)
balance_sheet['ROE'] = balance_sheet['net_income'] / balance_sheet['total_shareholders_equity']
balance_sheet['ROE'] = balance_sheet['ROE'].replace([np.inf, -np.inf], np.nan)
balance_sheet['ROE'] = balance_sheet['ROE'].fillna(0)
balance_sheet.head(5)

  balance_sheet['net_income'] = balance_sheet['net_income'].ffill().astype(np.float64)


Unnamed: 0,Stkcd,month,total_assets,total_shareholders_equity,net_income,ROE
0,1,2000-06,49732336516,3078513000.0,,0.0
1,1,2000-12,67227499769,4738884000.0,506551785.0,0.106893
2,1,2001-01,66006167607,3517551000.0,462975563.0,0.131619
3,1,2001-06,85181426762,4961824000.0,223211685.0,0.044986
4,1,2001-12,120126983351,3627669000.0,402360428.0,0.110914


In [55]:
def R_N_or_W(x):
    edge1 = x.quantile(.3)
    edge2 = x.quantile(.7)
    tmp = x.apply(lambda y: 'R' if y > edge2 else 'W' if y < edge1 else 'N')
    return tmp


balance_sheet['R, N or W'] = balance_sheet.groupby("month")['ROE'].transform(
    R_N_or_W)  # divide the stocks into 3 groups according to the ROE
balance_sheet['R, N or W'] = balance_sheet['R, N or W'].astype('category')
balance_sheet.head(10)

Unnamed: 0,Stkcd,month,total_assets,total_shareholders_equity,net_income,ROE,"R, N or W"
0,1,2000-06,49732336516,3078513000.0,,0.0,W
1,1,2000-12,67227499769,4738884000.0,506551785.0,0.106893,W
2,1,2001-01,66006167607,3517551000.0,462975563.0,0.131619,W
3,1,2001-06,85181426762,4961824000.0,223211685.0,0.044986,W
4,1,2001-12,120126983351,3627669000.0,402360428.0,0.110914,W
5,1,2002-01,120126983351,3627669000.0,402360428.0,0.110914,W
6,1,2002-03,110688552564,3811916000.0,183951144.0,0.048257,W
7,1,2002-06,134689020270,3941856000.0,314141342.0,0.079694,W
8,1,2002-09,160021970844,4058194000.0,419642032.0,0.103406,W
9,1,2002-12,166166379400,3768021000.0,432224930.0,0.114709,W


In [56]:
# monthly_stock_return.drop(columns=['ROE_x','ROE_y','BM ratio', 'H, N or L'], inplace=True)
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,S or B,"H, N or L",SMB_{B/M}
0,1,2000-01,0.061891,28755730000.0,,,,B,,Bnan
1,1,2000-02,-0.011333,28429840000.0,,,,B,,Bnan
2,1,2000-03,0.00273,28507430000.0,,,,B,,Bnan
3,1,2000-04,0.037016,29562690000.0,,,,B,,Bnan
4,1,2000-05,-0.055116,27933250000.0,,,,B,,Bnan
5,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,B,H,BH
6,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,B,H,BH
7,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,B,H,BH
8,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,B,H,BH
9,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,B,H,BH


In [57]:
# merge ROE information into monthly_stock_return
monthly_stock_return = monthly_stock_return.merge(balance_sheet[['Stkcd', 'month', 'ROE', 'R, N or W']],
                                                  left_on=['Stkcd', 'month'], right_on=['Stkcd', 'month'], how='left')
monthly_stock_return['ROE'] = monthly_stock_return['ROE'].ffill()
monthly_stock_return['R, N or W'] = monthly_stock_return['R, N or W'].ffill()
monthly_stock_return.dropna(inplace=True)
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,S or B,"H, N or L",SMB_{B/M},ROE,"R, N or W"
5,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,B,H,BH,0.0,W
6,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,B,H,BH,0.0,W
7,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,B,H,BH,0.0,W
8,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,B,H,BH,0.0,W
9,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,B,H,BH,0.0,W
10,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,B,H,BH,0.0,W
11,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,B,H,BH,0.106893,W
12,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,B,H,BH,0.131619,W
13,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,B,H,BH,0.131619,W
14,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,B,H,BH,0.131619,W


In [58]:
monthly_stock_return['SMB_{OP}'] = monthly_stock_return['S or B'].astype(str) + monthly_stock_return[
    'R, N or W'].astype(str)  # combine the two tags
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,S or B,"H, N or L",SMB_{B/M},ROE,"R, N or W",SMB_{OP}
5,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,B,H,BH,0.0,W,BW
6,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,B,H,BH,0.0,W,BW
7,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,B,H,BH,0.0,W,BW
8,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,B,H,BH,0.0,W,BW
9,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,B,H,BH,0.0,W,BW
10,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,B,H,BH,0.0,W,BW
11,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,B,H,BH,0.106893,W,BW
12,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,B,H,BH,0.131619,W,BW
13,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,B,H,BH,0.131619,W,BW
14,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,B,H,BH,0.131619,W,BW


In [59]:
temp = monthly_stock_return.groupby(['month', 'SMB_{OP}']).apply(
    lambda x: np.average(x['monthly_stock_return'], weights=x['market_value']), include_groups=True).unstack().fillna(0)
temp.columns = ['BN(of OP)', 'BR', 'BW', 'SN(of OP)', 'SR', 'SW']
source_group = pd.concat([source_group, temp], axis=1)
source_group.head(5)

  temp = monthly_stock_return.groupby(['month', 'SMB_{OP}']).apply(


Unnamed: 0,BH,BL,BN,Bnan,SH,SL,SN,SMB_{B/M},BN(of OP),BR,BW,SN(of OP),SR,SW
2000-01,0.211263,0.0,0.206242,0.061891,0.115403,0.094549,0.096189,-0.037121,0.414422,0.0,0.193652,0.041702,0.156997,0.103435
2000-02,0.199451,0.0,0.133699,-0.011333,0.12414,0.106186,0.096888,-0.001979,0.147461,0.0,0.155108,0.11089,0.06932,0.111767
2000-03,0.083746,0.0,0.108042,0.00273,0.127617,0.125207,0.10897,0.056669,0.007946,0.0,0.106191,0.218151,0.313105,0.121351
2000-04,0.055935,0.0,0.051127,0.037016,0.001406,0.024647,0.026939,-0.018023,0.036121,0.0,0.05372,0.031223,0.042016,0.016209
2000-05,0.032373,2.551433,0.065737,-0.055116,0.038882,0.053875,0.074114,-0.827558,0.018638,0.0,0.06881,-0.030789,-0.104839,0.053122


In [60]:
# source_group.drop(columns=['BN', 'BR', 'BW', 'SN(of OP)', 'SR', 'SW'], inplace=True)

In [61]:
source_group['SMB_{OP}'] = (source_group['SR'] + source_group['SN(of OP)'] + source_group['SW']) / 3 - (
            source_group['BR'] + source_group['BN(of OP)'] + source_group['BW']) / 3

In [62]:
source_group.head(5)

Unnamed: 0,BH,BL,BN,Bnan,SH,SL,SN,SMB_{B/M},BN(of OP),BR,BW,SN(of OP),SR,SW,SMB_{OP}
2000-01,0.211263,0.0,0.206242,0.061891,0.115403,0.094549,0.096189,-0.037121,0.414422,0.0,0.193652,0.041702,0.156997,0.103435,-0.10198
2000-02,0.199451,0.0,0.133699,-0.011333,0.12414,0.106186,0.096888,-0.001979,0.147461,0.0,0.155108,0.11089,0.06932,0.111767,-0.003531
2000-03,0.083746,0.0,0.108042,0.00273,0.127617,0.125207,0.10897,0.056669,0.007946,0.0,0.106191,0.218151,0.313105,0.121351,0.17949
2000-04,0.055935,0.0,0.051127,0.037016,0.001406,0.024647,0.026939,-0.018023,0.036121,0.0,0.05372,0.031223,0.042016,0.016209,-0.000131
2000-05,0.032373,2.551433,0.065737,-0.055116,0.038882,0.053875,0.074114,-0.827558,0.018638,0.0,0.06881,-0.030789,-0.104839,0.053122,-0.056651


### now we can tag and calculate $SMB_{INV}$

In [63]:
def C_N_or_A(x):
    edge1 = x.quantile(.3)
    edge2 = x.quantile(.7)
    tmp = x.apply(lambda y: 'C' if y > edge2 else 'A' if y < edge1 else 'N')
    return tmp


balance_sheet['total_assets'] = balance_sheet['total_assets'].astype(np.float64)
balance_sheet['total_assets'] = balance_sheet['total_assets'].fillna(0)
balance_sheet['assets_increasing_rate'] = balance_sheet.groupby('Stkcd')[
    'total_assets'].pct_change()  # calculate the increasing rate of total assets
balance_sheet['assets_increasing_rate'] = balance_sheet['assets_increasing_rate'].replace([np.inf, -np.inf], np.nan)
balance_sheet.head(5)

Unnamed: 0,Stkcd,month,total_assets,total_shareholders_equity,net_income,ROE,"R, N or W",assets_increasing_rate
0,1,2000-06,49732340000.0,3078513000.0,,0.0,W,
1,1,2000-12,67227500000.0,4738884000.0,506551785.0,0.106893,W,0.351786
2,1,2001-01,66006170000.0,3517551000.0,462975563.0,0.131619,W,-0.018167
3,1,2001-06,85181430000.0,4961824000.0,223211685.0,0.044986,W,0.290507
4,1,2001-12,120127000000.0,3627669000.0,402360428.0,0.110914,W,0.410249


In [64]:
balance_sheet['C, N or A'] = balance_sheet.groupby("month")['assets_increasing_rate'].transform(
    C_N_or_A)  # divide the stocks into 3 groups  according to the total assets
balance_sheet['C, N or A'] = balance_sheet['C, N or A'].astype('category')

In [65]:
# merge SMB_{INV} information into monthly_stock_return
monthly_stock_return = monthly_stock_return.merge(
    balance_sheet[['Stkcd', 'month', 'assets_increasing_rate', 'C, N or A']],
    left_on=['Stkcd', 'month'], right_on=['Stkcd', 'month'], how='left')
monthly_stock_return['C, N or A'] = monthly_stock_return['C, N or A'].ffill()
monthly_stock_return['assets_increasing_rate'] = monthly_stock_return['assets_increasing_rate'].ffill()
monthly_stock_return['SMB_{INV}'] = monthly_stock_return['S or B'].astype(str) + monthly_stock_return[
    'C, N or A'].astype(str)  # combine the two tags
monthly_stock_return['SMB_{INV}'] = monthly_stock_return['SMB_{INV}'].astype('category')

In [66]:
monthly_stock_return['SMB_{INV}'] = monthly_stock_return['S or B'].astype(str) + monthly_stock_return[
    'C, N or A'].astype(str)  # combine the two tags
monthly_stock_return['SMB_{INV}'] = monthly_stock_return['SMB_{INV}'].astype('category')
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,S or B,"H, N or L",SMB_{B/M},ROE,"R, N or W",SMB_{OP},assets_increasing_rate,"C, N or A",SMB_{INV}
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,B,H,BH,0.0,W,BW,,N,BN
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,B,H,BH,0.0,W,BW,,N,BN
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,B,H,BH,0.0,W,BW,,N,BN
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,B,H,BH,0.0,W,BW,,N,BN
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,B,H,BH,0.0,W,BW,,N,BN
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,B,H,BH,0.0,W,BW,,N,BN
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,B,H,BH,0.106893,W,BW,0.351786,C,BC
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,B,H,BH,0.131619,W,BW,-0.018167,N,BN
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,B,H,BH,0.131619,W,BW,-0.018167,N,BN
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,B,H,BH,0.131619,W,BW,-0.018167,N,BN


In [67]:
temp = monthly_stock_return.groupby(['month', 'SMB_{INV}']).apply(
    lambda x: np.average(x['monthly_stock_return'], weights=x['market_value']), include_groups=True).unstack().fillna(0)
temp.columns = ['BA', 'BC', 'BN(of INV)', 'SA', 'SC', 'SN(of INV)']
source_group = pd.concat([source_group, temp], axis=1)

  temp = monthly_stock_return.groupby(['month', 'SMB_{INV}']).apply(
  temp = monthly_stock_return.groupby(['month', 'SMB_{INV}']).apply(


In [68]:
source_group['SMB_{INV}'] = (source_group['SC'] + source_group['SN(of INV)'] + source_group['SA']) / 3 - (
            source_group['BC'] + source_group['BN(of INV)'] + source_group['BA']) / 3
source_group.head(10)

Unnamed: 0,BH,BL,BN,Bnan,SH,SL,SN,SMB_{B/M},BN(of OP),BR,...,SR,SW,SMB_{OP},BA,BC,BN(of INV),SA,SC,SN(of INV),SMB_{INV}
2000-01,0.211263,0.0,0.206242,0.061891,0.115403,0.094549,0.096189,-0.037121,0.414422,0.0,...,0.156997,0.103435,-0.10198,0.0,0.0,0.207798,0.0,0.0,0.102937,-0.034953
2000-02,0.199451,0.0,0.133699,-0.011333,0.12414,0.106186,0.096888,-0.001979,0.147461,0.0,...,0.06932,0.111767,-0.003531,0.0,0.0,0.154618,0.0,0.0,0.111645,-0.014324
2000-03,0.083746,0.0,0.108042,0.00273,0.127617,0.125207,0.10897,0.056669,0.007946,0.0,...,0.313105,0.121351,0.17949,0.0,0.0,0.10038,0.0,0.0,0.123342,0.007654
2000-04,0.055935,0.0,0.051127,0.037016,0.001406,0.024647,0.026939,-0.018023,0.036121,0.0,...,0.042016,0.016209,-0.000131,0.0,0.0,0.05268,0.0,0.0,0.016503,-0.012059
2000-05,0.032373,2.551433,0.065737,-0.055116,0.038882,0.053875,0.074114,-0.827558,0.018638,0.0,...,-0.104839,0.053122,-0.056651,2.956962,0.0,0.05851,0.0,0.0,0.051324,-0.988049
2000-06,0.064659,0.0,0.067178,0.0,0.015243,0.056787,0.165477,0.035223,0.027486,0.039618,...,0.0,0.061802,-0.019931,0.07198,0.038223,0.069313,0.016244,0.01612,0.072781,-0.02479
2000-07,0.046614,0.762136,0.119292,0.0,0.060891,0.107996,0.058015,-0.233713,0.055871,0.124836,...,0.0,0.082369,-0.035895,0.04849,0.042562,0.104138,0.100631,0.040962,0.083616,0.010006
2000-08,-0.009001,1.128686,0.005443,0.0,0.01323,0.050118,0.018193,-0.347862,0.041569,0.021029,...,0.0,0.032824,-0.021999,0.052715,0.007337,-0.000223,0.040489,0.008693,0.032281,0.007211
2000-09,-0.040867,-0.037499,-0.03553,0.0,-0.028955,0.007756,-0.033494,0.019734,0.007559,-0.067505,...,0.0,-0.01431,0.058338,-0.008295,0.003476,-0.044249,-0.005709,-0.036828,-0.010759,-0.001409
2000-10,0.02232,0.249042,0.031048,0.0,0.039126,0.062941,0.049018,-0.050441,-0.000529,0.034356,...,0.0,0.052404,-0.000465,0.031539,0.000516,0.03176,0.037992,0.025937,0.057438,0.019185


### we can calculate all the five factors now

In [69]:
tmp = source_group.copy()
tmp.reset_index(inplace=True)
tmp.columns = ['month', ] + list(tmp.columns[1:])

In [70]:
pd.merge(tmp, mkt_risk_premium, left_on='month', right_on='month', how='left')
# tmp.head(5)

Unnamed: 0,month,BH,BL,BN,Bnan,SH,SL,SN,SMB_{B/M},BN(of OP),...,SW,SMB_{OP},BA,BC,BN(of INV),SA,SC,SN(of INV),SMB_{INV},mkt_risk_premium
0,2000-01,0.211263,0.000000,0.206242,0.061891,0.115403,0.094549,0.096189,-0.037121,0.414422,...,0.103435,-0.101980,0.000000,0.000000,0.207798,0.000000,0.000000,0.102937,-0.034953,-0.244185
1,2000-02,0.199451,0.000000,0.133699,-0.011333,0.124140,0.106186,0.096888,-0.001979,0.147461,...,0.111767,-0.003531,0.000000,0.000000,0.154618,0.000000,0.000000,0.111645,-0.014324,-0.143940
2,2000-03,0.083746,0.000000,0.108042,0.002730,0.127617,0.125207,0.108970,0.056669,0.007946,...,0.121351,0.179490,0.000000,0.000000,0.100380,0.000000,0.000000,0.123342,0.007654,-0.373996
3,2000-04,0.055935,0.000000,0.051127,0.037016,0.001406,0.024647,0.026939,-0.018023,0.036121,...,0.016209,-0.000131,0.000000,0.000000,0.052680,0.000000,0.000000,0.016503,-0.012059,-0.357240
4,2000-05,0.032373,2.551433,0.065737,-0.055116,0.038882,0.053875,0.074114,-0.827558,0.018638,...,0.053122,-0.056651,2.956962,0.000000,0.058510,0.000000,0.000000,0.051324,-0.988049,-0.316809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,2023-08,-0.041920,-0.022058,-0.042691,0.000000,-0.043134,-0.026326,-0.029317,0.002630,-0.042835,...,-0.028439,-0.032389,-0.053714,-0.048107,-0.041682,-0.044363,-0.047758,-0.028133,0.007750,-0.333879
284,2023-09,-0.004602,-0.065936,0.001356,0.000000,-0.004402,-0.000646,-0.006988,0.019049,-0.020177,...,-0.003992,-0.081352,0.006729,-0.007207,-0.003997,-0.008652,-0.024627,-0.003766,-0.010857,-0.269051
285,2023-10,-0.019342,-0.119236,0.000550,0.000000,-0.010500,-0.014164,-0.004719,0.036215,-0.013697,...,-0.009262,0.093917,-0.014914,-0.005308,-0.017132,-0.019492,0.008747,-0.009156,0.005818,-0.243563
286,2023-11,0.002887,0.006154,0.037692,0.000000,0.046746,0.079592,0.049758,0.043121,0.019603,...,0.063311,0.025266,0.013875,0.061863,0.006694,0.052726,0.040806,0.063502,0.024868,-0.280325


In [71]:
source_group.reset_index(inplace=True)
source_group.columns = ['month', ] + list(source_group.columns[1:])
source_group = source_group.merge(mkt_risk_premium, left_on='month', right_on='month', how='left')
source_group['SMB'] = (source_group['SMB_{B/M}'] + source_group['SMB_{OP}'] + source_group['SMB_{INV}']) / 3
source_group['HML'] = (source_group['SH'] + source_group['BH']) / 2 - (source_group['SL'] + source_group['BL']) / 2
source_group['RMW'] = (source_group['SR'] + source_group['BR']) / 2 - (source_group['SW'] + source_group['BW']) / 2
source_group['CMA'] = (source_group['SC'] + source_group['BC']) / 2 - (source_group['SA'] + source_group['BA']) / 2
source_group.head(10)

Unnamed: 0,month,BH,BL,BN,Bnan,SH,SL,SN,SMB_{B/M},BN(of OP),...,BN(of INV),SA,SC,SN(of INV),SMB_{INV},mkt_risk_premium,SMB,HML,RMW,CMA
0,2000-01,0.211263,0.0,0.206242,0.061891,0.115403,0.094549,0.096189,-0.037121,0.414422,...,0.207798,0.0,0.0,0.102937,-0.034953,-0.244185,-0.058018,0.116059,-0.070045,0.0
1,2000-02,0.199451,0.0,0.133699,-0.011333,0.12414,0.106186,0.096888,-0.001979,0.147461,...,0.154618,0.0,0.0,0.111645,-0.014324,-0.14394,-0.006611,0.108703,-0.098777,0.0
2,2000-03,0.083746,0.0,0.108042,0.00273,0.127617,0.125207,0.10897,0.056669,0.007946,...,0.10038,0.0,0.0,0.123342,0.007654,-0.373996,0.081271,0.043078,0.042781,0.0
3,2000-04,0.055935,0.0,0.051127,0.037016,0.001406,0.024647,0.026939,-0.018023,0.036121,...,0.05268,0.0,0.0,0.016503,-0.012059,-0.35724,-0.010071,0.016347,-0.013956,0.0
4,2000-05,0.032373,2.551433,0.065737,-0.055116,0.038882,0.053875,0.074114,-0.827558,0.018638,...,0.05851,0.0,0.0,0.051324,-0.988049,-0.316809,-0.624086,-1.267026,-0.113386,-1.478481
5,2000-06,0.064659,0.0,0.067178,0.0,0.015243,0.056787,0.165477,0.035223,0.027486,...,0.069313,0.016244,0.01612,0.072781,-0.02479,-0.377593,-0.003166,0.011558,-0.044941,-0.01694
6,2000-07,0.046614,0.762136,0.119292,0.0,0.060891,0.107996,0.058015,-0.233713,0.055871,...,0.104138,0.100631,0.040962,0.083616,0.010006,-0.353087,-0.086534,-0.381313,-0.026827,-0.032799
7,2000-08,-0.009001,1.128686,0.005443,0.0,0.01323,0.050118,0.018193,-0.347862,0.041569,...,-0.000223,0.040489,0.008693,0.032281,0.007211,-0.41084,-0.120883,-0.587288,-0.007072,-0.038587
8,2000-09,-0.040867,-0.037499,-0.03553,0.0,-0.028955,0.007756,-0.033494,0.019734,0.007559,...,-0.044249,-0.005709,-0.036828,-0.010759,-0.001409,-0.408768,0.025554,-0.02004,-0.00711,-0.009674
9,2000-10,0.02232,0.249042,0.031048,0.0,0.039126,0.062941,0.049018,-0.050441,-0.000529,...,0.03176,0.037992,0.025937,0.057438,0.019185,-0.301681,-0.010574,-0.125269,-0.023961,-0.021539


In [72]:
# source_group.to_feather(r'temp/source_group1_in_part1.feather')  # save the result to a feather file

In [73]:
# now we have cleaned 5-factors
factors = source_group[['month', 'mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']]
factors.head(10)

Unnamed: 0,month,mkt_risk_premium,SMB,HML,RMW,CMA
0,2000-01,-0.244185,-0.058018,0.116059,-0.070045,0.0
1,2000-02,-0.14394,-0.006611,0.108703,-0.098777,0.0
2,2000-03,-0.373996,0.081271,0.043078,0.042781,0.0
3,2000-04,-0.35724,-0.010071,0.016347,-0.013956,0.0
4,2000-05,-0.316809,-0.624086,-1.267026,-0.113386,-1.478481
5,2000-06,-0.377593,-0.003166,0.011558,-0.044941,-0.01694
6,2000-07,-0.353087,-0.086534,-0.381313,-0.026827,-0.032799
7,2000-08,-0.41084,-0.120883,-0.587288,-0.007072,-0.038587
8,2000-09,-0.408768,0.025554,-0.02004,-0.00711,-0.009674
9,2000-10,-0.301681,-0.010574,-0.125269,-0.023961,-0.021539


### calculate the excess return of each stock

the excess return of each stock is the return of the stock minus the risk-free rate, and this step is the same as the one in 1.3.2

In [74]:
"""
calculate the excess return of each stock
tmp_return is originally defined in 1.3.2
we just need to substitute 5-factor data from variable "factors"
"""
tmp_return.drop(columns=['risk_premium', 'smb', 'hml', 'rmw', 'cma'], inplace=True)
tmp_return.head(5)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,rf,excess_return
0,1,2000-01,0.061891,0.0225,0.039391
1,1,2000-02,-0.011333,0.0225,-0.033833
2,1,2000-03,0.002729,0.0225,-0.019771
3,1,2000-04,0.037017,0.0225,0.014517
4,1,2000-05,-0.055118,0.0225,-0.077618


In [75]:
tmp_return.shape

(730499, 5)

In [76]:
factors[factors['month'] == '2000-01'][['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']]

Unnamed: 0,mkt_risk_premium,SMB,HML,RMW,CMA
0,-0.244185,-0.058018,0.116059,-0.070045,0.0


In [77]:
# 3min20s
a = tmp_return['Trdmnt'].apply(
    lambda x: factors[factors['month'] == x][['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']])
tmp_return.loc[:, ['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']] = pd.concat(a.tolist(), ignore_index=True)
tmp_return.head(5)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,rf,excess_return,mkt_risk_premium,SMB,HML,RMW,CMA
0,1,2000-01,0.061891,0.0225,0.039391,-0.244185,-0.058018,0.116059,-0.070045,0.0
1,1,2000-02,-0.011333,0.0225,-0.033833,-0.14394,-0.006611,0.108703,-0.098777,0.0
2,1,2000-03,0.002729,0.0225,-0.019771,-0.373996,0.081271,0.043078,0.042781,0.0
3,1,2000-04,0.037017,0.0225,0.014517,-0.35724,-0.010071,0.016347,-0.013956,0.0
4,1,2000-05,-0.055118,0.0225,-0.077618,-0.316809,-0.624086,-1.267026,-0.113386,-1.478481


In [78]:
# tmp_return.loc[:,['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']] = pd.concat(a.tolist(), ignore_index=True)
# tmp_return.head(5)

In [79]:
def regress(data):
    X = data[['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']]
    X = sm.add_constant(X)
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X.fillna(0, inplace=True)
    y = data['excess_return']
    model = sm.OLS(np.array(y.astype(float)), X.astype(float))
    result = model.fit()
    params_and_tvalues = np.append(result.params, result.tvalues)
    return params_and_tvalues


betas = (tmp_return.groupby('Stkcd').apply(regress, include_groups=False)).apply(
    pd.Series)  # calculate the correlation between the stock return and the risk premium
betas.columns = ['const', 'risk_premium', 'smb', 'hml', 'rmw', 'cma', 't-const', 't-risk_premium', 't-smb', 't-hml',
                 't-rmw', 't-cma']

In [80]:
betas.head(5)

Unnamed: 0_level_0,const,risk_premium,smb,hml,rmw,cma,t-const,t-risk_premium,t-smb,t-hml,t-rmw,t-cma
Stkcd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.106799,0.352581,-0.246816,0.203954,-0.126684,-0.059273,5.427122,6.595302,-2.701947,3.03853,-2.840439,-1.128108
2,0.024158,0.083394,-0.05912,0.065337,-0.009886,-0.004131,0.929562,1.186076,-0.494588,0.742945,-0.169736,-0.060027
3,0.087967,0.402634,-0.599566,0.129124,-0.108789,0.017803,0.451073,0.685902,-0.542251,0.352647,-0.148532,0.045401
4,0.021257,0.098092,-0.253064,0.007229,-0.013899,-0.099516,0.693963,1.185896,-1.805269,0.06993,-0.203288,-1.234924
5,-0.032258,-0.054857,-0.153768,-0.020993,0.035273,0.005831,-0.696522,-0.427856,-0.712271,-0.135344,0.310853,0.046913


so in variable 'betas' above we can have the excess return of each single stock.

However, in Fama and French (2016), they use sorted portfolios to examine, (in Table 5 P9 in the paper), so we need to construct these portfolios.

## GRS test

![grs.png](img/GRS_test.jpg)

firstly, we need to construct the portfolios

In [81]:
def five_division(x):
    edge1 = x.quantile(.2)
    edge2 = x.quantile(.4)
    edge3 = x.quantile(.6)
    edge4 = x.quantile(.8)
    tmp = x.apply(lambda y: '1' if y < edge1 else '2' if y < edge2 else '3' if y < edge3 else '4' if y < edge4 else '5')
    return tmp


def four_division(x):
    edge1 = x.quantile(.25)
    edge2 = x.quantile(.5)
    edge3 = x.quantile(.75)
    tmp = x.apply(lambda y: '1' if y < edge1 else '2' if y < edge2 else '3' if y < edge3 else '4')
    return tmp


def add_rank(monthly_stock_return):
    # add 25 size-BM portfolio rank, just like in Table 5 in the paper
    monthly_stock_return['ordered_size'] = monthly_stock_return.groupby('month')['market_value'].transform(
        lambda x: pd.qcut(x, 5, labels=["1", "2", "3", "4", "5"]))
    monthly_stock_return['ordered_BM'] = monthly_stock_return.groupby('month')['BM ratio'].transform(
        lambda x: pd.qcut(x, 5, labels=["1", "2", "3", "4", "5"]))
    monthly_stock_return['size-BM-rank'] = monthly_stock_return['ordered_size'].astype(str) + monthly_stock_return[
        'ordered_BM'].astype(str)
    monthly_stock_return['size-BM-rank'] = monthly_stock_return['size-BM-rank'].astype('category')

    # add 25 size-OP portfolio rank
    monthly_stock_return['ordered_OP'] = monthly_stock_return.groupby('month')['ROE'].transform(five_division)
    monthly_stock_return['size-OP-rank'] = monthly_stock_return['ordered_size'].astype(str) + monthly_stock_return[
        'ordered_OP'].astype(str)
    monthly_stock_return['size-OP-rank'] = monthly_stock_return['size-OP-rank'].astype('category')

    # add 25 size-INV portfolio rank
    monthly_stock_return['ordered_INV'] = monthly_stock_return.groupby('month')['assets_increasing_rate'].transform(
        five_division)
    monthly_stock_return['size-INV-rank'] = monthly_stock_return['ordered_size'].astype(str) + monthly_stock_return[
        'ordered_INV'].astype(str)
    monthly_stock_return['size-INV-rank'] = monthly_stock_return['size-INV-rank'].astype('category')

    # add 32 size-BM-OP portfolio rank
    monthly_stock_return['ordered_BM_for_32'] = monthly_stock_return.groupby('month')['BM ratio'].transform(
        four_division)
    monthly_stock_return['ordered_OP_for_32'] = monthly_stock_return.groupby('month')['ROE'].transform(four_division)
    monthly_stock_return['size-BM-OP-rank'] = monthly_stock_return['S or B'].astype(str) + monthly_stock_return[
        'ordered_BM_for_32'].astype(str) + monthly_stock_return['ordered_OP_for_32'].astype(str)
    monthly_stock_return['size-BM-OP-rank'] = monthly_stock_return['size-BM-OP-rank'].astype('category')

    # add 32 size-BM-INV portfolio rank
    monthly_stock_return['ordered_INV_for_32'] = monthly_stock_return.groupby('month')[
        'assets_increasing_rate'].transform(four_division)
    monthly_stock_return['size-BM-INV-rank'] = monthly_stock_return['S or B'].astype(str) + monthly_stock_return[
        'ordered_BM_for_32'].astype(str) + monthly_stock_return['ordered_INV_for_32'].astype(str)
    monthly_stock_return['size-BM-INV-rank'] = monthly_stock_return['size-BM-INV-rank'].astype('category')

    # add 32 size-OP-INV portfolio rank
    monthly_stock_return['size-OP-INV-rank'] = monthly_stock_return['S or B'].astype(str) + monthly_stock_return[
        'ordered_OP_for_32'].astype(str) + monthly_stock_return['ordered_INV_for_32'].astype(str)
    monthly_stock_return['size-OP-INV-rank'] = monthly_stock_return['size-OP-INV-rank'].astype('category')

    return monthly_stock_return


monthly_stock_return = add_rank(monthly_stock_return)
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,S or B,"H, N or L",SMB_{B/M},...,ordered_OP,size-OP-rank,ordered_INV,size-INV-rank,ordered_BM_for_32,ordered_OP_for_32,size-BM-OP-rank,ordered_INV_for_32,size-BM-INV-rank,size-OP-INV-rank
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,B,H,BH,...,1,51,5,55,1,1,B11,4,B14,B14
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,B,H,BH,...,1,51,5,55,1,1,B11,4,B14,B14
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,B,H,BH,...,1,51,5,55,1,1,B11,4,B14,B14
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,B,H,BH,...,1,51,5,55,1,1,B11,4,B14,B14
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,B,H,BH,...,1,51,5,55,1,1,B11,4,B14,B14
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,B,H,BH,...,1,51,5,55,1,1,B11,4,B14,B14
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,B,H,BH,...,5,55,5,55,1,4,B14,4,B14,B44
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,B,H,BH,...,5,55,1,51,1,4,B14,1,B11,B41
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,B,H,BH,...,5,55,1,51,1,4,B14,1,B11,B41
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,B,H,BH,...,5,55,1,51,1,4,B14,1,B11,B41


In [82]:
# add rf to monthly_stock_return and get monthly_stock_excess_return
# a little time-consuming, about 2min30s
monthly_stock_return['rf'] = monthly_stock_return['month'].apply(
    lambda x: rf_monthly[rf_monthly['month'] == x]['rf'].values[0])
monthly_stock_return['monthly_stock_excess_return'] = monthly_stock_return['monthly_stock_return'] - \
                                                      monthly_stock_return['rf']
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,S or B,"H, N or L",SMB_{B/M},...,ordered_INV,size-INV-rank,ordered_BM_for_32,ordered_OP_for_32,size-BM-OP-rank,ordered_INV_for_32,size-BM-INV-rank,size-OP-INV-rank,rf,monthly_stock_excess_return
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,B,H,BH,...,5,55,1,1,B11,4,B14,B14,0.0225,-0.01528
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,B,H,BH,...,5,55,1,1,B11,4,B14,B14,0.0225,-0.001539
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,B,H,BH,...,5,55,1,1,B11,4,B14,B14,0.0225,-0.063559
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,B,H,BH,...,5,55,1,1,B11,4,B14,B14,0.0225,-0.067008
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,B,H,BH,...,5,55,1,1,B11,4,B14,B14,0.0225,0.012286
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,B,H,BH,...,5,55,1,1,B11,4,B14,B14,0.0225,-0.012097
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,B,H,BH,...,5,55,1,4,B14,4,B14,B44,0.0225,-0.085123
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,B,H,BH,...,1,51,1,4,B14,1,B11,B41,0.0225,0.009179
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,B,H,BH,...,1,51,1,4,B14,1,B11,B41,0.0225,-0.081911
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,B,H,BH,...,1,51,1,4,B14,1,B11,B41,0.0225,0.128673


In [83]:
# get portfolios' monthly value-weighted return
size_bm_excess_return = monthly_stock_return.groupby(['month', 'size-BM-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_op_excess_return = monthly_stock_return.groupby(['month', 'size-OP-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_inv_excess_return = monthly_stock_return.groupby(['month', 'size-INV-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_bm_op_excess_return = monthly_stock_return.groupby(['month', 'size-BM-OP-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_bm_inv_excess_return = monthly_stock_return.groupby(['month', 'size-BM-INV-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_op_inv_excess_return = monthly_stock_return.groupby(['month', 'size-OP-INV-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)

In [84]:
"""
    GRS Test
    copy from https://github.com/SteffenGue/GRS_Test/blob/main/GRSTest.py
    I have checked the formula in this function, and it's correct.
"""


def grs_test(resid: np.ndarray, alpha: np.ndarray, factors: np.ndarray) -> tuple:
    """ 
        Perform the Gibbons, Ross and Shaken (1989) test.
        :param resid: Matrix of residuals from the OLS of size TxK.
        :param alpha: Vector of alphas from the OLS of size Kx1.
        :param factors: Matrix of factor returns of size TxJ.
        return Test statistic and pValue of the test statistic.
    """
    # Determine the time series and assets
    iT, iK = resid.shape

    # Determine the amount of risk factors
    iJ = factors.shape[1]

    # Input size checks
    assert alpha.shape == (iK, 1)
    assert factors.shape == (iT, iJ)

    # Covariance of the residuals, variables are in columns.
    mCov = np.cov(resid, rowvar=False)

    # Mean of excess returns of the risk factors
    vMuRF = np.nanmean(factors, axis=0)

    try:
        assert vMuRF.shape == (1, iJ)
    except AssertionError:
        vMuRF = vMuRF.reshape(1, iJ)

    # Duplicate this series for T timestamps
    mMuRF = np.repeat(vMuRF, iT, axis=0)

    # Test statistic
    mCovRF = (factors - mMuRF).T @ (factors - mMuRF) / (iT - 1)
    dTestStat = (iT / iK) * ((iT - iK - iJ) / (iT - iJ - 1)) * \
                (alpha.T @ (np.linalg.inv(mCov) @ alpha)) / \
                (1 + (vMuRF @ (np.linalg.inv(mCovRF) @ vMuRF.T)))

    pVal = 1 - f.cdf(dTestStat, iK, iT - iK - 1)

    return dTestStat, pVal

In [85]:
def regress(data, factors):
    X = factors
    X = sm.add_constant(X)
    X = X.values
    y = data
    model = sm.OLS(np.array(y.astype(float)), X.astype(float))
    result = model.fit()
    alpha = result.params[0]
    resid = result.resid
    return alpha, resid


def get_alpha_and_resid(data):
    alpha = pd.DataFrame(data.apply(lambda x: x[0]))
    resid = pd.DataFrame(data.apply(lambda x: x[1]))
    alpha.columns = ['alpha', ]
    alpha.reset_index(inplace=True)
    return alpha, resid

In [86]:
def get_grs_stat(excess_return, factors):
    grs = pd.DataFrame([[0, ] * 5] * 5, columns=['GRS', 'p-value of GRS', 'A|a|', 'A|a|/A|re|', 'A(a^2)/A(re^2)'],
                       index=["HML", "HML RMW", "HML CMA", "RMW CMA", "HML RMW CMA"],
                       dtype=np.float64)  # A(re) means average excess return

    ## 3 factors
    tmp = excess_return.apply(lambda x: regress(x, factors[['mkt_risk_premium', 'SMB', 'HML']]), axis=0)
    tmp1, resid = get_alpha_and_resid(tmp)
    tmp = grs_test(resid.to_numpy(), tmp1['alpha'].to_numpy().reshape(-1, 1),
                   factors[['mkt_risk_premium', 'SMB', 'HML']].to_numpy())
    grs.iloc[0, 0] = tmp[0][0][0]
    grs.iloc[0, 1] = tmp[1][0][0]
    grs.iloc[0, 2] = tmp1['alpha'].abs().mean()

    ## 4 factors: HML RMW
    tmp = excess_return.apply(lambda x: regress(x, factors[['mkt_risk_premium', 'SMB', 'HML', 'RMW']]), axis=0)
    tmp1, resid = get_alpha_and_resid(tmp)
    tmp = grs_test(resid.to_numpy(), tmp1['alpha'].to_numpy().reshape(-1, 1),
                   factors[['mkt_risk_premium', 'SMB', 'HML', 'RMW']].to_numpy())
    grs.iloc[1, 0] = tmp[0][0][0]
    grs.iloc[1, 1] = tmp[1][0][0]
    grs.iloc[1, 2] = tmp1['alpha'].abs().mean()

    ## 4 factors: HML CMA
    tmp = excess_return.apply(lambda x: regress(x, factors[['mkt_risk_premium', 'SMB', 'HML', 'CMA']]), axis=0)
    tmp1, resid = get_alpha_and_resid(tmp)
    tmp = grs_test(resid.to_numpy(), tmp1['alpha'].to_numpy().reshape(-1, 1),
                   factors[['mkt_risk_premium', 'SMB', 'HML', 'CMA']].to_numpy())
    grs.iloc[2, 0] = tmp[0][0][0]
    grs.iloc[2, 1] = tmp[1][0][0]
    grs.iloc[2, 2] = tmp1['alpha'].abs().mean()

    ## 4 factors: RMW CMA
    tmp = excess_return.apply(lambda x: regress(x, factors[['mkt_risk_premium', 'SMB', 'RMW', 'CMA']]), axis=0)
    tmp1, resid = get_alpha_and_resid(tmp)
    tmp = grs_test(resid.to_numpy(), tmp1['alpha'].to_numpy().reshape(-1, 1),
                   factors[['mkt_risk_premium', 'SMB', 'RMW', 'CMA']].to_numpy())
    grs.iloc[3, 0] = tmp[0][0][0]
    grs.iloc[3, 1] = tmp[1][0][0]
    grs.iloc[3, 2] = tmp1['alpha'].abs().mean()

    ## 5 factors
    tmp = excess_return.apply(lambda x: regress(x, factors[['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']]), axis=0)
    tmp1, resid = get_alpha_and_resid(tmp)
    tmp = grs_test(resid.to_numpy(), tmp1['alpha'].to_numpy().reshape(-1, 1),
                   factors[['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']].to_numpy())
    grs.iloc[4, 0] = tmp[0][0][0]
    grs.iloc[4, 1] = tmp[1][0][0]
    grs.iloc[4, 2] = tmp1['alpha'].abs().mean()

    excess_return['avg'] = excess_return.mean(axis=1)
    A_re = excess_return['avg'].mean()
    excess_return.drop(columns=['avg'], inplace=True)
    grs['A|a|/A|re|'] = grs['A|a|'] / A_re
    grs['A(a^2)/A(re^2)'] = grs['A|a|'] ** 2 / A_re ** 2

    return grs

In [87]:
# get grs stat sheet
grs_size_bm = get_grs_stat(size_bm_excess_return, factors)
grs_size_op = get_grs_stat(size_op_excess_return, factors)
grs_size_inv = get_grs_stat(size_inv_excess_return, factors)
grs_size_bm_op = get_grs_stat(size_bm_op_excess_return, factors)
grs_size_bm_inv = get_grs_stat(size_bm_inv_excess_return, factors)
grs_size_op_inv = get_grs_stat(size_op_inv_excess_return, factors)

In [88]:
grs_size_bm

Unnamed: 0,GRS,p-value of GRS,A|a|,A|a|/A|re|,A(a^2)/A(re^2)
HML,7.224993,1.110223e-16,0.098586,-20.872369,435.655789
HML RMW,7.07902,1.110223e-16,0.09592,-20.30798,412.41406
HML CMA,7.101663,1.110223e-16,0.098145,-20.778927,431.763827
RMW CMA,6.820461,1.110223e-16,0.090819,-19.228015,369.71657
HML RMW CMA,6.976048,1.110223e-16,0.095629,-20.246286,409.912091


In [89]:
grs_size_bm_inv

Unnamed: 0,GRS,p-value of GRS,A|a|,A|a|/A|re|,A(a^2)/A(re^2)
HML,9.836876,1.110223e-16,0.108068,-39.834555,1586.791774
HML RMW,9.544771,1.110223e-16,0.104947,-38.684153,1496.463688
HML CMA,9.864219,1.110223e-16,0.103744,-38.24084,1462.361813
RMW CMA,10.046626,1.110223e-16,0.095254,-35.111297,1232.803148
HML RMW CMA,9.608391,1.110223e-16,0.101017,-37.2357,1386.497345


In [90]:
grs_size_bm.to_excel(r'output/part1/grs_size_bm.xlsx')
grs_size_op.to_excel(r'output/part1/grs_size_op.xlsx')
grs_size_inv.to_excel(r'output/part1/grs_size_inv.xlsx')
grs_size_bm_op.to_excel(r'output/part1/grs_size_bm_op.xlsx')
grs_size_bm_inv.to_excel(r'output/part1/grs_size_bm_inv.xlsx')
grs_size_op_inv.to_excel(r'output/part1/grs_size_op_inv.xlsx')

## 1.4.2 Method 2: Construct the 5-factor model using 2*2 factors

In [91]:
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,S or B,"H, N or L",SMB_{B/M},...,ordered_INV,size-INV-rank,ordered_BM_for_32,ordered_OP_for_32,size-BM-OP-rank,ordered_INV_for_32,size-BM-INV-rank,size-OP-INV-rank,rf,monthly_stock_excess_return
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,B,H,BH,...,5,55,1,1,B11,4,B14,B14,0.0225,-0.01528
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,B,H,BH,...,5,55,1,1,B11,4,B14,B14,0.0225,-0.001539
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,B,H,BH,...,5,55,1,1,B11,4,B14,B14,0.0225,-0.063559
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,B,H,BH,...,5,55,1,1,B11,4,B14,B14,0.0225,-0.067008
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,B,H,BH,...,5,55,1,1,B11,4,B14,B14,0.0225,0.012286
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,B,H,BH,...,5,55,1,1,B11,4,B14,B14,0.0225,-0.012097
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,B,H,BH,...,5,55,1,4,B14,4,B14,B44,0.0225,-0.085123
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,B,H,BH,...,1,51,1,4,B14,1,B11,B41,0.0225,0.009179
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,B,H,BH,...,1,51,1,4,B14,1,B11,B41,0.0225,-0.081911
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,B,H,BH,...,1,51,1,4,B14,1,B11,B41,0.0225,0.128673


In [92]:
monthly_stock_return = monthly_stock_return[
    ['Stkcd', 'month', 'monthly_stock_return', 'market_value', 'total_assets', 'total_shareholders_equity', 'BM ratio',
     'ROE', 'assets_increasing_rate', 'rf', 'monthly_stock_excess_return']]
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,ROE,assets_increasing_rate,rf,monthly_stock_excess_return
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,0.0,,0.0225,-0.01528
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,0.0,,0.0225,-0.001539
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,0.0,,0.0225,-0.063559
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,0.0,,0.0225,-0.067008
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,0.0,,0.0225,0.012286
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,0.0,,0.0225,-0.012097
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,0.106893,0.351786,0.0225,-0.085123
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,0.131619,-0.018167,0.0225,0.009179
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,0.131619,-0.018167,0.0225,-0.081911
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,0.131619,-0.018167,0.0225,0.128673


In [93]:
def S_or_B(x):
    edge = x.quantile(.5)
    tmp = x.apply(lambda y: 'S' if y < edge else 'B')
    return tmp


def H_or_L(x):
    edge = x.quantile(.5)
    tmp = x.apply(lambda y: 'L' if y < edge else 'H')
    return tmp


def R_or_W(x):
    edge = x.quantile(.5)
    tmp = x.apply(lambda y: 'W' if y < edge else 'R')
    return tmp


def C_or_A(x):
    edge = x.quantile(.5)
    tmp = x.apply(lambda y: 'C' if y < edge else 'A')
    return tmp


monthly_stock_return['S or B'] = monthly_stock_return.groupby("month")['market_value'].transform(
    S_or_B)  # divide the stocks into 2 groups according to the market value
monthly_stock_return['H or L'] = monthly_stock_return.groupby("month")['BM ratio'].transform(H_or_L)
monthly_stock_return['R or W'] = monthly_stock_return.groupby("month")['ROE'].transform(R_or_W)
monthly_stock_return['C or A'] = monthly_stock_return.groupby("month")['assets_increasing_rate'].transform(C_or_A)
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,ROE,assets_increasing_rate,rf,monthly_stock_excess_return,S or B,H or L,R or W,C or A
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,0.0,,0.0225,-0.01528,B,L,W,A
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,0.0,,0.0225,-0.001539,B,L,W,A
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,0.0,,0.0225,-0.063559,B,L,W,A
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,0.0,,0.0225,-0.067008,B,L,W,A
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,0.0,,0.0225,0.012286,B,L,W,A
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,0.0,,0.0225,-0.012097,B,L,W,A
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,0.106893,0.351786,0.0225,-0.085123,B,L,R,A
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,0.131619,-0.018167,0.0225,0.009179,B,L,R,C
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,0.131619,-0.018167,0.0225,-0.081911,B,L,R,C
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,0.131619,-0.018167,0.0225,0.128673,B,L,R,C


In [94]:
monthly_stock_return['size-BM'] = monthly_stock_return['S or B'].astype(str) + monthly_stock_return['H or L'].astype(
    str)
monthly_stock_return['size-OP'] = monthly_stock_return['S or B'].astype(str) + monthly_stock_return['R or W'].astype(
    str)
monthly_stock_return['size-INV'] = monthly_stock_return['S or B'].astype(str) + monthly_stock_return['C or A'].astype(
    str)

monthly_stock_return['size-BM'] = monthly_stock_return['size-BM'].astype('category')
monthly_stock_return['size-OP'] = monthly_stock_return['size-OP'].astype('category')
monthly_stock_return['size-INV'] = monthly_stock_return['size-INV'].astype('category')

monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,ROE,assets_increasing_rate,rf,monthly_stock_excess_return,S or B,H or L,R or W,C or A,size-BM,size-OP,size-INV
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,0.0,,0.0225,-0.01528,B,L,W,A,BL,BW,BA
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,0.0,,0.0225,-0.001539,B,L,W,A,BL,BW,BA
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,0.0,,0.0225,-0.063559,B,L,W,A,BL,BW,BA
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,0.0,,0.0225,-0.067008,B,L,W,A,BL,BW,BA
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,0.0,,0.0225,0.012286,B,L,W,A,BL,BW,BA
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,0.0,,0.0225,-0.012097,B,L,W,A,BL,BW,BA
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,0.106893,0.351786,0.0225,-0.085123,B,L,R,A,BL,BR,BA
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,0.131619,-0.018167,0.0225,0.009179,B,L,R,C,BL,BR,BC
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,0.131619,-0.018167,0.0225,-0.081911,B,L,R,C,BL,BR,BC
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,0.131619,-0.018167,0.0225,0.128673,B,L,R,C,BL,BR,BC


In [95]:
# get portfolios' monthly value-weighted return
tmp1 = monthly_stock_return.groupby(['month', 'size-BM'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
tmp2 = monthly_stock_return.groupby(['month', 'size-OP'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
tmp3 = monthly_stock_return.groupby(['month', 'size-INV'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)

In [96]:
source_group2 = pd.concat([tmp1, tmp2, tmp3], axis=1)
source_group2 = source_group2.merge(mkt_risk_premium, left_on='month', right_on='month', how='left')
source_group2

Unnamed: 0,month,BH,BL,SH,SL,BR,BW,SR,SW,BA,BC,SA,SC,mkt_risk_premium
0,2000-01,0.126442,0.198886,0.073895,0.090960,0.190532,0.046225,0.087112,0.042091,0.181809,0.204676,0.079939,0.083963,-0.244185
1,2000-02,0.099394,0.139327,0.082207,0.100308,0.128194,0.222065,0.086709,0.105413,0.135020,0.114917,0.083139,0.113858,-0.143940
2,2000-03,0.090282,0.075096,0.100454,0.101725,0.070956,0.233654,0.099246,0.110978,0.076131,0.087716,0.100498,0.102736,-0.373996
3,2000-04,0.010775,0.034403,0.002457,-0.019574,0.028958,0.054892,-0.004133,-0.017143,0.030932,0.025392,-0.004208,-0.012462,-0.357240
4,2000-05,0.089361,0.032824,0.036829,0.016839,0.034940,0.217947,0.030690,0.019230,0.040456,0.060579,0.032888,0.014762,-0.316809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,2023-08,-0.078471,-0.056682,-0.042736,-0.062042,-0.056349,-0.083141,-0.042905,-0.062278,-0.056753,-0.069427,-0.043124,-0.065549,-0.333879
284,2023-09,-0.030407,-0.018664,-0.018946,-0.020614,-0.018971,-0.009891,-0.019051,-0.017896,-0.018741,-0.022968,-0.018882,-0.025290,-0.269051
285,2023-10,-0.064948,-0.031459,-0.024001,-0.027933,-0.032195,-0.015189,-0.023992,-0.030451,-0.032018,-0.029207,-0.023957,-0.034066,-0.243563
286,2023-11,-0.003146,-0.007559,0.048997,0.030210,-0.007719,0.007701,0.048485,0.037628,-0.007527,-0.006112,0.048303,0.042958,-0.280325


In [97]:
source_group2['SMB'] = (source_group2['SH'] + source_group2['SL'] + source_group2['SR'] + source_group2['SW'] +
                        source_group2['SC'] + source_group2['SA']) / 6 - (
                                   source_group2['BH'] + source_group2['BL'] + source_group2['BR'] + source_group2[
                               'BW'] + source_group2['BC'] + source_group2['BA']) / 6
source_group2['HML'] = (source_group2['SH'] + source_group2['BH']) / 2 - (source_group2['SL'] + source_group2['BL']) / 2
source_group2['RMW'] = (source_group2['SR'] + source_group2['BR']) / 2 - (source_group2['SW'] + source_group2['BW']) / 2
source_group2['CMA'] = (source_group2['SC'] + source_group2['BC']) / 2 - (source_group2['SA'] + source_group2['BA']) / 2
source_group2.head(5)

Unnamed: 0,month,BH,BL,SH,SL,BR,BW,SR,SW,BA,BC,SA,SC,mkt_risk_premium,SMB,HML,RMW,CMA
0,2000-01,0.126442,0.198886,0.073895,0.09096,0.190532,0.046225,0.087112,0.042091,0.181809,0.204676,0.079939,0.083963,-0.244185,-0.081768,-0.044754,0.094664,0.013446
1,2000-02,0.099394,0.139327,0.082207,0.100308,0.128194,0.222065,0.086709,0.105413,0.13502,0.114917,0.083139,0.113858,-0.14394,-0.044547,-0.029017,-0.056287,0.005308
2,2000-03,0.090282,0.075096,0.100454,0.101725,0.070956,0.233654,0.099246,0.110978,0.076131,0.087716,0.100498,0.102736,-0.373996,-0.003033,0.006957,-0.087215,0.006912
3,2000-04,0.010775,0.034403,0.002457,-0.019574,0.028958,0.054892,-0.004133,-0.017143,0.030932,0.025392,-0.004208,-0.012462,-0.35724,-0.040069,-0.000798,-0.006462,-0.006897
4,2000-05,0.089361,0.032824,0.036829,0.016839,0.03494,0.217947,0.03069,0.01923,0.040456,0.060579,0.032888,0.014762,-0.316809,-0.054145,0.038263,-0.085774,0.000998


In [98]:
factors2 = source_group2[['month', 'mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']]
factors2

Unnamed: 0,month,mkt_risk_premium,SMB,HML,RMW,CMA
0,2000-01,-0.244185,-0.081768,-0.044754,0.094664,0.013446
1,2000-02,-0.143940,-0.044547,-0.029017,-0.056287,0.005308
2,2000-03,-0.373996,-0.003033,0.006957,-0.087215,0.006912
3,2000-04,-0.357240,-0.040069,-0.000798,-0.006462,-0.006897
4,2000-05,-0.316809,-0.054145,0.038263,-0.085774,0.000998
...,...,...,...,...,...,...
283,2023-08,-0.333879,0.013698,-0.001242,0.023082,-0.017549
284,2023-09,-0.269051,-0.000173,-0.005037,-0.005118,-0.005317
285,2023-10,-0.243563,0.006769,-0.014778,-0.005274,-0.003649
286,2023-11,-0.280325,0.046824,0.011600,-0.002281,-0.001965


### run regression

In [99]:
tmp_return.head(10)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,rf,excess_return,mkt_risk_premium,SMB,HML,RMW,CMA
0,1,2000-01,0.061891,0.0225,0.039391,-0.244185,-0.058018,0.116059,-0.070045,0.0
1,1,2000-02,-0.011333,0.0225,-0.033833,-0.14394,-0.006611,0.108703,-0.098777,0.0
2,1,2000-03,0.002729,0.0225,-0.019771,-0.373996,0.081271,0.043078,0.042781,0.0
3,1,2000-04,0.037017,0.0225,0.014517,-0.35724,-0.010071,0.016347,-0.013956,0.0
4,1,2000-05,-0.055118,0.0225,-0.077618,-0.316809,-0.624086,-1.267026,-0.113386,-1.478481
5,1,2000-06,0.007222,0.0225,-0.015278,-0.377593,-0.003166,0.011558,-0.044941,-0.01694
6,1,2000-07,0.02096,0.0225,-0.00154,-0.353087,-0.086534,-0.381313,-0.026827,-0.032799
7,1,2000-08,-0.041059,0.0225,-0.063559,-0.41084,-0.120883,-0.587288,-0.007072,-0.038587
8,1,2000-09,-0.044507,0.0225,-0.067007,-0.408768,0.025554,-0.02004,-0.00711,-0.009674
9,1,2000-10,0.034788,0.0225,0.012288,-0.301681,-0.010574,-0.125269,-0.023961,-0.021539


In [100]:
tmp_return.drop(columns=['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA'], inplace=True)
tmp_return.head(5)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,rf,excess_return
0,1,2000-01,0.061891,0.0225,0.039391
1,1,2000-02,-0.011333,0.0225,-0.033833
2,1,2000-03,0.002729,0.0225,-0.019771
3,1,2000-04,0.037017,0.0225,0.014517
4,1,2000-05,-0.055118,0.0225,-0.077618


In [101]:
a = tmp_return['Trdmnt'].apply(
    lambda x: factors2[factors2['month'] == x][['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']])
tmp_return.loc[:, ['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']] = pd.concat(a.tolist(), ignore_index=True)
tmp_return.head(5)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,rf,excess_return,mkt_risk_premium,SMB,HML,RMW,CMA
0,1,2000-01,0.061891,0.0225,0.039391,-0.244185,-0.081768,-0.044754,0.094664,0.013446
1,1,2000-02,-0.011333,0.0225,-0.033833,-0.14394,-0.044547,-0.029017,-0.056287,0.005308
2,1,2000-03,0.002729,0.0225,-0.019771,-0.373996,-0.003033,0.006957,-0.087215,0.006912
3,1,2000-04,0.037017,0.0225,0.014517,-0.35724,-0.040069,-0.000798,-0.006462,-0.006897
4,1,2000-05,-0.055118,0.0225,-0.077618,-0.316809,-0.054145,0.038263,-0.085774,0.000998


In [102]:
def regress(data):
    X = data[['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']]
    X = sm.add_constant(X)
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X.fillna(0, inplace=True)
    y = data['excess_return']
    model = sm.OLS(np.array(y.astype(float)), X.astype(float))
    result = model.fit()
    params_and_tvalues = np.append(result.params, result.tvalues)
    return params_and_tvalues


betas2 = (tmp_return.groupby('Stkcd').apply(regress, include_groups=False)).apply(
    pd.Series)  # calculate the correlation between the stock return and the risk premium
betas2.columns = ['const', 'risk_premium', 'smb', 'hml', 'rmw', 'cma', 't-const', 't-risk_premium', 't-smb', 't-hml',
                  't-rmw', 't-cma']
betas2.head(5)

Unnamed: 0_level_0,const,risk_premium,smb,hml,rmw,cma,t-const,t-risk_premium,t-smb,t-hml,t-rmw,t-cma
Stkcd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.124116,0.382294,-0.01292,-0.410589,-0.212623,-0.047568,6.269596,7.136478,-0.147845,-2.210479,-1.435704,-1.01706
2,0.031558,0.099177,0.024538,-0.36787,0.28229,0.044442,1.244996,1.447498,0.219755,-1.54805,1.477054,0.74195
3,0.107096,0.625058,-3.58986,2.378934,-0.333583,0.624575,0.637053,1.212318,-2.799,1.615616,-0.37431,0.423553
4,0.023287,0.087426,0.014483,-0.45553,-0.043218,0.029303,0.765728,1.066431,0.108714,-1.607739,-0.191584,0.414232
5,-0.033016,-0.065413,-0.056427,-0.553752,-0.105004,0.043372,-0.723513,-0.519013,-0.286606,-1.329912,-0.310612,0.41692


## GRS test

In [103]:
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,ROE,assets_increasing_rate,rf,monthly_stock_excess_return,S or B,H or L,R or W,C or A,size-BM,size-OP,size-INV
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,0.0,,0.0225,-0.01528,B,L,W,A,BL,BW,BA
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,0.0,,0.0225,-0.001539,B,L,W,A,BL,BW,BA
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,0.0,,0.0225,-0.063559,B,L,W,A,BL,BW,BA
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,0.0,,0.0225,-0.067008,B,L,W,A,BL,BW,BA
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,0.0,,0.0225,0.012286,B,L,W,A,BL,BW,BA
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,0.0,,0.0225,-0.012097,B,L,W,A,BL,BW,BA
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,0.106893,0.351786,0.0225,-0.085123,B,L,R,A,BL,BR,BA
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,0.131619,-0.018167,0.0225,0.009179,B,L,R,C,BL,BR,BC
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,0.131619,-0.018167,0.0225,-0.081911,B,L,R,C,BL,BR,BC
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,0.131619,-0.018167,0.0225,0.128673,B,L,R,C,BL,BR,BC


In [104]:
monthly_stock_return = add_rank(monthly_stock_return)
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,ROE,assets_increasing_rate,rf,...,ordered_OP,size-OP-rank,ordered_INV,size-INV-rank,ordered_BM_for_32,ordered_OP_for_32,size-BM-OP-rank,ordered_INV_for_32,size-BM-INV-rank,size-OP-INV-rank
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,0.106893,0.351786,0.0225,...,5,55,5,55,1,4,B14,4,B14,B44
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,0.131619,-0.018167,0.0225,...,5,55,1,51,1,4,B14,1,B11,B41
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,0.131619,-0.018167,0.0225,...,5,55,1,51,1,4,B14,1,B11,B41
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,0.131619,-0.018167,0.0225,...,5,55,1,51,1,4,B14,1,B11,B41


In [105]:
# get portfolios' monthly value-weighted return
size_bm_excess_return2 = monthly_stock_return.groupby(['month', 'size-BM-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_op_excess_return2 = monthly_stock_return.groupby(['month', 'size-OP-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_inv_excess_return2 = monthly_stock_return.groupby(['month', 'size-INV-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_bm_op_excess_return2 = monthly_stock_return.groupby(['month', 'size-BM-OP-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_bm_inv_excess_return2 = monthly_stock_return.groupby(['month', 'size-BM-INV-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_op_inv_excess_return2 = monthly_stock_return.groupby(['month', 'size-OP-INV-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)

In [106]:
def regress(data, factors):
    X = factors
    X = sm.add_constant(X)
    X = X.values
    y = data
    model = sm.OLS(np.array(y.astype(float)), X.astype(float))
    result = model.fit()
    alpha = result.params[0]
    resid = result.resid
    return alpha, resid

In [107]:
# get grs stat sheet
grs_size_bm2 = get_grs_stat(size_bm_excess_return2, factors2)
grs_size_op2 = get_grs_stat(size_op_excess_return2, factors2)
grs_size_inv2 = get_grs_stat(size_inv_excess_return2, factors2)
grs_size_bm_op2 = get_grs_stat(size_bm_op_excess_return2, factors2)
grs_size_bm_inv2 = get_grs_stat(size_bm_inv_excess_return2, factors2)
grs_size_op_inv2 = get_grs_stat(size_op_inv_excess_return2, factors2)

In [108]:
grs_size_bm2

Unnamed: 0,GRS,p-value of GRS,A|a|,A|a|/A|re|,A(a^2)/A(re^2)
HML,6.918502,1.110223e-16,0.090134,-19.082887,364.156566
HML RMW,6.981463,1.110223e-16,0.09048,-19.15622,366.960752
HML CMA,6.967943,1.110223e-16,0.089736,-18.998669,360.949415
RMW CMA,6.72373,1.110223e-16,0.096951,-20.526186,421.324321
HML RMW CMA,6.997778,1.110223e-16,0.090728,-19.208795,368.977822


In [109]:
grs_size_bm_op2

Unnamed: 0,GRS,p-value of GRS,A|a|,A|a|/A|re|,A(a^2)/A(re^2)
HML,9.440645,1.110223e-16,0.086062,-27.939058,780.590987
HML RMW,11.069032,1.110223e-16,0.086459,-28.067918,787.80803
HML CMA,9.462591,1.110223e-16,0.085566,-27.777986,771.616488
RMW CMA,11.343407,1.110223e-16,0.09235,-29.980337,898.820615
HML RMW CMA,11.200711,1.110223e-16,0.086652,-28.130539,791.327244


In [110]:
grs_size_bm2.to_excel(r'output/part1/grs_size_bm2.xlsx')
grs_size_op2.to_excel(r'output/part1/grs_size_op2.xlsx')
grs_size_inv2.to_excel(r'output/part1/grs_size_inv2.xlsx')
grs_size_bm_op2.to_excel(r'output/part1/grs_size_bm_op2.xlsx')
grs_size_bm_inv2.to_excel(r'output/part1/grs_size_bm_inv2.xlsx')
grs_size_op_inv2.to_excel(r'output/part1/grs_size_op_inv2.xlsx')

## 1.4.3 Method 3: Construct the 5-factor model using 2\*2\*2\*2 factors

In [111]:
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,ROE,assets_increasing_rate,rf,...,ordered_OP,size-OP-rank,ordered_INV,size-INV-rank,ordered_BM_for_32,ordered_OP_for_32,size-BM-OP-rank,ordered_INV_for_32,size-BM-INV-rank,size-OP-INV-rank
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,0.106893,0.351786,0.0225,...,5,55,5,55,1,4,B14,4,B14,B44
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,0.131619,-0.018167,0.0225,...,5,55,1,51,1,4,B14,1,B11,B41
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,0.131619,-0.018167,0.0225,...,5,55,1,51,1,4,B14,1,B11,B41
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,0.131619,-0.018167,0.0225,...,5,55,1,51,1,4,B14,1,B11,B41


In [112]:
monthly_stock_return = monthly_stock_return[
    ['Stkcd', 'month', 'monthly_stock_return', 'market_value', 'total_assets', 'total_shareholders_equity', 'BM ratio',
     'ROE', 'assets_increasing_rate', 'rf', 'monthly_stock_excess_return', 'S or B', 'H or L', 'R or W', 'C or A']]
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,ROE,assets_increasing_rate,rf,monthly_stock_excess_return,S or B,H or L,R or W,C or A
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,0.0,,0.0225,-0.01528,B,L,W,A
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,0.0,,0.0225,-0.001539,B,L,W,A
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,0.0,,0.0225,-0.063559,B,L,W,A
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,0.0,,0.0225,-0.067008,B,L,W,A
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,0.0,,0.0225,0.012286,B,L,W,A
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,0.0,,0.0225,-0.012097,B,L,W,A
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,0.106893,0.351786,0.0225,-0.085123,B,L,R,A
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,0.131619,-0.018167,0.0225,0.009179,B,L,R,C
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,0.131619,-0.018167,0.0225,-0.081911,B,L,R,C
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,0.131619,-0.018167,0.0225,0.128673,B,L,R,C


In [113]:
monthly_stock_return['tag'] = monthly_stock_return['S or B'].astype(str) + monthly_stock_return['H or L'].astype(str) + \
                              monthly_stock_return['R or W'].astype(str) + monthly_stock_return['C or A'].astype(str)
monthly_stock_return['tag'] = monthly_stock_return['tag'].astype('category')
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,ROE,assets_increasing_rate,rf,monthly_stock_excess_return,S or B,H or L,R or W,C or A,tag
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,0.0,,0.0225,-0.01528,B,L,W,A,BLWA
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,0.0,,0.0225,-0.001539,B,L,W,A,BLWA
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,0.0,,0.0225,-0.063559,B,L,W,A,BLWA
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,0.0,,0.0225,-0.067008,B,L,W,A,BLWA
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,0.0,,0.0225,0.012286,B,L,W,A,BLWA
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,0.0,,0.0225,-0.012097,B,L,W,A,BLWA
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,0.106893,0.351786,0.0225,-0.085123,B,L,R,A,BLRA
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,0.131619,-0.018167,0.0225,0.009179,B,L,R,C,BLRC
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,0.131619,-0.018167,0.0225,-0.081911,B,L,R,C,BLRC
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,0.131619,-0.018167,0.0225,0.128673,B,L,R,C,BLRC


In [114]:
source_group3 = monthly_stock_return.groupby(['month', 'tag'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
source_group3.head(10)

tag,BHRA,BHRC,BHWA,BHWC,BLRA,BLRC,BLWA,BLWC,SHRA,SHRC,SHWA,SHWC,SLRA,SLRC,SLWA,SLWC
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2000-01,0.126442,0.0,0.0,0.0,0.200196,0.236882,0.123055,-0.063804,0.073895,0.0,0.0,0.0,0.115687,0.117048,0.070259,0.01696
2000-02,0.099394,0.0,0.0,0.0,0.143533,0.091559,0.183058,0.261153,0.082207,0.0,0.0,0.0,0.077533,0.114104,0.098927,0.113318
2000-03,0.090282,0.0,0.0,0.0,0.064648,0.074967,0.287178,0.172039,0.100454,0.0,0.0,0.0,0.093192,0.098558,0.111392,0.110523
2000-04,0.010775,0.0,0.0,0.0,0.033725,0.030629,0.102934,-0.009369,0.002457,0.0,0.0,0.0,-0.026335,-0.016702,-0.030466,-0.003978
2000-05,0.049662,0.0,0.0,2.934462,0.037149,0.001364,0.058377,0.056485,0.036829,0.0,0.0,0.0,0.014552,0.016283,0.027088,0.011813
2000-06,0.020104,0.0,0.0,0.0,0.05209,-0.014675,0.024029,0.113071,0.065097,0.0,0.0,0.0,-1.3e-05,-0.037585,-0.012083,0.002403
2000-07,0.066376,0.739636,0.0,0.0,0.079065,0.020871,0.044421,-0.00446,0.072019,0.0,0.0,0.0,0.02019,0.069265,0.020227,0.088052
2000-08,0.00301,0.080349,0.0,2.209267,-0.025773,-0.021282,-0.037818,-0.027639,0.01905,0.0,0.0,0.0,-0.049188,0.01323,-0.008492,0.023887
2000-09,-0.046521,-0.089944,0.0,-0.029752,-0.067921,0.023972,-0.046285,-0.007848,-0.02518,0.0,0.0,0.0,-0.069239,-0.048071,-0.065941,-0.001318
2000-10,0.041378,0.003393,0.0,0.022142,-0.003376,-0.016182,0.010382,0.025316,0.036946,0.0,0.0,0.0,0.006909,-0.014848,0.02365,0.022011


In [115]:
source_group3 = source_group3.merge(mkt_risk_premium, left_on='month', right_on='month', how='left')
source_group3

Unnamed: 0,month,BHRA,BHRC,BHWA,BHWC,BLRA,BLRC,BLWA,BLWC,SHRA,SHRC,SHWA,SHWC,SLRA,SLRC,SLWA,SLWC,mkt_risk_premium
0,2000-01,0.126442,0.000000,0.000000,0.000000,0.200196,0.236882,0.123055,-0.063804,0.073895,0.000000,0.000000,0.000000,0.115687,0.117048,0.070259,0.016960,-0.244185
1,2000-02,0.099394,0.000000,0.000000,0.000000,0.143533,0.091559,0.183058,0.261153,0.082207,0.000000,0.000000,0.000000,0.077533,0.114104,0.098927,0.113318,-0.143940
2,2000-03,0.090282,0.000000,0.000000,0.000000,0.064648,0.074967,0.287178,0.172039,0.100454,0.000000,0.000000,0.000000,0.093192,0.098558,0.111392,0.110523,-0.373996
3,2000-04,0.010775,0.000000,0.000000,0.000000,0.033725,0.030629,0.102934,-0.009369,0.002457,0.000000,0.000000,0.000000,-0.026335,-0.016702,-0.030466,-0.003978,-0.357240
4,2000-05,0.049662,0.000000,0.000000,2.934462,0.037149,0.001364,0.058377,0.056485,0.036829,0.000000,0.000000,0.000000,0.014552,0.016283,0.027088,0.011813,-0.316809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,2023-08,-0.078051,0.000000,-0.072823,-0.083801,-0.056202,-0.046542,-0.077818,-0.093512,-0.042678,0.000000,-0.094956,0.000000,-0.067155,-0.058567,-0.055727,-0.066789,-0.333879
284,2023-09,-0.022372,-0.056056,-0.040270,-0.054786,-0.018723,-0.026876,-0.013028,0.018041,-0.018956,0.000000,0.000000,0.001566,-0.039021,-0.008431,-0.002564,-0.034667,-0.269051
285,2023-10,-0.050726,-0.148793,-0.061497,-0.086551,-0.031960,-0.022758,-0.009659,0.004221,-0.024002,0.000000,0.000000,-0.022408,-0.006846,-0.045356,-0.032348,-0.028926,-0.243563
286,2023-11,0.002039,-0.005291,0.007686,-0.056524,-0.007601,-0.021285,-0.013998,0.059799,0.049031,0.000000,0.000000,-0.019479,0.015281,0.016911,0.016583,0.058151,-0.280325


In [116]:
source_group3['SMB'] = (source_group3['SHRC'] + source_group3['SHRA'] + source_group3['SHWC'] + source_group3['SHWA'] +
                        source_group3['SLRC'] + source_group3['SLRA'] + source_group3['SLWC'] + source_group3[
                            'SLWA']) / 8 - (source_group3['BHRC'] + source_group3['BHRA'] + source_group3['BHWC'] +
                                            source_group3['BHWA'] + source_group3['BLRC'] + source_group3['BLRA'] +
                                            source_group3['BLWC'] + source_group3['BLWA']) / 8
source_group3['HML'] = (source_group3['SHRC'] + source_group3['SHRA'] + source_group3['SHWC'] + source_group3['SHWA'] +
                        source_group3['BHRC'] + source_group3['BHRA'] + source_group3['BHWC'] + source_group3[
                            'BHWA']) / 8 - (source_group3['SLRC'] + source_group3['SLRA'] + source_group3['SLWC'] +
                                            source_group3['SLWA'] + source_group3['BLRC'] + source_group3['BLRA'] +
                                            source_group3['BLWC'] + source_group3['BLWA']) / 8
source_group3['RMW'] = (source_group3['SHRC'] + source_group3['SHRA'] + source_group3['SLRC'] + source_group3['SLRA'] +
                        source_group3['BHRC'] + source_group3['BHRA'] + source_group3['BLRC'] + source_group3[
                            'BLRA']) / 8 - (source_group3['SHWC'] + source_group3['SHWA'] + source_group3['SLWC'] +
                                            source_group3['SLWA'] + source_group3['BHWC'] + source_group3['BHWA'] +
                                            source_group3['BLWC'] + source_group3['BLWA']) / 8
source_group3['CMA'] = (source_group3['SHRC'] + source_group3['SHWC'] + source_group3['SLRC'] + source_group3['SLWC'] +
                        source_group3['BHRC'] + source_group3['BHWC'] + source_group3['BLRC'] + source_group3[
                            'BLWC']) / 8 - (source_group3['SHRA'] + source_group3['SHWA'] + source_group3['SLRA'] +
                                            source_group3['SLWA'] + source_group3['BHRA'] + source_group3['BHWA'] +
                                            source_group3['BLRA'] + source_group3['BLWA']) / 8

In [117]:
factors3 = source_group3[['month', 'mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']]
factors3

Unnamed: 0,month,mkt_risk_premium,SMB,HML,RMW,CMA
0,2000-01,-0.244185,-0.028615,-0.076993,0.090460,-0.050306
1,2000-02,-0.143940,-0.036576,-0.112698,-0.006016,-0.013065
2,2000-03,-0.373996,-0.021874,-0.102720,-0.019879,-0.036382
3,2000-04,-0.357240,-0.030465,-0.008400,-0.003071,-0.011564
4,2000-05,-0.316809,-0.378867,0.349730,-0.366548,0.349594
...,...,...,...,...,...,...
283,2023-08,-0.333879,0.015360,0.018750,0.024529,0.024525
284,2023-09,-0.269051,0.014000,-0.008200,-0.008091,-0.000784
285,2023-10,-0.243563,0.030979,-0.027543,-0.011659,-0.016692
286,2023-11,-0.280325,0.021457,-0.018297,-0.000391,-0.004592


### run regression

In [118]:
tmp_return.head(10)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,rf,excess_return,mkt_risk_premium,SMB,HML,RMW,CMA
0,1,2000-01,0.061891,0.0225,0.039391,-0.244185,-0.081768,-0.044754,0.094664,0.013446
1,1,2000-02,-0.011333,0.0225,-0.033833,-0.14394,-0.044547,-0.029017,-0.056287,0.005308
2,1,2000-03,0.002729,0.0225,-0.019771,-0.373996,-0.003033,0.006957,-0.087215,0.006912
3,1,2000-04,0.037017,0.0225,0.014517,-0.35724,-0.040069,-0.000798,-0.006462,-0.006897
4,1,2000-05,-0.055118,0.0225,-0.077618,-0.316809,-0.054145,0.038263,-0.085774,0.000998
5,1,2000-06,0.007222,0.0225,-0.015278,-0.377593,-0.017823,0.022033,0.035237,-0.019169
6,1,2000-07,0.02096,0.0225,-0.00154,-0.353087,-0.002085,0.016255,0.031497,-0.003629
7,1,2000-08,-0.041059,0.0225,-0.063559,-0.41084,0.005132,0.037897,-0.000651,0.05865
8,1,2000-09,-0.044507,0.0225,-0.067007,-0.408768,0.011631,0.021134,-0.00368,0.044594
9,1,2000-10,0.034788,0.0225,0.012288,-0.301681,0.013758,0.031388,0.000685,-0.008852


In [119]:
tmp_return.drop(columns=['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA'], inplace=True)
tmp_return.head(5)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,rf,excess_return
0,1,2000-01,0.061891,0.0225,0.039391
1,1,2000-02,-0.011333,0.0225,-0.033833
2,1,2000-03,0.002729,0.0225,-0.019771
3,1,2000-04,0.037017,0.0225,0.014517
4,1,2000-05,-0.055118,0.0225,-0.077618


In [120]:
a = tmp_return['Trdmnt'].apply(
    lambda x: factors3[factors3['month'] == x][['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']])
tmp_return.loc[:, ['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']] = pd.concat(a.tolist(), ignore_index=True)
tmp_return.head(5)

Unnamed: 0,Stkcd,Trdmnt,Mretnd,rf,excess_return,mkt_risk_premium,SMB,HML,RMW,CMA
0,1,2000-01,0.061891,0.0225,0.039391,-0.244185,-0.028615,-0.076993,0.09046,-0.050306
1,1,2000-02,-0.011333,0.0225,-0.033833,-0.14394,-0.036576,-0.112698,-0.006016,-0.013065
2,1,2000-03,0.002729,0.0225,-0.019771,-0.373996,-0.021874,-0.10272,-0.019879,-0.036382
3,1,2000-04,0.037017,0.0225,0.014517,-0.35724,-0.030465,-0.0084,-0.003071,-0.011564
4,1,2000-05,-0.055118,0.0225,-0.077618,-0.316809,-0.378867,0.34973,-0.366548,0.349594


In [121]:
def regress(data):
    X = data[['mkt_risk_premium', 'SMB', 'HML', 'RMW', 'CMA']]
    X = sm.add_constant(X)
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X.fillna(0, inplace=True)
    y = data['excess_return']
    model = sm.OLS(np.array(y.astype(float)), X.astype(float))
    result = model.fit()
    params_and_tvalues = np.append(result.params, result.tvalues)
    return params_and_tvalues


betas3 = (tmp_return.groupby('Stkcd').apply(regress, include_groups=False)).apply(
    pd.Series)  # calculate the correlation between the stock return and the risk premium
betas3.columns = ['const', 'risk_premium', 'smb', 'hml', 'rmw', 'cma', 't-const', 't-risk_premium', 't-smb', 't-hml',
                  't-rmw', 't-cma']
betas3.head(5)

Unnamed: 0_level_0,const,risk_premium,smb,hml,rmw,cma,t-const,t-risk_premium,t-smb,t-hml,t-rmw,t-cma
Stkcd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.033402,0.153646,-0.663053,-0.433151,0.627349,-0.224115,1.67273,2.843162,-4.954787,-6.902714,4.414218,-2.965909
2,-0.000401,0.022098,-0.288662,-0.077032,0.469198,-0.032883,-0.014009,0.286895,-1.489095,-0.863368,2.282741,-0.305038
3,0.011057,0.346555,-3.837241,0.551929,-0.456184,-4.295514,0.058473,0.626005,-3.224678,0.629196,-0.650726,-2.742093
4,-0.001322,0.046681,-0.566334,-0.023207,0.563968,0.217293,-0.038757,0.508886,-2.484827,-0.220431,2.328627,1.714355
5,-0.026274,-0.043128,0.093165,-0.051949,-0.343391,0.116343,-0.509721,-0.303294,0.270426,-0.326606,-0.938695,0.611925


## GRS test

In [122]:
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,ROE,assets_increasing_rate,rf,monthly_stock_excess_return,S or B,H or L,R or W,C or A,tag
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,0.0,,0.0225,-0.01528,B,L,W,A,BLWA
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,0.0,,0.0225,-0.001539,B,L,W,A,BLWA
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,0.0,,0.0225,-0.063559,B,L,W,A,BLWA
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,0.0,,0.0225,-0.067008,B,L,W,A,BLWA
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,0.0,,0.0225,0.012286,B,L,W,A,BLWA
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,0.0,,0.0225,-0.012097,B,L,W,A,BLWA
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,0.106893,0.351786,0.0225,-0.085123,B,L,R,A,BLRA
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,0.131619,-0.018167,0.0225,0.009179,B,L,R,C,BLRC
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,0.131619,-0.018167,0.0225,-0.081911,B,L,R,C,BLRC
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,0.131619,-0.018167,0.0225,0.128673,B,L,R,C,BLRC


In [123]:
monthly_stock_return = add_rank(monthly_stock_return)
monthly_stock_return.head(10)

Unnamed: 0,Stkcd,month,monthly_stock_return,market_value,total_assets,total_shareholders_equity,BM ratio,ROE,assets_increasing_rate,rf,...,ordered_OP,size-OP-rank,ordered_INV,size-INV-rank,ordered_BM_for_32,ordered_OP_for_32,size-BM-OP-rank,ordered_INV_for_32,size-BM-INV-rank,size-OP-INV-rank
0,1,2000-06,0.00722,28134990000.0,49732340000.0,3078513000.0,0.109419,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
1,1,2000-07,0.020961,28724690000.0,49732340000.0,3078513000.0,0.107173,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
2,1,2000-08,-0.041059,27545290000.0,49732340000.0,3078513000.0,0.111762,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
3,1,2000-09,-0.044508,26319330000.0,49732340000.0,3078513000.0,0.116968,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
4,1,2000-10,0.034786,27234920000.0,49732340000.0,3078513000.0,0.113036,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
5,1,2000-11,0.010403,30140790000.0,49732340000.0,3078513000.0,0.102138,0.0,,0.0225,...,1,51,5,55,1,1,B11,4,B14,B14
6,1,2000-12,-0.062623,28253340000.0,67227500000.0,4738884000.0,0.167728,0.106893,0.351786,0.0225,...,5,55,5,55,1,4,B14,4,B14,B44
7,1,2001-01,0.031679,29148420000.0,66006170000.0,3517551000.0,0.120677,0.131619,-0.018167,0.0225,...,5,55,1,51,1,4,B14,1,B11,B41
8,1,2001-02,-0.059411,27416630000.0,66006170000.0,3517551000.0,0.1283,0.131619,-0.018167,0.0225,...,5,55,1,51,1,4,B14,1,B11,B41
9,1,2001-03,0.151173,31561240000.0,66006170000.0,3517551000.0,0.111452,0.131619,-0.018167,0.0225,...,5,55,1,51,1,4,B14,1,B11,B41


In [124]:
# get portfolios' monthly value-weighted return
size_bm_excess_return3 = monthly_stock_return.groupby(['month', 'size-BM-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_op_excess_return3 = monthly_stock_return.groupby(['month', 'size-OP-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_inv_excess_return3 = monthly_stock_return.groupby(['month', 'size-INV-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_bm_op_excess_return3 = monthly_stock_return.groupby(['month', 'size-BM-OP-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_bm_inv_excess_return3 = monthly_stock_return.groupby(['month', 'size-BM-INV-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)
size_op_inv_excess_return3 = monthly_stock_return.groupby(['month', 'size-OP-INV-rank'], observed=False).apply(
    lambda x: np.average(x['monthly_stock_excess_return'], weights=x['market_value']),
    include_groups=False).unstack().fillna(0)

In [125]:
def regress(data, factors):
    X = factors
    X = sm.add_constant(X)
    X = X.values
    y = data
    model = sm.OLS(np.array(y.astype(float)), X.astype(float))
    result = model.fit()
    alpha = result.params[0]
    resid = result.resid
    return alpha, resid

In [126]:
# get grs stat sheet
grs_size_bm3 = get_grs_stat(size_bm_excess_return3, factors3)
grs_size_op3 = get_grs_stat(size_op_excess_return3, factors3)
grs_size_inv3 = get_grs_stat(size_inv_excess_return3, factors3)
grs_size_bm_op3 = get_grs_stat(size_bm_op_excess_return3, factors3)
grs_size_bm_inv3 = get_grs_stat(size_bm_inv_excess_return3, factors3)
grs_size_op_inv3 = get_grs_stat(size_op_inv_excess_return3, factors3)

In [127]:
grs_size_bm3.to_excel(r'output/part1/grs_size_bm3.xlsx', index=False)
grs_size_op3.to_excel(r'output/part1/grs_size_op3.xlsx', index=False)
grs_size_inv3.to_excel(r'output/part1/grs_size_inv3.xlsx', index=False)
grs_size_bm_op3.to_excel(r'output/part1/grs_size_bm_op3.xlsx', index=False)
grs_size_bm_inv3.to_excel(r'output/part1/grs_size_bm_inv3.xlsx', index=False)
grs_size_op_inv3.to_excel(r'output/part1/grs_size_op_inv3.xlsx', index=False)

In [128]:
grs_size_bm_op3

Unnamed: 0,GRS,p-value of GRS,A|a|,A|a|/A|re|,A(a^2)/A(re^2)
HML,6.866442,1.050271e-13,0.062207,-20.1949,407.833996
HML RMW,4.941705,2.786179e-09,0.054034,-17.541429,307.701732
HML CMA,4.860456,4.309321e-09,0.0463,-15.030655,225.920578
RMW CMA,7.220011,1.687539e-14,0.082977,-26.937548,725.631482
HML RMW CMA,4.10309,2.518969e-07,0.044111,-14.320267,205.070036


## 1.4.4 correlation of factors from different method

In [129]:
total_factors = pd.concat([factors, factors2, factors3], axis=1)
total_factors.drop(columns=['month'], inplace=True)
total_factors

Unnamed: 0,mkt_risk_premium,SMB,HML,RMW,CMA,mkt_risk_premium.1,SMB.1,HML.1,RMW.1,CMA.1,mkt_risk_premium.2,SMB.2,HML.2,RMW.2,CMA.2
0,-0.244185,-0.058018,0.116059,-0.070045,0.000000,-0.244185,-0.081768,-0.044754,0.094664,0.013446,-0.244185,-0.028615,-0.076993,0.090460,-0.050306
1,-0.143940,-0.006611,0.108703,-0.098777,0.000000,-0.143940,-0.044547,-0.029017,-0.056287,0.005308,-0.143940,-0.036576,-0.112698,-0.006016,-0.013065
2,-0.373996,0.081271,0.043078,0.042781,0.000000,-0.373996,-0.003033,0.006957,-0.087215,0.006912,-0.373996,-0.021874,-0.102720,-0.019879,-0.036382
3,-0.357240,-0.010071,0.016347,-0.013956,0.000000,-0.357240,-0.040069,-0.000798,-0.006462,-0.006897,-0.357240,-0.030465,-0.008400,-0.003071,-0.011564
4,-0.316809,-0.624086,-1.267026,-0.113386,-1.478481,-0.316809,-0.054145,0.038263,-0.085774,0.000998,-0.316809,-0.378867,0.349730,-0.366548,0.349594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,-0.333879,-0.007336,-0.018335,0.092623,0.001106,-0.333879,0.013698,-0.001242,0.023082,-0.017549,-0.333879,0.015360,0.018750,0.024529,0.024525
284,-0.269051,-0.024387,0.028789,0.136272,-0.014956,-0.269051,-0.000173,-0.005037,-0.005118,-0.005317,-0.269051,0.014000,-0.008200,-0.008091,-0.000784
285,-0.243563,0.045316,0.051779,-0.136988,0.018922,-0.243563,0.006769,-0.014778,-0.005274,-0.003649,-0.243563,0.030979,-0.027543,-0.011659,-0.016692
286,-0.280325,0.031085,-0.018057,-0.034845,0.018034,-0.280325,0.046824,0.011600,-0.002281,-0.001965,-0.280325,0.021457,-0.018297,-0.000391,-0.004592


In [130]:
total_factors.corr().to_excel(r'output/part1/total_factors_corr.xlsx')
# total_factors.var()