In [1]:
import warnings
import pandas as pd
import numpy as np
import statsmodels.api as sm 
from linearmodels import FamaMacBeth
from finance_byu.fama_macbeth import fama_macbeth, fama_macbeth_parallel, fm_summary, fama_macbeth_master
import statistics
import csv
import statsmodels.formula.api as smf

warnings.filterwarnings("ignore")

## First get the tickers of three groups of stocks 

In [2]:
benchmark = pd.read_csv('benchmark.csv')
benchmark.head()

Unnamed: 0,PERMNO,date,TICKER,COMNAM,PRC (price),VOL (share vol),SHROUT (number of shares outstanding),market cap
0,10104,31/12/2019,ORCL,ORACLE CORP,52.98,7095128,3207649,169941244
1,10107,31/12/2019,MSFT,MICROSOFT CORP,157.7,18362745,7611000,1200254700
2,10138,31/12/2019,TROW,T ROWE PRICE GROUP INC,121.84,894202,235214,28658474
3,10145,31/12/2019,HON,HONEYWELL INTERNATIONAL INC,177.0,1728875,714533,126472341
4,10516,31/12/2019,ADM,ARCHER DANIELS MIDLAND CO,46.35,1787220,556686,25802396


In [3]:
percentile_20 = np.percentile(benchmark['market cap'], 20)
median = statistics.median(benchmark['market cap'])

In [4]:
buckets = [0, percentile_20, median, max(benchmark['market cap'])]
bucket_name = ['Tiny', 'Medium', 'Large']
benchmark['size'] = pd.cut(benchmark['market cap'], buckets, labels=bucket_name)
benchmark_focused = benchmark[['TICKER', 'size']]
benchmark_focused

Unnamed: 0,TICKER,size
0,ORCL,Large
1,MSFT,Large
2,TROW,Large
3,HON,Large
4,ADM,Large
...,...,...
512,AVGO,Large
513,VRSK,Large
514,DG,Large
515,FTNT,Medium


In [5]:
tiny_cap = benchmark_focused.loc[benchmark_focused['size'] == 'Tiny']
tiny_cap = tiny_cap.drop(['size'], axis = 1)
tiny_cap.rename(columns = {'TICKER':'Tiny stocks'},inplace = True)
tiny_cap

Unnamed: 0,Tiny stocks
22,PBCT
30,NLSN
32,ITT
34,HII
37,FBHS
...,...
485,CF
487,UAA
494,HBI
498,IPGP


In [6]:
# Find the tickers of All-but-tiny stocks
All_but_tiny_cap = benchmark_focused[~benchmark_focused['TICKER'].isin(np.array(tiny_cap['Tiny stocks']))]
All_but_tiny_cap = All_but_tiny_cap.drop(['size'], axis = 1)
All_but_tiny_cap.rename(columns = {'TICKER':'All-but-tiny stocks'},inplace = True)
All_but_tiny_cap

Unnamed: 0,All-but-tiny stocks
0,ORCL
1,MSFT
2,TROW
3,HON
4,ADM
...,...
512,AVGO
513,VRSK
514,DG
515,FTNT


In [7]:
# Find the tickers of large stocks
large_cap = benchmark_focused.loc[benchmark_focused['size'] == 'Large']
large_cap = large_cap.drop(['size'],axis = 1)
large_cap.rename(columns = {'TICKER':'Large stocks'},inplace = True)
large_cap

Unnamed: 0,Large stocks
0,ORCL
1,MSFT
2,TROW
3,HON
4,ADM
...,...
508,V
510,UNH
512,AVGO
513,VRSK


## Implementing the Fama-MacBeth regressions

In [8]:
# create a csv file to store data
file = open('table 2.csv','w',encoding='utf-8',newline='')
csv_writer = csv.writer(file)

# Build List Header
csv_writer.writerow([' ',' ','All stocks',' ',' ','All-but-tiny stocks',' ',' ','Large stocks'])

57

1. Import the characteristics data:

In [9]:
charac_1 = pd.read_csv('charac_1.csv')
charac_1.head()

Unnamed: 0.1,Unnamed: 0,Ticker_x,datadate,LogSize,Return_2_12,LogBM,Return
0,0,AFL,2010-01-31,3.813764,0.0,-0.549697,4.7135
1,1,AFL,2010-02-28,3.836035,0.0,-0.547469,2.6843
2,2,AFL,2010-03-31,3.930654,0.0,-0.538007,9.7877
3,3,AFL,2010-04-30,3.867935,0.0,-0.544279,-6.1337
4,4,AFL,2010-05-31,3.728179,0.0,-0.558255,-12.5196


In [10]:
charac_2 = pd.read_csv('charac_2.csv')
charac_2.head()

Unnamed: 0.1,Unnamed: 0,Ticker_x,datadate,LogSize,Return_2_12,LogIssues,LogBM,Accruals,ROA,Return,LogAG
0,0,AES,2010-01-31,2.131983,0.0,-0.0,-0.664342,-0.031168,0.02271,-5.1089,0.0
1,1,AES,2010-02-28,2.055969,0.0,-0.0,-0.671943,-0.031168,0.02271,-7.4426,0.0
2,2,AES,2010-03-31,1.996891,0.0,-0.0,-0.677851,-0.031168,0.02271,-5.9025,0.0
3,3,AES,2010-04-30,2.216551,0.0,-0.0,-0.655885,-0.031168,0.02271,4.9091,0.0
4,4,AES,2010-05-31,2.100289,0.0,-0.0,-0.667511,-0.031168,0.02271,-11.0052,0.0


In [11]:
charac_3 = pd.read_csv('charac_3.csv')
charac_3.head()

Unnamed: 0.1,Unnamed: 0,Ticker_x,datadate,LogSize,Return_2_12,LogIssues,LogBM,Accruals,ROA,Return,LogAG,LogIssues_1y,Turnover,Debtprice,Salesprice
0,0,AES,1/31/2010,2.131983,0.0,0.0,-0.664342,-0.031168,0.02271,-5.1089,0.0,0.0,0.0,3.131224,6.941273
1,1,AES,2/28/2010,2.055969,0.0,0.0,-0.671943,-0.031168,0.02271,-7.4426,0.0,0.0,0.0,3.131224,6.941273
2,2,AES,3/31/2010,1.996891,0.0,0.0,-0.677851,-0.031168,0.02271,-5.9025,0.0,0.0,0.0,3.131224,6.941273
3,3,AES,4/30/2010,2.216551,0.0,0.0,-0.655885,-0.031168,0.02271,4.9091,0.0,0.0,0.0,3.131224,6.941273
4,4,AES,5/31/2010,2.100289,0.0,0.0,-0.667511,-0.031168,0.02271,-11.0052,0.0,0.0,0.0,3.131224,6.941273


In [12]:
charac_1['datadate'] = pd.to_datetime(charac_1['datadate'], format = '%Y-%m-%d')
charac_2['datadate'] = pd.to_datetime(charac_2['datadate'], format = '%Y-%m-%d')
charac_3['datadate'] = pd.to_datetime(charac_3['datadate'], format = '%m/%d/%Y')
for v in charac_1.columns[3:]:
    charac_1[v] = pd.to_numeric(charac_1[v], errors = 'coerce')
for v in charac_2.columns[3:]:
    charac_2[v] = pd.to_numeric(charac_2[v], errors = 'coerce')
for v in charac_3.columns[3:]:
    charac_3[v] = pd.to_numeric(charac_3[v], errors = 'coerce')

In [13]:
def get_ticker_code(characs):
    count = 0
    characs['Ticker_code'] = np.zeros(len(characs.index))
    for i in range(len(characs.index)-1):
        if characs['Ticker_x'][i+1] == characs['Ticker_x'][i]:
            characs['Ticker_code'][i+1] = count
        else:
            count +=1
            characs['Ticker_code'][i+1] = count
        characs['Ticker_code'].iloc[-1] = count

In [14]:
get_ticker_code(charac_1)
get_ticker_code(charac_2)
get_ticker_code(charac_3)

In [15]:
# divide each character DataFrame to three groups of stocks
Abt_1 = charac_1[charac_1['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))]
L_1 = charac_1[charac_1['Ticker_x'].isin(np.array(large_cap['Large stocks']))]
Abt_2 = charac_2[charac_2['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))]
L_2 = charac_2[charac_2['Ticker_x'].isin(np.array(large_cap['Large stocks']))]
Abt_3 = charac_3[charac_3['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))]
L_3 = charac_3[charac_3['Ticker_x'].isin(np.array(large_cap['Large stocks']))]

2. There are two ways to carry out Fama MacBeth Regression:

In [16]:
# method 1
def FamaMacBeth_(formula,time_label,df,lags):
    res = df.groupby(time_label,sort=True).apply(lambda x:smf.ols(formula,data=x).fit())
    p=pd.DataFrame([x.params for x in res],index=res.index)
    N=np.mean([x.nobs for x in res])
    r2=np.mean([x.rsquared for x in res])
    means={}
    for x in p.columns:
        if lags==0:
            means[x]=smf.ols(formula=x+'~1',
                            data=p[[x]]).fit(use_t=True)
        else:
            means[x]=smf.ols(formula=x+'~1',
                            data=p[[x]]).fit(cov_type='HAC',
                                             cov_kwds={'maxlags':lags},
                                             use_t=True)
    result=pd.DataFrame([
        [means[x].params['Intercept'],means[x].bse['Intercept'],
        means[x].tvalues['Intercept'],means[x].pvalues['Intercept']]
        for x in p.columns],index=p.columns, columns=['coef','stderr','tvalue','pvalue'])
    return result,r2

In [17]:
FamaMacBeth_('Return~0+LogSize+LogBM+Return_2_12','datadate',charac_1,lags=16)

(                 coef    stderr    tvalue        pvalue
 LogSize      0.276677  0.034017  8.133392  4.545738e-13
 LogBM       -0.779081  0.520775 -1.496004  1.373000e-01
 Return_2_12  0.613602  0.281016  2.183511  3.096052e-02,
 0.23247667779321482)

In [18]:
# method 2
result = fama_macbeth(charac_1,'datadate','Return',['LogSize','LogBM','Return_2_12'],intercept=False)
result.head()

Unnamed: 0_level_0,LogSize,LogBM,Return_2_12
datadate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-31,-0.241627,3.268249,0.0
2010-02-28,0.049318,-6.169193,0.0
2010-03-31,-0.202343,-12.788972,0.0
2010-04-30,-0.564012,-9.682236,0.0
2010-05-31,-0.343481,9.847025,0.0


In [19]:
fm_summary(result, pvalues=True)

Unnamed: 0,mean,std_error,tstat,pval
LogSize,0.276677,0.04265,6.487199,2.092018e-09
LogBM,-0.779081,0.665302,-1.171018,0.2439311
Return_2_12,0.613602,0.360269,1.703179,0.09114502


<b>Appropriate use of Econometrics/Statistics Methods</b>

To explain the linear algebra in our model 1 with constant value 

$$ X \equiv \begin{bmatrix} 1,x_1 \\1, x_2 \\ ... \\1, x_T \end{bmatrix}$$
$$ \hat{\beta} = (X'X)^{-1}X'y \equiv My$$

In [20]:
res = smf.ols('Return~LogSize+LogBM+Return_2_12', charac_1).fit(missing = 'drop')
res.summary()

0,1,2,3
Dep. Variable:,Return,R-squared:,0.007
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,106.1
Date:,"Mon, 10 May 2021",Prob (F-statistic):,1.8099999999999998e-68
Time:,14:53:23,Log-Likelihood:,-158940.0
No. Observations:,46359,AIC:,317900.0
Df Residuals:,46355,BIC:,317900.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.4383,0.254,21.421,0.000,4.941,5.936
LogSize,-0.0252,0.031,-0.823,0.410,-0.085,0.035
LogBM,6.7586,0.383,17.643,0.000,6.008,7.509
Return_2_12,-0.6389,0.128,-4.990,0.000,-0.890,-0.388

0,1,2,3
Omnibus:,6191.945,Durbin-Watson:,2.124
Prob(Omnibus):,0.0,Jarque-Bera (JB):,48836.994
Skew:,0.398,Prob(JB):,0.0
Kurtosis:,7.965,Cond. No.,43.6


In [21]:
# Linear algebra 
mkt = charac_1[['LogSize','LogBM','Return_2_12']]
x = np.c_[np.ones(len(mkt)),mkt]
y = charac_1[['Return']]
M = np.linalg.inv(x.T @ x) @ x.T 
print(M @ y)


# The outputs of slopes are same. 

     Return
0  5.438273
1 -0.025185
2  6.758620
3 -0.638878


3. Estimate the models under Fama-MacBeth regressions and get the Table 2 (True method):

- Model 1

In [22]:
csv_writer.writerow([' ','Slope','t-stat','R^2','Slope','t-stat','R^2','Slope','t-stat','R^2'])
csv_writer.writerow(['Model 1:','Three predictors'])

27

In [23]:
fmdata = charac_1.set_index(['Ticker_code','datadate'])
fm = FamaMacBeth(dependent = fmdata['Return'],
                 exog = fmdata[['LogSize','LogBM','Return_2_12']])
res_fm = fm.fit(debiased=False)
res_fm

0,1,2,3
Dep. Variable:,Return,R-squared:,0.0332
Estimator:,FamaMacBeth,R-squared (Between):,0.6668
No. Observations:,46359,R-squared (Within):,-0.0005
Date:,"Mon, May 10 2021",R-squared (Overall):,0.0332
Time:,14:53:23,Log-likelihood,-1.592e+05
Cov. Estimator:,Fama-MacBeth Standard Cov,,
,,F-statistic:,530.41
Entities:,476,P-value,0.0000
Avg Obs:,97.393,Distribution:,"F(3,46356)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
LogSize,0.2946,0.0449,6.5538,0.0000,0.2065,0.3826
LogBM,-0.5303,0.6708,-0.7906,0.4291,-1.8450,0.7843
Return_2_12,0.6818,0.3981,1.7126,0.0868,-0.0985,1.4620


In [24]:
# Use Newey West method to adjust
T = charac_1.shape[0]
L = int(np.ceil(4 * (T / 100) ** (2 / 9)))
L

16

In [25]:
# Here bandwidth is the number of lags, defaulted to the optimal. cov_type= 'kernel' means Newey West adjustment
res_fm = fm.fit(cov_type= 'kernel',debiased = False, bandwidth = 4)
res_fm

0,1,2,3
Dep. Variable:,Return,R-squared:,0.0332
Estimator:,FamaMacBeth,R-squared (Between):,0.6668
No. Observations:,46359,R-squared (Within):,-0.0005
Date:,"Mon, May 10 2021",R-squared (Overall):,0.0332
Time:,14:53:23,Log-likelihood,-1.592e+05
Cov. Estimator:,Fama-MacBeth Kernel Cov,,
,,F-statistic:,530.41
Entities:,476,P-value,0.0000
Avg Obs:,97.393,Distribution:,"F(3,46356)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
LogSize,0.2946,0.0378,7.7882,0.0000,0.2204,0.3687
LogBM,-0.5303,0.5939,-0.8931,0.3718,-1.6943,0.6336
Return_2_12,0.6818,0.3917,1.7406,0.0818,-0.0859,1.4495


In [26]:
def get_model1(charac):
    fmdata = charac.set_index(['Ticker_code','datadate'])
    fm = FamaMacBeth(dependent = fmdata['Return'],
                 exog = fmdata[['LogSize','LogBM','Return_2_12']])
    res_fm = fm.fit(cov_type= 'kernel',debiased = False, bandwidth = 4)
    return np.array([round(res_fm.params[0:3][0], 2),round(res_fm.tstats[0:3][0], 2),round(res_fm.rsquared, 3),
                     round(res_fm.params[0:3][1], 2),round(res_fm.tstats[0:3][1], 2),
                     round(res_fm.params[0:3][2], 2),round(res_fm.tstats[0:3][2], 2)])

In [27]:
csv_writer.writerow(['LogSize',get_model1(charac_1)[0],get_model1(charac_1)[1],get_model1(charac_1)[2],
                     get_model1(Abt_1)[0],get_model1(Abt_1)[1],get_model1(Abt_1)[2],
                     get_model1(L_1)[0],get_model1(L_1)[1],get_model1(L_1)[2]])
csv_writer.writerow(['LogB/M',get_model1(charac_1)[3],get_model1(charac_1)[4],' ',get_model1(Abt_1)[3],get_model1(Abt_1)[4],' ',
                     get_model1(L_1)[3],get_model1(L_1)[4],' '])
csv_writer.writerow(['Return_2_12',get_model1(charac_1)[5],get_model1(charac_1)[6],' ',get_model1(Abt_1)[5],get_model1(Abt_1)[6],' ',
                     get_model1(L_1)[5],get_model1(L_1)[6],' '])

49

In [28]:
csv_writer.writerow(['N',' ',charac_1.shape[0],' ',' ',Abt_1.shape[0],' ',' ',L_1.shape[0]])

31

- Model 2

In [29]:
csv_writer.writerow(['Model 2:',' Seven predictors'])

28

In [30]:
def get_model2(charac):
    fmdata = charac.set_index(['Ticker_code','datadate'])
    fm = FamaMacBeth(dependent = fmdata['Return'],
                 exog = fmdata[['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG']])
    res_fm = fm.fit(cov_type= 'kernel',debiased = False, bandwidth = 4)
    return np.array([round(res_fm.params[0:7][0], 2),round(res_fm.tstats[0:7][0], 2),round(res_fm.rsquared, 3),
                     round(res_fm.params[0:7][1], 2),round(res_fm.tstats[0:7][1], 2),
                     round(res_fm.params[0:7][2], 2),round(res_fm.tstats[0:7][2], 2),
                     round(res_fm.params[0:7][3], 2),round(res_fm.tstats[0:7][3], 2),
                     round(res_fm.params[0:7][4], 2),round(res_fm.tstats[0:7][4], 2),
                     round(res_fm.params[0:7][5], 2),round(res_fm.tstats[0:7][5], 2),
                     round(res_fm.params[0:7][6], 2),round(res_fm.tstats[0:7][6], 2)])
                               

In [31]:
csv_writer.writerow(['LogSize',get_model2(charac_2)[0],get_model2(charac_2)[1],get_model2(charac_2)[2],
                     get_model2(Abt_2)[0],get_model2(Abt_2)[1],get_model2(Abt_2)[2],
                     get_model2(L_2)[0],get_model2(L_2)[1],get_model2(L_2)[2]])
csv_writer.writerow(['LogB/M',get_model2(charac_2)[3],get_model2(charac_2)[4],' ', get_model2(Abt_2)[3],get_model2(Abt_2)[4],' ',
                     get_model2(L_2)[3],get_model2(L_2)[4],' '])
csv_writer.writerow(['Return_2_12',get_model2(charac_2)[5],get_model2(charac_2)[6],' ',get_model2(Abt_2)[5],get_model2(Abt_2)[6],' ',
                     get_model2(L_2)[5],get_model2(L_2)[6],' '])
csv_writer.writerow(['LogIssues_1_36',get_model2(charac_2)[7],get_model2(charac_2)[8],' ',get_model2(Abt_2)[7],get_model2(Abt_2)[8],' ',
                     get_model2(L_2)[7],get_model2(L_2)[8],' '])
csv_writer.writerow(['Accruals',get_model2(charac_2)[9],get_model2(charac_2)[10],' ',get_model2(Abt_2)[9],get_model2(Abt_2)[10],' ',
                     get_model2(L_2)[9],get_model2(L_2)[10],' '])
csv_writer.writerow(['ROA',get_model2(charac_2)[11],get_model2(charac_2)[12],' ',get_model2(Abt_2)[11],get_model2(Abt_2)[12],' ',
                     get_model2(L_2)[11],get_model2(L_2)[12],' '])
csv_writer.writerow(['LogAG',get_model2(charac_2)[12],get_model2(charac_2)[13],' ',get_model2(Abt_2)[12],get_model2(Abt_2)[13],' ',
                     get_model2(L_2)[12],get_model2(L_2)[13],' '])

43

In [32]:
csv_writer.writerow(['N',' ',charac_2.shape[0],' ',' ',Abt_2.shape[0],' ',' ',L_2.shape[0]])

31

- Model 3

In [33]:
csv_writer.writerow(['Model 3:',' Eleven predictors'])

29

In [34]:
def get_model3(charac):
    fmdata = charac.set_index(['Ticker_code','datadate'])
    fm = FamaMacBeth(dependent = fmdata['Return'],
                 exog = fmdata[['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG',\
                'LogIssues_1y','Turnover','Debtprice','Salesprice']])
    res_fm = fm.fit(cov_type= 'kernel',debiased = False, bandwidth = 4)
    return np.array([round(res_fm.params[0:13][0], 2),round(res_fm.tstats[0:13][0], 2),round(res_fm.rsquared, 3),
                     round(res_fm.params[0:13][1], 2),round(res_fm.tstats[0:13][1], 2),
                     round(res_fm.params[0:13][2], 2),round(res_fm.tstats[0:13][2], 2),
                     round(res_fm.params[0:13][3], 2),round(res_fm.tstats[0:13][3], 2),
                     round(res_fm.params[0:13][4], 2),round(res_fm.tstats[0:13][4], 2),
                     round(res_fm.params[0:13][5], 2),round(res_fm.tstats[0:13][5], 2),
                     round(res_fm.params[0:13][6], 2),round(res_fm.tstats[0:13][6], 2),
                     round(res_fm.params[0:13][7], 2),round(res_fm.tstats[0:13][7], 2),
                     round(res_fm.params[0:13][8], 2),round(res_fm.tstats[0:13][8], 2),
                     round(res_fm.params[0:13][9], 2),round(res_fm.tstats[0:13][9], 2),
                     round(res_fm.params[0:13][10], 2),round(res_fm.tstats[0:13][10], 2)])
                   

In [35]:
csv_writer.writerow(['LogSize',get_model3(charac_3)[0],get_model3(charac_3)[1],get_model3(charac_3)[2],
                     get_model3(Abt_3)[0],get_model3(Abt_3)[1],get_model3(Abt_3)[2],
                     get_model3(L_3)[0],get_model3(L_3)[1],get_model3(L_3)[2]])
csv_writer.writerow(['LogB/M',get_model3(charac_3)[3],get_model3(charac_3)[4],' ',get_model3(Abt_3)[3],get_model3(Abt_3)[4],' ',
                     get_model3(L_3)[3],get_model3(L_3)[4],' '])
csv_writer.writerow(['Return_2_12',get_model3(charac_3)[5],get_model3(charac_3)[6],' ',get_model3(Abt_3)[5],get_model3(Abt_3)[6],' ',
                     get_model3(L_3)[5],get_model3(L_3)[6],' '])
csv_writer.writerow(['LogIssues_1_36',get_model3(charac_3)[7],get_model3(charac_3)[8],' ',get_model3(Abt_3)[7],get_model3(Abt_3)[8],' ',
                     get_model3(L_3)[7],get_model3(L_3)[8],' '])
csv_writer.writerow(['Accruals',get_model3(charac_3)[9],get_model3(charac_3)[10],' ',get_model3(Abt_3)[9],get_model3(Abt_3)[10],' ',
                     get_model3(L_3)[9],get_model3(L_3)[10],' '])
csv_writer.writerow(['ROA',get_model3(charac_3)[11],get_model3(charac_3)[12],' ',get_model3(Abt_3)[11],get_model3(Abt_3)[12],' ',
                     get_model3(L_3)[11],get_model3(L_3)[12],' '])
csv_writer.writerow(['LogAG',get_model3(charac_3)[13],get_model3(charac_3)[14],' ',get_model3(Abt_3)[13],get_model3(Abt_3)[14],' ',
                     get_model3(L_3)[13],get_model3(L_3)[14],' '])
csv_writer.writerow(['LogIssues_1_12',get_model3(charac_3)[15],get_model3(charac_3)[16],' ',get_model3(Abt_3)[15],get_model3(Abt_3)[16],' ',
                     get_model3(L_3)[15],get_model3(L_3)[16],' '])
csv_writer.writerow(['Turnover_1_12',get_model3(charac_3)[17],get_model3(charac_3)[18],' ',get_model3(Abt_3)[17],get_model3(Abt_3)[18],' ',
                     get_model3(L_3)[17],get_model3(L_3)[18],' '])
csv_writer.writerow(['Debt/price',get_model3(charac_3)[19],get_model3(charac_3)[20],' ',get_model3(Abt_3)[19],get_model3(Abt_3)[20],' ',
                     get_model3(L_3)[19],get_model3(L_3)[20],' '])
csv_writer.writerow(['Sales/price',get_model3(charac_3)[21],get_model3(charac_3)[22],' ',get_model3(Abt_3)[21],get_model3(Abt_3)[22],' ',
                     get_model3(L_3)[21],get_model3(L_3)[22],' '])


49

In [36]:
csv_writer.writerow(['N',' ',charac_3.shape[0],' ',' ',Abt_3.shape[0],' ',' ',L_3.shape[0]])

31

In [37]:
file.close()

4.Show the results

In [38]:
Table_2 = pd.read_csv('table 2.csv', keep_default_na=False)
Table_2

Unnamed: 0,Unnamed: 1,.1,All stocks,.2,.3,All-but-tiny stocks,.4,.5,Large stocks
,Slope,t-stat,R^2,Slope,t-stat,R^2,Slope,t-stat,R^2
Model 1:,Three predictors,,,,,,,,
LogSize,0.29,7.79,0.033,0.3,6.99,0.038,0.39,7.72,0.04
LogB/M,-0.53,-0.89,,-0.48,-0.84,,0.34,0.59,
Return_2_12,0.68,1.74,,0.32,0.73,,0.42,0.98,
N,,46359,,,37505,,,23890,
Model 2:,Seven predictors,,,,,,,,
LogSize,0.31,5.9,0.036,0.31,5.7,0.042,0.38,5.73,0.045
LogB/M,-0.31,-0.45,,-0.43,-0.6,,0.19,0.26,
Return_2_12,0.59,1.66,,0.23,0.51,,0.42,0.93,


## Implementing the Fama-French three-factor model under Fama-MacBeth regressions

1. Import the libraries:

In [39]:
from pandas_datareader.famafrench import get_available_datasets
import pandas_datareader.data as web

2. Print available datasets (here only first 5):

In [40]:
get_available_datasets()[:5]

['F-F_Research_Data_Factors',
 'F-F_Research_Data_Factors_weekly',
 'F-F_Research_Data_Factors_daily',
 'F-F_Research_Data_5_Factors_2x3',
 'F-F_Research_Data_5_Factors_2x3_daily']

3. Download the selected dataset:

In [41]:
ff_dict = web.DataReader('F-F_Research_Data_Factors', 'famafrench', 
                         start='1964-05-01')

In [42]:
ff_dict.keys()

dict_keys([0, 1, 'DESCR'])

4. Inspect the description of the dataset

In [43]:
print(ff_dict['DESCR'])

F-F Research Data Factors
-------------------------

This file was created by CMPT_ME_BEME_RETS using the 202103 CRSP database. The 1-month TBill return is from Ibbotson and Associates, Inc. Copyright 2021 Kenneth R. French

  0 : (683 rows x 4 cols)
  1 : Annual Factors: January-December (57 rows x 4 cols)


5. View the monthly dataset:

In [44]:
factor_df = ff_dict[0]
factor_df = factor_df.apply(pd.to_numeric, 
                            errors='coerce') \
                     .div(100)
factor_df.head()

Unnamed: 0_level_0,Mkt-RF,SMB,HML,RF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1964-05,0.0142,-0.0094,0.0199,0.0026
1964-06,0.0127,-0.0027,0.0072,0.003
1964-07,0.0174,0.0026,0.0073,0.003
1964-08,-0.0144,0.0011,0.0007,0.0028
1964-09,0.0269,-0.0053,0.0172,0.0028


In [45]:
factor_df.iloc[:,0] = factor_df.iloc[:,0]+factor_df['RF']
factor_df.rename(columns={'Mkt-RF': 'MKT'}, inplace=True) 
factor_df.reset_index(inplace=True)
factor_df = factor_df.loc[(factor_df['Date'] >= '2010-1') & (factor_df['Date'] <= '2019-12')]
factor_df

Unnamed: 0,Date,MKT,SMB,HML,RF
548,2010-01,-0.0336,0.0037,0.0033,0.0000
549,2010-02,0.0340,0.0119,0.0319,0.0000
550,2010-03,0.0632,0.0144,0.0211,0.0001
551,2010-04,0.0201,0.0486,0.0291,0.0001
552,2010-05,-0.0788,0.0014,-0.0239,0.0001
...,...,...,...,...,...
663,2019-08,-0.0242,-0.0240,-0.0485,0.0016
664,2019-09,0.0161,-0.0105,0.0677,0.0018
665,2019-10,0.0221,0.0024,-0.0188,0.0015
666,2019-11,0.0399,0.0091,-0.0205,0.0012


6. Merger the dataset:

In [46]:
rtn = charac_3.pivot_table(index = 'datadate', 
                                columns = 'Ticker_x', 
                                values= 'Return')
rtn.dropna(axis='columns', thresh=120, inplace=True)
rtn

Ticker_x,AAP,ABT,AES,AJG,AKAM,ALB,ALGN,ALK,ALXN,AME,...,VRTX,VZ,WAB,WHR,WM,WMB,WST,XOM,ZBH,ZBRA
datadate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-31,-2.5445,-1.2039,-5.1089,0.1777,-2.5256,-1.7872,5.2189,-9.3171,-5.0184,-4.7071,...,-10.3851,-9.7646,-6.1459,-6.7940,-5.2056,-1.1385,-6.9133,-5.5140,-4.7200,-7.9365
2010-02-28,3.4220,2.5312,-7.4426,5.2328,6.4777,4.9552,-3.4667,11.6784,6.7932,7.1350,...,5.5729,-1.6655,-0.4696,12.5166,3.0265,3.3589,7.2117,1.5366,1.7933,9.4636
2010-03-31,2.8922,-2.9477,-5.9025,4.8040,19.4677,14.0838,6.8508,17.8000,9.7940,6.3525,...,0.8140,7.2243,10.4352,3.6716,5.2241,7.7530,7.7022,3.0462,3.2618,3.6052
2010-04-30,7.5859,-2.0501,4.9091,7.0061,23.5837,7.1077,-12.0993,0.4366,0.9380,4.3174,...,-5.1382,-5.3030,12.9630,24.7794,0.7261,2.2078,0.1430,1.1794,2.8885,-1.8581
2010-05-31,14.7672,-7.0367,-11.0052,-6.0145,2.2920,-5.6943,-12.2353,12.7747,-8.8375,-6.1272,...,-10.7815,-4.7751,-8.8693,-3.6741,-5.3489,-16.3490,-5.9737,-10.1372,-8.1760,-5.3356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-08-31,-8.4241,-2.0436,-8.6957,0.3096,1.1347,-15.3920,-12.4211,-5.1925,-11.0601,-4.1067,...,8.0423,5.2289,-10.7492,-3.5675,2.0085,-4.2208,5.9663,-6.7375,3.0119,-2.7787
2019-09-30,19.9420,-1.9339,6.5884,-0.7827,2.5244,13.2148,-1.1960,8.6906,-2.7987,7.0173,...,-5.8882,3.7827,3.8289,13.8543,-3.2153,3.5593,-2.5024,3.1104,-1.2141,0.6536
2019-10-31,-1.7654,0.3107,5.1805,1.8421,-5.3403,-12.6295,39.4484,6.9635,7.6169,-0.1851,...,15.3819,1.2011,-3.4651,-3.9404,-2.4261,-7.2735,1.5372,-4.3053,0.6993,15.2638
2019-11-30,-3.3235,2.2007,10.9091,2.2473,0.7168,7.6391,9.9291,-0.1008,8.1025,8.0306,...,13.4387,-0.3804,13.4352,-5.1407,0.6238,1.8377,2.2177,2.1163,5.1002,5.4946


In [47]:
# Set specific firms
aim_stocks = np.array(rtn.columns)
# Set specific time period
filtered_charac = charac_1[charac_1['Ticker_x'].isin(aim_stocks)] #.sort_values(['Ticker_x','datadate'])
filtered_charac = filtered_charac.drop_duplicates(subset=['LogSize','Return'],keep='last')
filtered_charac = filtered_charac.reset_index(drop=True).reset_index()
filtered_charac

Unnamed: 0.1,index,Unnamed: 0,Ticker_x,datadate,LogSize,Return_2_12,LogBM,Return,Ticker_code
0,0,120,AES,2010-01-31,2.131983,0.000000,-0.664342,-5.1089,1.0
1,1,121,AES,2010-02-28,2.055969,0.000000,-0.671943,-7.4426,1.0
2,2,122,AES,2010-03-31,1.996891,0.000000,-0.677851,-5.9025,1.0
3,3,123,AES,2010-04-30,2.216551,0.000000,-0.655885,4.9091,1.0
4,4,124,AES,2010-05-31,2.100289,0.000000,-0.667511,-11.0052,1.0
...,...,...,...,...,...,...,...,...,...
18955,18955,46354,RCL,2019-08-31,3.084339,-0.011176,-0.632188,-10.3662,475.0
18956,18956,46355,RCL,2019-09-30,3.122442,-0.104664,-0.628378,4.6318,475.0
18957,18957,46356,RCL,2019-10-31,3.127381,-0.004297,-0.627884,0.4616,475.0
18958,18958,46357,RCL,2019-11-30,3.225252,-0.041921,-0.618097,10.2821,475.0


In [48]:
factors = pd.DataFrame()
for i in range(158):
    a = factor_df
    d = pd.DataFrame(a)
    factors = factors.append([d]) 
factors = factors.reset_index(drop=True).reset_index()
factors

Unnamed: 0,index,Date,MKT,SMB,HML,RF
0,0,2010-01,-0.0336,0.0037,0.0033,0.0000
1,1,2010-02,0.0340,0.0119,0.0319,0.0000
2,2,2010-03,0.0632,0.0144,0.0211,0.0001
3,3,2010-04,0.0201,0.0486,0.0291,0.0001
4,4,2010-05,-0.0788,0.0014,-0.0239,0.0001
...,...,...,...,...,...,...
18955,18955,2019-08,-0.0242,-0.0240,-0.0485,0.0016
18956,18956,2019-09,0.0161,-0.0105,0.0677,0.0018
18957,18957,2019-10,0.0221,0.0024,-0.0188,0.0015
18958,18958,2019-11,0.0399,0.0091,-0.0205,0.0012


In [49]:
FF3 = pd.merge(filtered_charac,factors,on=['index'])
Abt = FF3[FF3['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))]
L = FF3[FF3['Ticker_x'].isin(np.array(large_cap['Large stocks']))]

7. Estimate the three-factor model under Fama-MacBeth regressions:

In [50]:
def get_Fama_French_3(charac):
    coeffs = []
    result = fama_macbeth(charac,'datadate','Return',['MKT','SMB','HML'])
    params = fm_summary(result, pvalues=True) 
    return params

In [51]:
print('All stocks')
get_Fama_French_3(FF3)

All stocks


Unnamed: 0,mean,std_error,tstat,pval
intercept,1.463552,0.37543,3.898335,0.000160682
MKT,0.165048,0.022704,7.26947,4.129978e-11
SMB,0.040379,0.009456,4.270308,3.948147e-05
HML,0.008435,0.008626,0.977831,0.3301416


In [52]:
print('All-but-tiny stocks')
get_Fama_French_3(Abt)

All-but-tiny stocks


Unnamed: 0,mean,std_error,tstat,pval
intercept,1.468435,0.359563,4.083941,8.061017e-05
MKT,0.158621,0.021591,7.346739,2.777562e-11
SMB,0.036317,0.009151,3.968792,0.0001239774
HML,0.005856,0.008177,0.716158,0.4752967


In [53]:
print('Large stocks')
get_Fama_French_3(L)

Large stocks


Unnamed: 0,mean,std_error,tstat,pval
intercept,1.470262,0.336854,4.364687,2.728965e-05
MKT,0.147907,0.019633,7.533467,1.058749e-11
SMB,0.031218,0.008499,3.67326,0.0003602031
HML,0.00185,0.007387,0.250483,0.8026454


## Rolling n-factor model

In [54]:
filtered_charac_1 = charac_1[charac_1['Ticker_x'].isin(aim_stocks)].sort_values(by='datadate')
filtered_charac_2 = charac_2[charac_2['Ticker_x'].isin(aim_stocks)].sort_values(by='datadate')
filtered_charac_3 = charac_3[charac_3['Ticker_x'].isin(aim_stocks)].sort_values(by='datadate')
Abt_1_ = filtered_charac_1[filtered_charac_1['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))].sort_values(by='datadate')
L_1_ = filtered_charac_1[charac_1['Ticker_x'].isin(np.array(large_cap['Large stocks']))].sort_values(by='datadate')
Abt_2_ = filtered_charac_2[filtered_charac_2['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))].sort_values(by='datadate')
L_2_ = filtered_charac_2[filtered_charac_2['Ticker_x'].isin(np.array(large_cap['Large stocks']))].sort_values(by='datadate')
Abt_3_ = filtered_charac_3[filtered_charac_3['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))].sort_values(by='datadate')
L_3_ = filtered_charac_3[filtered_charac_3['Ticker_x'].isin(np.array(large_cap['Large stocks']))].sort_values(by='datadate')

In [55]:
# formula1 = 'Return~0+LogSize+LogBM+Return_2_12'
# formula2 = 'Return~0+LogSize+LogBM+Return_2_12+LogIssues+Accruals+ROA+LogAG'
# formula3 = 'Return~0+LogSize+LogBM+Return_2_12+LogIssues+Accruals+ROA+LogAG+DY+Return_13_36+LogIssues_1y+Turnover+Debtprice+Salesprice'
MODEL1_VARIABLE = ['LogSize','LogBM','Return_2_12']
MODEL2_VARIABLE = ['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG']
MODEL3_VARIABLE = ['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG',\
                'LogIssues_1y','Turnover','Debtprice','Salesprice']

In [56]:
def get_model1_rolling_data(input_data,window_size):
    
    coeffs = []
    N = int(input_data.shape[0]/120)
  
    for time in range(96): 
        start_index = time*N
        end_index = start_index + window_size*N
        data = input_data.iloc[start_index:end_index]
  

        # define and fit the regression model 
        
#         fm = FamaMacBeth(dependent = fmdata['Return'],exog = fmdata[['LogSize','LogBM','Return_2_12']])
#         ff_model = fm.fit(cov_type= 'kernel',debiased = False)#  , bandwidth = 4
#         params = FamaMacBeth_(VARIABLE,data,lags=4)[0].iloc[:,0]
        result = fama_macbeth(data,'datadate','Return',['LogSize','LogBM','Return_2_12'],intercept=False)
        params = fm_summary(result).iloc[:,0] 

        
        # store coefficients
        coeffs.append(params)
    
    coeffs_df = pd.DataFrame(coeffs)
    #index=input_data.datadate[window_size - 1:]index=input_data.index[window_size*N - 1:]

    return coeffs_df

In [57]:
def get_model2_rolling_data(input_data,window_size):
    
    coeffs = []
    N = int(input_data.shape[0]/120)

    for time in range(96): 
        start_index = time*N
        end_index = start_index + window_size*N
        data = input_data.iloc[start_index:end_index]

        # define and fit the regression model 
        
        result = fama_macbeth(data,'datadate','Return',['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG'],intercept=False)
        params = fm_summary(result).iloc[:,0] 
      
        # store coefficients
        coeffs.append(params)
    
    coeffs_df = pd.DataFrame(coeffs)
    #index=input_data.datadate[window_size - 1:]index=input_data.index[window_size*N - 1:]

    return coeffs_df

In [58]:
def get_model3_rolling_data(input_data,window_size):
    
    coeffs = []
    N = int(input_data.shape[0]/120)

    for time in range(96): 
        start_index = time*N
        end_index = start_index + window_size*N
        data = input_data.iloc[start_index:end_index]

        # define and fit the regression model 
        result = fama_macbeth(data,'datadate','Return',['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG',\
                'LogIssues_1y','Turnover','Debtprice','Salesprice'],intercept=False)
        params = fm_summary(result).iloc[:,0] 

        # store coefficients
        coeffs.append(params)
    
    coeffs_df = pd.DataFrame(coeffs)
    #index=input_data.datadate[window_size - 1:]index=input_data.index[window_size*N - 1:]

    return coeffs_df

In [59]:
all_roll_1 = get_model1_rolling_data(filtered_charac_1,24)
all_roll_2 = get_model2_rolling_data(filtered_charac_2,24)
all_roll_3 = get_model3_rolling_data(filtered_charac_3,24)
abt_roll_1 = get_model1_rolling_data(Abt_1_,24)
abt_roll_2 = get_model2_rolling_data(Abt_2_,24)
abt_roll_3 = get_model3_rolling_data(Abt_3_,24)
L_roll_1 = get_model1_rolling_data(L_1_,24)
L_roll_2 = get_model2_rolling_data(L_2_,24)
L_roll_3 = get_model3_rolling_data(L_3_,24)

In [60]:
def get_rolling_return1(row,result):
    
    data = row.groupby('datadate').agg([np.mean]).iloc[24:]
    Return = []
    agg = 0
    for c in range(len(MODEL1_VARIABLE)):
        one = np.multiply(np.array(data.iloc[:,c+1]),np.array(result.iloc[:,c]))
        agg += one
    Return.append(agg)
    df = pd.DataFrame(Return).T
    df = df.rename(columns={0:'expected_return'})
    return df

In [61]:
def get_rolling_return2(row,result):
    data = row.groupby('datadate').agg([np.mean]).iloc[24:]
    Return = []
    agg = 0
    for c in range(len(MODEL2_VARIABLE)):
        one = np.multiply(np.array(data.iloc[:,c+1]),np.array(result.iloc[:,c]))
        agg += one
    Return.append(agg)
    df = pd.DataFrame(Return).T
    df = df.rename(columns={0:'expected_return'})
    return df

In [62]:
def get_rolling_return3(row,result):
    data = row.groupby('datadate').agg([np.mean]).iloc[24:]
    Return = []
    agg = 0
    for c in range(len(MODEL3_VARIABLE)):
        one = np.multiply(np.array(data.iloc[:,c+1]),np.array(result.iloc[:,c]))
        agg += one
    Return.append(agg)
    df = pd.DataFrame(Return).T
    df = df.rename(columns={0:'expected_return'})
    return df

In [63]:
all_roll_1_R = get_rolling_return1(filtered_charac_1,all_roll_1)
all_roll_2_R = get_rolling_return2(filtered_charac_2,all_roll_2)
all_roll_3_R = get_rolling_return3(filtered_charac_3,all_roll_3)
abt_roll_1_R = get_rolling_return1(Abt_1_,abt_roll_1)
abt_roll_2_R = get_rolling_return2(Abt_2_,abt_roll_2)
abt_roll_3_R = get_rolling_return3(Abt_3_,abt_roll_3)
L_roll_1_R = get_rolling_return1(L_1_,L_roll_1)
L_roll_2_R = get_rolling_return2(L_2_,L_roll_2)
L_roll_3_R = get_rolling_return3(L_3_,L_roll_3)

In [64]:
all_roll_3

Unnamed: 0,LogSize,LogBM,Return_2_12,LogIssues,Accruals,ROA,LogAG,LogIssues_1y,Turnover,Debtprice,Salesprice
mean,0.269213,-1.517663,-0.384553,8.356741e-15,1.489081,2.688972,-0.589056,-0.101425,-0.024661,-0.366918,-0.004249
mean,0.250106,-2.249123,-0.999619,8.319734e-15,1.360380,4.796972,-0.589056,-0.233899,-0.333936,-0.329411,-0.020970
mean,0.226158,-2.227827,-1.119916,6.654399e-15,0.871981,5.078233,-0.424358,-0.231061,-0.242259,-0.268672,-0.029288
mean,0.253685,-1.805446,-0.881786,6.728414e-15,0.940698,5.098024,-0.136003,-0.221595,-0.235839,-0.317759,-0.034121
mean,0.307979,-1.413454,-0.401208,6.900730e-15,0.536256,4.694256,0.090452,-0.178805,-0.454219,-0.434447,-0.041880
...,...,...,...,...,...,...,...,...,...,...,...
mean,0.487779,-0.302498,0.276668,1.873402e+00,-0.583067,-1.385417,1.911528,0.134975,-2.741928,-0.898903,-0.042490
mean,0.508222,-0.006412,0.367028,1.297719e+00,-0.511500,-1.645258,1.770093,0.181900,-2.573533,-0.960596,-0.020496
mean,0.437379,-0.051796,0.097333,8.369628e-01,-0.013096,-0.912917,2.275557,0.233413,-1.996378,-0.784981,-0.030561
mean,0.503129,0.423075,-0.652569,6.678268e-01,-0.124318,-0.424226,2.498691,0.320574,-2.553427,-0.846552,0.060450


In [65]:
filtered_charac_3

Unnamed: 0.1,Unnamed: 0,Ticker_x,datadate,LogSize,Return_2_12,LogIssues,LogBM,Accruals,ROA,Return,LogAG,LogIssues_1y,Turnover,Debtprice,Salesprice,Ticker_code
0,0,AES,2010-01-31,2.131983,0.000000,0.000000,-0.664342,-0.031168,0.022710,-5.1089,0.000000,0.000000,0.000000,3.131224,6.941273,0.0
24382,24382,PWR,2010-01-31,1.337917,0.000000,0.000000,-0.678343,0.002672,0.035833,-12.5720,0.000000,0.000000,0.000000,0.226984,3.663472,262.0
3911,3911,BIIB,2010-01-31,2.692646,0.000000,0.000000,-0.590076,0.002892,0.111038,0.4486,0.000000,0.000000,0.000000,0.163879,1.169742,41.0
24502,24502,DGX,2010-01-31,2.321172,0.000000,0.000000,-0.598121,-0.005047,0.088982,-7.6350,0.000000,0.000000,0.000000,0.485533,3.199152,263.0
24714,24714,REGN,2010-01-31,0.750436,0.000000,0.000000,-0.551832,0.000469,-0.095892,10.2564,0.000000,0.000000,0.000000,0.191309,0.625514,265.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20722,20722,TAP,2019-12-31,2.431589,-0.061254,-0.007886,-0.707287,-0.018063,0.008531,6.7750,-0.042401,-0.029607,0.042387,1.299312,3.620521,221.0
20602,20602,MHK,2019-12-31,2.279092,0.225889,0.034890,-0.672297,-0.008321,0.055620,-2.1454,0.021715,0.353484,0.039887,0.538452,4.082504,220.0
20207,20207,MTD,2019-12-31,2.961117,0.246402,0.074436,-0.308100,-0.000835,0.201163,10.2681,0.063064,0.279781,0.012008,0.123762,0.628837,216.0
21895,21895,NSC,2019-12-31,3.924323,0.217066,0.113005,-0.570367,-0.048804,0.071777,0.3256,0.045422,0.435264,0.045243,0.454171,0.902469,235.0


## Cumulative n-factor model

In [66]:
def get_cumulative_data1(input_data,window_size):
    
    coeffs = []
    N = int(input_data.shape[0]/120)
    start_index = 0
  
    for time in range(96): 

        end_index = window_size*N+N*time
        data = input_data.iloc[start_index:end_index]

        # define and fit the regression model 
        result = fama_macbeth(data,'datadate','Return',['LogSize','LogBM','Return_2_12'],intercept=False)
        params = fm_summary(result).iloc[:,0] 

        
        # store coefficients
        coeffs.append(params)
    
    coeffs_df = pd.DataFrame(coeffs)
    #index=input_data.datadate[window_size - 1:]index=input_data.index[window_size*N - 1:]

    return coeffs_df

In [67]:
def get_cumulative_data2(input_data,window_size):
    
    coeffs = []
    N = int(input_data.shape[0]/120)
    start_index = 0
  
    for time in range(96): 

        end_index = window_size*N+N*time
        data = input_data.iloc[start_index:end_index]

        # define and fit the regression model 

        result = fama_macbeth(data,'datadate','Return',['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG'],intercept=False)
        params = fm_summary(result).iloc[:,0] 

        
        # store coefficients
        coeffs.append(params)
    
    coeffs_df = pd.DataFrame(coeffs)
    #index=input_data.datadate[window_size - 1:]index=input_data.index[window_size*N - 1:]

    return coeffs_df

In [68]:
def get_cumulative_data3(input_data,window_size):
    
    coeffs = []
    N = int(input_data.shape[0]/120)
    start_index = 0
  
    for time in range(96): 

        end_index = window_size*N+N*time
        data = input_data.iloc[start_index:end_index]

        # define and fit the regression model 
        result = fama_macbeth(data,'datadate','Return',['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG',\
                'LogIssues_1y','Turnover','Debtprice','Salesprice'],intercept=False)
        params = fm_summary(result).iloc[:,0] 

        
        # store coefficients
        coeffs.append(params)
    
    coeffs_df = pd.DataFrame(coeffs)
    #index=input_data.datadate[window_size - 1:]index=input_data.index[window_size*N - 1:]

    return coeffs_df

In [69]:
def get_cumulative_return1(row,window_size):
    result = get_cumulative_data1(row,window_size)
    data = row.groupby('datadate').agg([np.mean]).iloc[24:]
    Return = []
    agg = 0
    for c in range(len(MODEL1_VARIABLE)):
        one = np.multiply(np.array(data.iloc[:,c+1]),np.array(result.iloc[:,c]))
        agg += one
    Return.append(agg)
    df = pd.DataFrame(Return).T
    df = df.rename(columns={0:'expected_return'})
    return df

In [70]:
def get_cumulative_return2(row,window_size):
    result = get_cumulative_data2(row,window_size)
    data = row.groupby('datadate').agg([np.mean]).iloc[24:]
    Return = []
    agg = 0
    for c in range(len(MODEL2_VARIABLE)):
        one = np.multiply(np.array(data.iloc[:,c+1]),np.array(result.iloc[:,c]))
        agg += one
    Return.append(agg)
    df = pd.DataFrame(Return).T
    df = df.rename(columns={0:'expected_return'})
    return df

In [71]:
def get_cumulative_return3(row,window_size):
    result = get_cumulative_data3(row,window_size)
    data = row.groupby('datadate').agg([np.mean]).iloc[24:]
    Return = []
    agg = 0
    for c in range(len(MODEL3_VARIABLE)):
        one = np.multiply(np.array(data.iloc[:,c+1]),np.array(result.iloc[:,c]))
        agg += one
    Return.append(agg)
    df = pd.DataFrame(Return).T
    df = df.rename(columns={0:'expected_return'})
    return df

In [72]:
all_cumu_1_R = get_cumulative_return1(filtered_charac_1,24)
all_cumu_2_R = get_cumulative_return2(filtered_charac_2,24)
all_cumu_3_R = get_cumulative_return3(filtered_charac_3,24)
abt_cumu_1_R = get_cumulative_return1(Abt_1_,24)
abt_cumu_2_R = get_cumulative_return2(Abt_2_,24)
abt_cumu_3_R = get_cumulative_return3(Abt_3_,24)
L_cumu_1_R = get_cumulative_return1(L_1_,24)
L_cumu_2_R = get_cumulative_return2(L_2_,24)
L_cumu_3_R = get_cumulative_return3(L_3_,24)

In [73]:
L_cumu_3_R

Unnamed: 0,expected_return
0,-7.408273
1,-1.364230
2,-0.889196
3,1.037600
4,1.664866
...,...
91,-0.440843
92,3.351417
93,3.799608
94,6.079061


## Predictive ability analysis

In [74]:
def get_Predictive_ability(row_return,expected_return_estimates,window):
    Predictive = pd.DataFrame()
    data = row_return[['datadate','Return']].groupby('datadate').agg([np.mean]).iloc[24:]
    data = data.reset_index(drop=True).reset_index()
    expected_return_estimates = expected_return_estimates.reset_index(drop=True).reset_index()
    fmdata = pd.merge(data,expected_return_estimates,on=['index'])
    fmdata = fmdata.rename(columns={('Return', 'mean'):'Return'})

    res_fm = smf.ols('Return~0+expected_return', fmdata).fit(missing = 'drop')
    Predictive['Slope'] = res_fm.params.round(2)
    Predictive['S.E.'] = res_fm.bse.round(2)
    Predictive['t-stat'] = res_fm.tvalues.round(2)
    Predictive['R^2'] = res_fm.rsquared.round(3)
    #Predictive.rename(index={'return':row_return.name},inplace=True)
    
    return Predictive

In [75]:
Predictive_ability_rolling = pd.concat([get_Predictive_ability(filtered_charac_1,all_roll_1_R,24),
                                        get_Predictive_ability(filtered_charac_2,all_roll_2_R,24),
                                        get_Predictive_ability(filtered_charac_3,all_roll_3_R,24),
                                        get_Predictive_ability(Abt_1_,abt_roll_1_R,24),
                                        get_Predictive_ability(Abt_2_,abt_roll_2_R,24),
                                        get_Predictive_ability(Abt_3_,abt_roll_3_R,24),
                                        get_Predictive_ability(L_1_,L_roll_1_R,24),
                                        get_Predictive_ability(L_2_,L_roll_2_R,24),
                                        get_Predictive_ability(L_3_,L_roll_3_R,24)])

In [76]:
Predictive_ability_cumulative = pd.concat([get_Predictive_ability(filtered_charac_1,all_cumu_1_R,24),
                                        get_Predictive_ability(filtered_charac_2,all_cumu_2_R,24),
                                        get_Predictive_ability(filtered_charac_3,all_cumu_3_R,24),
                                        get_Predictive_ability(Abt_1_,abt_cumu_1_R,24),
                                        get_Predictive_ability(Abt_2_,abt_cumu_2_R,24),
                                        get_Predictive_ability(Abt_3_,abt_cumu_3_R,24),
                                        get_Predictive_ability(L_1_,L_cumu_1_R,24),
                                        get_Predictive_ability(L_2_,L_cumu_2_R,24),
                                        get_Predictive_ability(L_3_,L_cumu_3_R,24)])

## Write the outcomes to table 3

In [77]:
properties_1 = pd.DataFrame()
properties_1['FM estimate'] = np.array(['All stocks','All stocks','All stocks','All-but-tiny stocks','All-but-tiny stocks',
                                   'All-but-tiny stocks','Large stocks','Large stocks','Large stocks'])
properties_1['Model'] = np.array(['Model 1','Model 2','Model 3','Mode 1','Mode 2','Mode 3','Model 1','Model 2','Model 3'])


properties_1['Avg'] = np.zeros(9)
properties_1['Std'] = np.zeros(9)
properties_1['p10'] = np.zeros(9)
properties_1['p90'] = np.zeros(9)
properties_1['slopes'] = np.array(['Rolling','Rolling','Rolling','Rolling','Rolling','Rolling','Rolling','Rolling','Rolling'])

for i,v in enumerate([all_roll_1_R,all_roll_2_R,all_roll_3_R,abt_roll_1_R,abt_roll_2_R,abt_roll_3_R,L_roll_1_R,L_roll_2_R,L_roll_3_R]):
    properties_1['Avg'][i] = v['expected_return'].mean().round(2)
    properties_1['Std'][i] = v['expected_return'].std().round(2)
    properties_1['p10'][i] = np.nanpercentile(v['expected_return'], 10).round(2)
    properties_1['p90'][i] = np.nanpercentile(v['expected_return'], 90).round(2)

In [78]:
properties_2 = pd.DataFrame()
properties_2['FM estimate'] = np.array(['All stocks','All stocks','All stocks','All-but-tiny stocks','All-but-tiny stocks',
                                   'All-but-tiny stocks','Large stocks','Large stocks','Large stocks'])
properties_2['Model'] = np.array(['Model 1','Model 2','Model 3','Mode 1','Mode 2','Mode 3','Model 1','Model 2','Model 3'])


properties_2['Avg'] = np.zeros(9)
properties_2['Std'] = np.zeros(9)
properties_2['p10'] = np.zeros(9)
properties_2['p90'] = np.zeros(9)
properties_2['slopes'] = np.array(['Cumulative','Cumulative','Cumulative','Cumulative','Cumulative','Cumulative',
                                   'Cumulative','Cumulative','Cumulative'])

for i,v in enumerate([all_cumu_1_R,all_cumu_2_R,all_cumu_3_R,abt_cumu_1_R,abt_cumu_2_R,abt_cumu_3_R,L_cumu_1_R,L_cumu_2_R,L_cumu_3_R]):
    properties_2['Avg'][i] = v['expected_return'].mean().round(2)
    properties_2['Std'][i] = v['expected_return'].std().round(2)
    properties_2['p10'][i] = np.nanpercentile(v['expected_return'], 10).round(2)
    properties_2['p90'][i] = np.nanpercentile(v['expected_return'], 90).round(2)

In [79]:
properties = pd.concat([properties_1,properties_2])
properties['index'] = ['12','13','14','18','19','20','24','25','26','15','16','17','21','22','23','27','28','29']
properties

Unnamed: 0,FM estimate,Model,Avg,Std,p10,p90,slopes,index
0,All stocks,Model 1,0.24,0.69,-0.82,0.98,Rolling,12
1,All stocks,Model 2,2.55,6.87,-4.77,9.92,Rolling,13
2,All stocks,Model 3,2.03,7.06,-4.73,9.63,Rolling,14
3,All-but-tiny stocks,Mode 1,0.51,0.79,-0.67,1.38,Rolling,18
4,All-but-tiny stocks,Mode 2,1.98,5.53,-3.41,8.7,Rolling,19
5,All-but-tiny stocks,Mode 3,1.96,6.16,-3.43,8.78,Rolling,20
6,Large stocks,Model 1,1.22,1.26,-0.67,2.69,Rolling,24
7,Large stocks,Model 2,2.16,5.61,-2.85,9.54,Rolling,25
8,Large stocks,Model 3,2.42,8.53,-3.13,10.78,Rolling,26
0,All stocks,Model 1,0.28,0.29,-0.05,0.54,Cumulative,15


In [80]:
predictive = pd.concat([Predictive_ability_rolling,Predictive_ability_cumulative])
predictive['index'] = ['12','13','14','18','19','20','24','25','26','15','16','17','21','22','23','27','28','29']
predictive

Unnamed: 0,Slope,S.E.,t-stat,R^2,index
expected_return,1.0,0.54,1.83,0.034,12
expected_return,0.45,0.03,15.09,0.706,13
expected_return,0.43,0.03,12.88,0.636,14
expected_return,1.19,0.4,2.98,0.085,18
expected_return,0.51,0.04,12.23,0.611,19
expected_return,0.43,0.04,10.51,0.538,20
expected_return,0.67,0.21,3.26,0.1,24
expected_return,0.4,0.05,8.3,0.42,25
expected_return,0.27,0.03,8.06,0.406,26
expected_return,3.2,0.95,3.36,0.106,15


In [81]:
table_3 = pd.merge(properties,predictive,on=['index'])
table_3.sort_values(by = 'index',inplace=True)
table_3.drop(columns=['index'],inplace=True)
table_3.set_index(['FM estimate','slopes'],inplace=True)
table_3

Unnamed: 0_level_0,Unnamed: 1_level_0,Model,Avg,Std,p10,p90,Slope,S.E.,t-stat,R^2
FM estimate,slopes,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
All stocks,Rolling,Model 1,0.24,0.69,-0.82,0.98,1.0,0.54,1.83,0.034
All stocks,Rolling,Model 2,2.55,6.87,-4.77,9.92,0.45,0.03,15.09,0.706
All stocks,Rolling,Model 3,2.03,7.06,-4.73,9.63,0.43,0.03,12.88,0.636
All stocks,Cumulative,Model 1,0.28,0.29,-0.05,0.54,3.2,0.95,3.36,0.106
All stocks,Cumulative,Model 2,2.09,4.68,-3.16,6.73,0.71,0.03,22.85,0.846
All stocks,Cumulative,Model 3,1.8,4.29,-3.15,5.99,0.76,0.04,20.35,0.813
All-but-tiny stocks,Rolling,Mode 1,0.51,0.79,-0.67,1.38,1.19,0.4,2.98,0.085
All-but-tiny stocks,Rolling,Mode 2,1.98,5.53,-3.41,8.7,0.51,0.04,12.23,0.611
All-but-tiny stocks,Rolling,Mode 3,1.96,6.16,-3.43,8.78,0.43,0.04,10.51,0.538
All-but-tiny stocks,Cumulative,Mode 1,0.56,0.29,0.21,0.93,2.38,0.57,4.2,0.156


In [82]:
# save it
# table_3.to_csv('table 3.csv')