In [1]:
import warnings
import pandas as pd
import numpy as np
import statsmodels.api as sm 
from linearmodels import FamaMacBeth
from finance_byu.fama_macbeth import fama_macbeth, fama_macbeth_parallel, fm_summary, fama_macbeth_master
import statistics
import csv
import statsmodels.formula.api as smf

warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'linearmodels'

In [None]:
!conda install linearmodels

## First get the tickers of three groups of stocks 

In [None]:
# Load the benchmark universe; later cells read its 'market cap' and 'TICKER' columns.
benchmark = pd.read_csv('benchmark1.csv')
benchmark.head()

In [None]:
# Size breakpoints: the 20th percentile and the median of market capitalisation.
percentile_20 = np.percentile(benchmark['market cap'], 20)
median = statistics.median(benchmark['market cap'])

In [None]:
# Bin edges for the three size classes. pd.cut uses right-closed intervals by
# default, so the classes are (0, p20], (p20, median], (median, max]; a market
# cap of exactly 0 (or NaN) receives no label.
buckets = [0, percentile_20, median, max(benchmark['market cap'])]
bucket_name = ['Tiny', 'Medium', 'Large']
# Adds the 'size' label column to `benchmark` in place.
benchmark['size'] = pd.cut(benchmark['market cap'], buckets, labels=bucket_name)
# Keep only the ticker and its size class for the group look-ups below.
benchmark_focused = benchmark[['TICKER', 'size']]
benchmark_focused

In [None]:
# Tickers labelled 'Tiny', returned as a one-column frame named 'Tiny stocks'.
tiny_cap = (
    benchmark_focused
    .loc[benchmark_focused['size'] == 'Tiny']
    .drop(columns=['size'])
    .rename(columns={'TICKER': 'Tiny stocks'})
)
tiny_cap

In [None]:
# All-but-tiny stocks are selected by exclusion: every benchmark ticker that is
# not in `tiny_cap` is kept (including tickers that received no size label).
_is_tiny = benchmark_focused['TICKER'].isin(np.array(tiny_cap['Tiny stocks']))
All_but_tiny_cap = (
    benchmark_focused[~_is_tiny]
    .drop(columns=['size'])
    .rename(columns={'TICKER': 'All-but-tiny stocks'})
)
All_but_tiny_cap

In [None]:
# Tickers labelled 'Large', returned as a one-column frame named 'Large stocks'.
large_cap = (
    benchmark_focused
    .loc[benchmark_focused['size'] == 'Large']
    .drop(columns=['size'])
    .rename(columns={'TICKER': 'Large stocks'})
)
large_cap

## Implementing the Fama-MacBeth regressions

In [None]:
# Create the CSV file that collects the Table 2 rows.
# The handle stays open across the following cells and is closed by the
# `file.close()` cell once every row has been written.
file = open('table 2.csv','w',encoding='utf-8',newline='')
csv_writer = csv.writer(file)

# Header row. It must have ten fields, like the Slope / t-stat / R^2 rows written
# below: with only nine, pd.read_csv treats the first column as an implicit
# index and every group label ends up one column to the right.
csv_writer.writerow([' ',' ','All stocks',' ',' ','All-but-tiny stocks',' ',' ','Large stocks',' '])

1. Import the characteristics data:

In [None]:
# Characteristics panel used for Model 1 (three predictors).
charac_1 = pd.read_csv('charac_1.csv')
charac_1.head()

In [None]:
# Characteristics panel used for Model 2 (seven predictors).
charac_2 = pd.read_csv('charac_2.csv')
charac_2.head()

In [None]:
# Characteristics panel used for Model 3 (thirteen predictors).
charac_3 = pd.read_csv('charac_3.csv')
charac_3.head()

In [None]:
# Normalise dtypes on all three panels: parse 'datadate' as a date, then coerce
# every column from the fourth onwards to numeric (unparseable values become NaN).
for panel in (charac_1, charac_2, charac_3):
    panel['datadate'] = pd.to_datetime(panel['datadate'], format = '%Y-%m-%d')
    for column in panel.columns[3:]:
        panel[column] = pd.to_numeric(panel[column], errors = 'coerce')

In [None]:
def get_ticker_code(characs):
    """Add a numeric 'Ticker_code' column identifying runs of equal tickers.

    The code starts at 0.0 on the first row and increases by one every time
    'Ticker_x' differs from the value on the previous row, so rows must already
    be grouped by ticker for the code to identify one stock.

    Parameters
    ----------
    characs : pandas.DataFrame
        Frame with a 'Ticker_x' column. It is modified in place.

    Returns
    -------
    None
    """
    tickers = characs['Ticker_x']
    # True wherever a new run of tickers begins (always True on the first row).
    starts_new_run = tickers.ne(tickers.shift())
    # A single column assignment replaces the row-by-row chained assignment,
    # which is slow and does not write through under pandas copy-on-write.
    characs['Ticker_code'] = (starts_new_run.cumsum() - 1).astype(float)

In [None]:
# Add 'Ticker_code' to each panel in place; it is the entity level of the panel
# index built later with set_index(['Ticker_code','datadate']).
get_ticker_code(charac_1)
get_ticker_code(charac_2)
get_ticker_code(charac_3)

In [None]:
# divide each character DataFrame to three groups of stocks
Abt_1 = charac_1[charac_1['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))]
L_1 = charac_1[charac_1['Ticker_x'].isin(np.array(large_cap['Large stocks']))]
Abt_2 = charac_2[charac_2['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))]
L_2 = charac_2[charac_2['Ticker_x'].isin(np.array(large_cap['Large stocks']))]
Abt_3 = charac_3[charac_3['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))]
L_3 = charac_3[charac_3['Ticker_x'].isin(np.array(large_cap['Large stocks']))]

2. There are two ways to carry out a Fama-MacBeth regression:

In [None]:
# method 1
def FamaMacBeth_(formula,time_label,df,lags):
    """Fama-MacBeth estimate built from one OLS fit per period.

    Parameters
    ----------
    formula : str
        statsmodels/patsy formula fitted separately within every period.
    time_label : str
        Column of ``df`` that identifies the period.
    df : pandas.DataFrame
        Panel holding the variables named in ``formula``.
    lags : int
        Maximum lag for the HAC covariance of the mean slopes. ``0`` uses the
        default (non-robust) OLS covariance.

    Returns
    -------
    (pandas.DataFrame, float)
        A table indexed by coefficient name with columns
        'coef', 'stderr', 'tvalue', 'pvalue', and the average of the
        per-period R^2 values.
    """
    # First pass: one cross-sectional regression per period.
    period_fits = df.groupby(time_label,sort=True).apply(lambda g: smf.ols(formula,data=g).fit())
    slopes = pd.DataFrame([fit.params for fit in period_fits],index=period_fits.index)
    r2 = np.mean([fit.rsquared for fit in period_fits])

    # Second pass: regress each slope series on a constant, so the intercept is
    # the time-series mean and its standard error is the Fama-MacBeth one.
    fit_options = {'use_t': True}
    if lags != 0:
        fit_options.update(cov_type='HAC', cov_kwds={'maxlags': lags})

    rows = []
    for name in slopes.columns:
        mean_fit = smf.ols(formula=name+'~1', data=slopes[[name]]).fit(**fit_options)
        rows.append([mean_fit.params['Intercept'], mean_fit.bse['Intercept'],
                     mean_fit.tvalues['Intercept'], mean_fit.pvalues['Intercept']])

    result = pd.DataFrame(rows, index=slopes.columns,
                          columns=['coef','stderr','tvalue','pvalue'])
    return result,r2

In [None]:
# Method 1 on the Model 1 panel: no intercept ('0+' in the formula), HAC with 16 lags.
FamaMacBeth_('Return~0+LogSize+LogBM+Return_2_12','datadate',charac_1,lags=16)

In [None]:
# method 2
result = fama_macbeth(charac_1,'datadate','Return',['LogSize','LogBM','Return_2_12'],intercept=False)
result.head()

In [None]:
# Summarise the method 2 output, including p-values.
fm_summary(result, pvalues=True)

<b>Appropriate use of Econometrics/Statistics Methods</b>

To explain the linear algebra behind Model 1 when a constant term is included:

$$ X \equiv \begin{bmatrix} 1 & x_1 \\ 1 & x_2 \\ \vdots & \vdots \\ 1 & x_T \end{bmatrix}$$
$$ \hat{\beta} = (X'X)^{-1}X'y \equiv My$$

In [None]:
# Pooled OLS with an intercept. `missing` is a model-construction option, so it
# is passed to `ols(...)` rather than to `.fit()`, where it has no effect on
# how rows with missing values are handled.
res = smf.ols('Return~LogSize+LogBM+Return_2_12', charac_1, missing='drop').fit()
res.summary()

In [None]:
# Linear algebra
# Reproduce the pooled OLS coefficients with the normal equations.
# Rows with a missing value in any model variable are dropped first, as the OLS
# fit does; otherwise one NaN makes every entry of X'X (and the result) NaN.
complete_rows = charac_1[['Return','LogSize','LogBM','Return_2_12']].dropna()
mkt = complete_rows[['LogSize','LogBM','Return_2_12']]
x = np.c_[np.ones(len(mkt)),mkt]        # prepend the constant column
y = complete_rows[['Return']]
M = np.linalg.inv(x.T @ x) @ x.T        # M = (X'X)^{-1} X'
print(M @ y)


# The slope estimates from both approaches are the same.

3. Estimate the models under Fama-MacBeth regressions and get the Table 2 (True method):

- Model 1

In [None]:
# Column sub-headers (Slope / t-stat / R^2 for each of the three samples),
# followed by the Model 1 section title.
csv_writer.writerow([' ','Slope','t-stat','R^2','Slope','t-stat','R^2','Slope','t-stat','R^2'])
csv_writer.writerow(['Model 1:','Three predictors'])

In [None]:
# Index the panel by (entity, time) and fit Model 1 with linearmodels' FamaMacBeth.
# No constant column is added to `exog`, so the model has no intercept.
fmdata = charac_1.set_index(['Ticker_code','datadate'])
fm = FamaMacBeth(dependent = fmdata['Return'],
                 exog = fmdata[['LogSize','LogBM','Return_2_12']])
# First fit: default covariance estimator.
res_fm = fm.fit(debiased=False)
res_fm

In [None]:
# Use Newey West method to adjust
T = charac_1.shape[0]
L = int(np.ceil(4 * (T / 100) ** (2 / 9)))
L

In [None]:
# Refit with cov_type='kernel' (the Newey-West adjustment); `bandwidth` is the
# number of lags and is fixed at 4 here instead of being left to its default.
res_fm = fm.fit(cov_type= 'kernel',debiased = False, bandwidth = 4)
res_fm

In [None]:
def get_model1(charac):
    """Fit Model 1 (LogSize, LogBM, Return_2_12) and return its Table 2 numbers.

    Parameters
    ----------
    charac : pandas.DataFrame
        Panel with 'Ticker_code', 'datadate', 'Return' and the three predictors.

    Returns
    -------
    numpy.ndarray
        Seven values laid out as
        ``[slope_0, t_0, R^2, slope_1, t_1, slope_2, t_2]``; slopes and
        t-statistics are rounded to 2 decimals and R^2 to 3.
    """
    predictors = ['LogSize','LogBM','Return_2_12']
    panel = charac.set_index(['Ticker_code','datadate'])
    fitted = FamaMacBeth(dependent = panel['Return'],
                         exog = panel[predictors]).fit(cov_type= 'kernel',debiased = False, bandwidth = 4)

    # `.iloc` is explicit positional access; integer `[]` lookups on these
    # label-indexed Series are deprecated in recent pandas.
    slopes = fitted.params.iloc[:len(predictors)].round(2).to_numpy()
    tstats = fitted.tstats.iloc[:len(predictors)].round(2).to_numpy()

    # R^2 sits right after the first slope/t-stat pair because it is printed
    # only on the first row of the table.
    values = [slopes[0], tstats[0], round(fitted.rsquared, 3)]
    for slope, tstat in zip(slopes[1:], tstats[1:]):
        values.extend([slope, tstat])
    return np.array(values)

In [None]:
# Fit Model 1 once per sample instead of once per table cell (21 fits before).
m1_all, m1_abt, m1_large = get_model1(charac_1), get_model1(Abt_1), get_model1(L_1)

# The first row also carries R^2, stored at position 2 of each result array.
csv_writer.writerow(['LogSize',m1_all[0],m1_all[1],m1_all[2],
                     m1_abt[0],m1_abt[1],m1_abt[2],
                     m1_large[0],m1_large[1],m1_large[2]])
# Remaining rows: (label, position of the slope); the t-stat follows the slope.
for label, slope_pos in [('LogB/M', 3), ('Return_2_12', 5)]:
    csv_writer.writerow([label,
                         m1_all[slope_pos],m1_all[slope_pos+1],' ',
                         m1_abt[slope_pos],m1_abt[slope_pos+1],' ',
                         m1_large[slope_pos],m1_large[slope_pos+1],' '])

In [None]:
# Sample size row: number of panel rows (firm-dates) in each Model 1 sample.
csv_writer.writerow(['N',' ',charac_1.shape[0],' ',' ',Abt_1.shape[0],' ',' ',L_1.shape[0]])

- Model 2

In [None]:
# Section title row for Model 2.
csv_writer.writerow(['Model 2:',' Seven predictors'])

In [None]:
def get_model2(charac):
    """Fit Model 2 (seven predictors) and return its Table 2 numbers.

    Parameters
    ----------
    charac : pandas.DataFrame
        Panel with 'Ticker_code', 'datadate', 'Return' and the seven predictors.

    Returns
    -------
    numpy.ndarray
        Fifteen values laid out as
        ``[slope_0, t_0, R^2, slope_1, t_1, ..., slope_6, t_6]``, so for
        predictor ``k >= 1`` the slope is at ``2*k + 1`` and the t-stat at
        ``2*k + 2``. Slopes and t-statistics are rounded to 2 decimals, R^2 to 3.
    """
    predictors = ['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG']
    panel = charac.set_index(['Ticker_code','datadate'])
    fitted = FamaMacBeth(dependent = panel['Return'],
                         exog = panel[predictors]).fit(cov_type= 'kernel',debiased = False, bandwidth = 4)

    # `.iloc` is explicit positional access; integer `[]` lookups on these
    # label-indexed Series are deprecated in recent pandas.
    slopes = fitted.params.iloc[:len(predictors)].round(2).to_numpy()
    tstats = fitted.tstats.iloc[:len(predictors)].round(2).to_numpy()

    # R^2 sits right after the first slope/t-stat pair because it is printed
    # only on the first row of the table.
    values = [slopes[0], tstats[0], round(fitted.rsquared, 3)]
    for slope, tstat in zip(slopes[1:], tstats[1:]):
        values.extend([slope, tstat])
    return np.array(values)
                               

In [None]:
# Fit Model 2 once per sample instead of once per table cell.
m2_all, m2_abt, m2_large = get_model2(charac_2), get_model2(Abt_2), get_model2(L_2)

# The first row also carries R^2, stored at position 2 of each result array.
csv_writer.writerow(['LogSize',m2_all[0],m2_all[1],m2_all[2],
                     m2_abt[0],m2_abt[1],m2_abt[2],
                     m2_large[0],m2_large[1],m2_large[2]])
# Remaining rows: (label, position of the slope); the t-stat follows the slope.
# LogAG lives at 13/14. Using 12/13 would print the ROA t-stat as the LogAG
# slope and the LogAG slope as its t-stat.
for label, slope_pos in [('LogB/M', 3), ('Return_2_12', 5), ('LogIssues_1_36', 7),
                         ('Accruals', 9), ('ROA', 11), ('LogAG', 13)]:
    csv_writer.writerow([label,
                         m2_all[slope_pos],m2_all[slope_pos+1],' ',
                         m2_abt[slope_pos],m2_abt[slope_pos+1],' ',
                         m2_large[slope_pos],m2_large[slope_pos+1],' '])

In [None]:
# Sample size row: number of panel rows (firm-dates) in each Model 2 sample.
csv_writer.writerow(['N',' ',charac_2.shape[0],' ',' ',Abt_2.shape[0],' ',' ',L_2.shape[0]])

- Model 3

In [None]:
# Section title row for Model 3.
csv_writer.writerow(['Model 3:',' Thirteen predictors'])

In [None]:
def get_model3(charac):
    """Fit Model 3 (thirteen predictors) and return its Table 2 numbers.

    Parameters
    ----------
    charac : pandas.DataFrame
        Panel with 'Ticker_code', 'datadate', 'Return' and the thirteen predictors.

    Returns
    -------
    numpy.ndarray
        Twenty-seven values laid out as
        ``[slope_0, t_0, R^2, slope_1, t_1, ..., slope_12, t_12]``, so for
        predictor ``k >= 1`` the slope is at ``2*k + 1`` and the t-stat at
        ``2*k + 2``. Slopes and t-statistics are rounded to 2 decimals, R^2 to 3.
    """
    predictors = ['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG','DY',
                  'Return_13_36','LogIssues_1y','Turnover','Debtprice','Salesprice']
    panel = charac.set_index(['Ticker_code','datadate'])
    fitted = FamaMacBeth(dependent = panel['Return'],
                         exog = panel[predictors]).fit(cov_type= 'kernel',debiased = False, bandwidth = 4)

    # `.iloc` is explicit positional access; integer `[]` lookups on these
    # label-indexed Series are deprecated in recent pandas.
    slopes = fitted.params.iloc[:len(predictors)].round(2).to_numpy()
    tstats = fitted.tstats.iloc[:len(predictors)].round(2).to_numpy()

    # R^2 sits right after the first slope/t-stat pair because it is printed
    # only on the first row of the table.
    values = [slopes[0], tstats[0], round(fitted.rsquared, 3)]
    for slope, tstat in zip(slopes[1:], tstats[1:]):
        values.extend([slope, tstat])
    return np.array(values)
                   

In [None]:
# Fit Model 3 once per sample instead of once per table cell.
m3_all, m3_abt, m3_large = get_model3(charac_3), get_model3(Abt_3), get_model3(L_3)

# The first row also carries R^2, stored at position 2 of each result array.
csv_writer.writerow(['LogSize',m3_all[0],m3_all[1],m3_all[2],
                     m3_abt[0],m3_abt[1],m3_abt[2],
                     m3_large[0],m3_large[1],m3_large[2]])
# Remaining rows: (label, position of the slope); the t-stat follows the slope.
for label, slope_pos in [('LogB/M', 3), ('Return_2_12', 5), ('LogIssues_1_36', 7),
                         ('Accruals', 9), ('ROA', 11), ('LogAG', 13), ('DY', 15),
                         ('LogReturn_13_36', 17), ('LogIssues_1_12', 19),
                         ('Turnover_1_12', 21), ('Debt/price', 23), ('Sales/price', 25)]:
    csv_writer.writerow([label,
                         m3_all[slope_pos],m3_all[slope_pos+1],' ',
                         m3_abt[slope_pos],m3_abt[slope_pos+1],' ',
                         m3_large[slope_pos],m3_large[slope_pos+1],' '])

In [None]:
# Sample size row: number of panel rows (firm-dates) in each Model 3 sample.
csv_writer.writerow(['N',' ',charac_3.shape[0],' ',' ',Abt_3.shape[0],' ',' ',L_3.shape[0]])

In [None]:
# Close 'table 2.csv' so all rows are flushed to disk before it is read back.
file.close()

4. Show the results

In [None]:
# Read the table back; keep_default_na=False keeps the blank filler cells as
# text instead of converting them to NaN.
Table_2 = pd.read_csv('table 2.csv', keep_default_na=False)
Table_2

## Implementing the Fama-French three-factor model under Fama-MacBeth regressions

1. Import the libraries:

In [None]:
from pandas_datareader.famafrench import get_available_datasets
import pandas_datareader.data as web

2. Print available datasets (here only first 5):

In [None]:
# Show the first five dataset names offered by the Fama-French reader.
get_available_datasets()[:5]

3. Download the selected dataset:

In [None]:
# Download the factor dataset from 1964-05-01 onwards (requires network access).
ff_dict = web.DataReader('F-F_Research_Data_Factors', 'famafrench', 
                         start='1964-05-01')

In [None]:
# The reader returns a dict; list its keys (0 and 'DESCR' are used below).
ff_dict.keys()

4. Inspect the description of the dataset

In [None]:
# Print the dataset's own description text.
print(ff_dict['DESCR'])

5. View the monthly dataset:

In [None]:
# Take the table stored under key 0, coerce every column to numeric
# (unparseable values become NaN) and rescale by 1/100.
factor_df = ff_dict[0].apply(pd.to_numeric, errors='coerce') / 100
factor_df.head()

In [None]:
# Add RF to the first column, which is then renamed from 'Mkt-RF' to 'MKT'
# (assumes 'Mkt-RF' is the first column — TODO confirm against the dataset).
# NOTE: this cell is not safe to re-run: after reset_index the former index
# becomes column 0, so a second run would add RF to the wrong column.
factor_df.iloc[:,0] = factor_df.iloc[:,0]+factor_df['RF']
factor_df.rename(columns={'Mkt-RF': 'MKT'}, inplace=True) 
# Move the date index into a regular 'Date' column so it can be filtered on.
factor_df.reset_index(inplace=True)
# Keep January 2010 through December 2019.
factor_df = factor_df.loc[(factor_df['Date'] >= '2010-1') & (factor_df['Date'] <= '2019-12')]
factor_df

6. Merge the datasets:

In [None]:
# Wide return matrix (dates x tickers) built from the Model 3 panel, keeping
# only tickers with at least 120 non-missing returns.
rtn = (
    charac_3
    .pivot_table(index = 'datadate', columns = 'Ticker_x', values = 'Return')
    .dropna(axis='columns', thresh=120)
)
rtn

In [None]:
# Set specific firms
aim_stocks = np.array(rtn.columns)
# Set specific time period
filtered_charac = charac_1[charac_1['Ticker_x'].isin(aim_stocks)] #.sort_values(['Ticker_x','datadate'])
filtered_charac = filtered_charac.drop_duplicates(subset=['LogSize','Return'],keep='last')
filtered_charac = filtered_charac.reset_index(drop=True).reset_index()
filtered_charac

In [None]:
# Stack 158 copies of the monthly factor table and expose the row number as an
# 'index' column, the merge key used in the next cell.
# NOTE(review): 158 is presumably the number of stocks in `aim_stocks` — confirm.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0, and
# builds the result in one pass instead of re-copying it on every iteration.
factors = pd.concat([factor_df] * 158, ignore_index=True).reset_index()
factors

In [None]:
# Join characteristics and factors row by row through the positional 'index' key.
FF3 = pd.merge(filtered_charac,factors,on=['index'])
# Size sub-samples of the merged panel.
Abt = FF3[FF3['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))]
# NOTE: this rebinds `L`, which earlier held the Newey-West lag length.
L = FF3[FF3['Ticker_x'].isin(np.array(large_cap['Large stocks']))]

7. Estimate the three-factor model under Fama-MacBeth regressions:

In [None]:
def get_Fama_French_3(charac):
    """Run finance_byu's Fama-MacBeth procedure of 'Return' on MKT, SMB and HML.

    Parameters
    ----------
    charac : pandas.DataFrame
        Panel with 'datadate', 'Return', 'MKT', 'SMB' and 'HML' columns.

    Returns
    -------
    The ``fm_summary`` table (with p-values) for the fitted regressions.
    """
    result = fama_macbeth(charac,'datadate','Return',['MKT','SMB','HML'])
    return fm_summary(result, pvalues=True)

In [None]:
# Three-factor summary for the full merged sample.
print('All stocks')
get_Fama_French_3(FF3)

In [None]:
# Three-factor summary for the all-but-tiny sub-sample.
print('All-but-tiny stocks')
get_Fama_French_3(Abt)

In [None]:
# Three-factor summary for the large-stock sub-sample.
print('Large stocks')
get_Fama_French_3(L)

## Rolling n-factor model

In [None]:
# Date-sorted panels restricted to the tickers in `aim_stocks`. The rolling and
# cumulative estimators below slice these frames by row position.
filtered_charac_1 = charac_1[charac_1['Ticker_x'].isin(aim_stocks)].sort_values(by='datadate')
filtered_charac_2 = charac_2[charac_2['Ticker_x'].isin(aim_stocks)].sort_values(by='datadate')
filtered_charac_3 = charac_3[charac_3['Ticker_x'].isin(aim_stocks)].sort_values(by='datadate')

# Size sub-samples of each filtered panel. Every mask is built from the frame
# it filters; masking filtered_charac_1 with a Series derived from charac_1
# relied on implicit index re-alignment.
Abt_1_ = filtered_charac_1[filtered_charac_1['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))].sort_values(by='datadate')
L_1_ = filtered_charac_1[filtered_charac_1['Ticker_x'].isin(np.array(large_cap['Large stocks']))].sort_values(by='datadate')
Abt_2_ = filtered_charac_2[filtered_charac_2['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))].sort_values(by='datadate')
L_2_ = filtered_charac_2[filtered_charac_2['Ticker_x'].isin(np.array(large_cap['Large stocks']))].sort_values(by='datadate')
Abt_3_ = filtered_charac_3[filtered_charac_3['Ticker_x'].isin(np.array(All_but_tiny_cap['All-but-tiny stocks']))].sort_values(by='datadate')
L_3_ = filtered_charac_3[filtered_charac_3['Ticker_x'].isin(np.array(large_cap['Large stocks']))].sort_values(by='datadate')

In [None]:
# Predictor sets of the three nested models: each model extends the previous
# one. (As regression formulas: 'Return~0+' followed by the names joined by '+'.)
MODEL1_VARIABLE = ['LogSize','LogBM','Return_2_12']
MODEL2_VARIABLE = MODEL1_VARIABLE + ['LogIssues','Accruals','ROA','LogAG']
MODEL3_VARIABLE = MODEL2_VARIABLE + ['DY','Return_13_36','LogIssues_1y',
                                     'Turnover','Debtprice','Salesprice']

In [None]:
def get_model1_rolling_data(input_data,window_size,n_periods=120):
    """Rolling-window Fama-MacBeth slopes for Model 1.

    Windows are cut by row position, so this assumes ``input_data`` is sorted
    by 'datadate' and holds the same number of rows for each of the
    ``n_periods`` dates.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Panel with 'datadate', 'Return', 'LogSize', 'LogBM' and 'Return_2_12'.
    window_size : int
        Number of periods in each estimation window.
    n_periods : int, default 120
        Number of periods covered by ``input_data``. It was previously
        hard-coded, together with the resulting 96 windows for a 24-period window.

    Returns
    -------
    pandas.DataFrame
        One row per window (``n_periods - window_size`` rows); each row is the
        first column of the ``fm_summary`` table for that window.
    """
    rows_per_period = int(input_data.shape[0]/n_periods)
    window_slopes = []

    for first_period in range(n_periods - window_size):
        start_index = first_period*rows_per_period
        end_index = start_index + window_size*rows_per_period
        window = input_data.iloc[start_index:end_index]

        # Fit the no-intercept model on this window and keep its summary column.
        result = fama_macbeth(window,'datadate','Return',['LogSize','LogBM','Return_2_12'],intercept=False)
        window_slopes.append(fm_summary(result).iloc[:,0])

    return pd.DataFrame(window_slopes)

In [None]:
def get_model2_rolling_data(input_data,window_size,n_periods=120):
    """Rolling-window Fama-MacBeth slopes for Model 2.

    Windows are cut by row position, so this assumes ``input_data`` is sorted
    by 'datadate' and holds the same number of rows for each of the
    ``n_periods`` dates.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Panel with 'datadate', 'Return' and the seven Model 2 predictors.
    window_size : int
        Number of periods in each estimation window.
    n_periods : int, default 120
        Number of periods covered by ``input_data``. It was previously
        hard-coded, together with the resulting 96 windows for a 24-period window.

    Returns
    -------
    pandas.DataFrame
        One row per window (``n_periods - window_size`` rows); each row is the
        first column of the ``fm_summary`` table for that window.
    """
    predictors = ['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG']
    rows_per_period = int(input_data.shape[0]/n_periods)
    window_slopes = []

    for first_period in range(n_periods - window_size):
        start_index = first_period*rows_per_period
        end_index = start_index + window_size*rows_per_period
        window = input_data.iloc[start_index:end_index]

        # Fit the no-intercept model on this window and keep its summary column.
        result = fama_macbeth(window,'datadate','Return',predictors,intercept=False)
        window_slopes.append(fm_summary(result).iloc[:,0])

    return pd.DataFrame(window_slopes)

In [None]:
def get_model3_rolling_data(input_data,window_size,n_periods=120):
    """Rolling-window Fama-MacBeth slopes for Model 3.

    Windows are cut by row position, so this assumes ``input_data`` is sorted
    by 'datadate' and holds the same number of rows for each of the
    ``n_periods`` dates.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Panel with 'datadate', 'Return' and the thirteen Model 3 predictors.
    window_size : int
        Number of periods in each estimation window.
    n_periods : int, default 120
        Number of periods covered by ``input_data``. It was previously
        hard-coded, together with the resulting 96 windows for a 24-period window.

    Returns
    -------
    pandas.DataFrame
        One row per window (``n_periods - window_size`` rows); each row is the
        first column of the ``fm_summary`` table for that window.
    """
    predictors = ['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG','DY',
                  'Return_13_36','LogIssues_1y','Turnover','Debtprice','Salesprice']
    rows_per_period = int(input_data.shape[0]/n_periods)
    window_slopes = []

    for first_period in range(n_periods - window_size):
        start_index = first_period*rows_per_period
        end_index = start_index + window_size*rows_per_period
        window = input_data.iloc[start_index:end_index]

        # Fit the no-intercept model on this window and keep its summary column.
        result = fama_macbeth(window,'datadate','Return',predictors,intercept=False)
        window_slopes.append(fm_summary(result).iloc[:,0])

    return pd.DataFrame(window_slopes)

In [None]:
# Rolling slopes with a 24-period window for every model (1-3) and sample
# (all stocks, all-but-tiny, large).
all_roll_1 = get_model1_rolling_data(filtered_charac_1,24)
all_roll_2 = get_model2_rolling_data(filtered_charac_2,24)
all_roll_3 = get_model3_rolling_data(filtered_charac_3,24)
abt_roll_1 = get_model1_rolling_data(Abt_1_,24)
abt_roll_2 = get_model2_rolling_data(Abt_2_,24)
abt_roll_3 = get_model3_rolling_data(Abt_3_,24)
L_roll_1 = get_model1_rolling_data(L_1_,24)
L_roll_2 = get_model2_rolling_data(L_2_,24)
L_roll_3 = get_model3_rolling_data(L_3_,24)

In [None]:
def get_rolling_return1(row,result,window_size=24):
    """Expected returns implied by the Model 1 slopes.

    For every date after the first ``window_size`` dates, the cross-sectional
    mean of each predictor is multiplied by the matching slope in the same row
    of ``result`` and the products are summed over predictors.

    Parameters
    ----------
    row : pandas.DataFrame
        Panel with 'datadate' and the MODEL1_VARIABLE columns.
    result : pandas.DataFrame
        Slopes, one row per date kept and one column per predictor, in
        MODEL1_VARIABLE order.
    window_size : int, default 24
        Number of leading dates to skip (previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Single column 'expected_return' with a 0..n-1 index.
    """
    # Predictors are selected by name: positional look-ups depend on the CSV
    # column order, and averaging every column fails on non-numeric ones
    # (e.g. the ticker) in pandas >= 2.0.
    mean_chars = row.groupby('datadate')[MODEL1_VARIABLE].mean().iloc[window_size:]
    expected = 0
    for position, name in enumerate(MODEL1_VARIABLE):
        expected = expected + mean_chars[name].to_numpy() * result.iloc[:, position].to_numpy()
    return pd.DataFrame({'expected_return': expected})

In [None]:
def get_rolling_return2(row,result,window_size=24):
    """Expected returns implied by the Model 2 slopes.

    For every date after the first ``window_size`` dates, the cross-sectional
    mean of each predictor is multiplied by the matching slope in the same row
    of ``result`` and the products are summed over predictors.

    Parameters
    ----------
    row : pandas.DataFrame
        Panel with 'datadate' and the MODEL2_VARIABLE columns.
    result : pandas.DataFrame
        Slopes, one row per date kept and one column per predictor, in
        MODEL2_VARIABLE order.
    window_size : int, default 24
        Number of leading dates to skip (previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Single column 'expected_return' with a 0..n-1 index.
    """
    # Predictors are selected by name: positional look-ups depend on the CSV
    # column order, and averaging every column fails on non-numeric ones
    # (e.g. the ticker) in pandas >= 2.0.
    mean_chars = row.groupby('datadate')[MODEL2_VARIABLE].mean().iloc[window_size:]
    expected = 0
    for position, name in enumerate(MODEL2_VARIABLE):
        expected = expected + mean_chars[name].to_numpy() * result.iloc[:, position].to_numpy()
    return pd.DataFrame({'expected_return': expected})

In [None]:
def get_rolling_return3(row,result,window_size=24):
    """Expected returns implied by the Model 3 slopes.

    For every date after the first ``window_size`` dates, the cross-sectional
    mean of each predictor is multiplied by the matching slope in the same row
    of ``result`` and the products are summed over predictors.

    Parameters
    ----------
    row : pandas.DataFrame
        Panel with 'datadate' and the MODEL3_VARIABLE columns.
    result : pandas.DataFrame
        Slopes, one row per date kept and one column per predictor, in
        MODEL3_VARIABLE order.
    window_size : int, default 24
        Number of leading dates to skip (previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Single column 'expected_return' with a 0..n-1 index.
    """
    # Predictors are selected by name: positional look-ups depend on the CSV
    # column order, and averaging every column fails on non-numeric ones
    # (e.g. the ticker) in pandas >= 2.0.
    mean_chars = row.groupby('datadate')[MODEL3_VARIABLE].mean().iloc[window_size:]
    expected = 0
    for position, name in enumerate(MODEL3_VARIABLE):
        expected = expected + mean_chars[name].to_numpy() * result.iloc[:, position].to_numpy()
    return pd.DataFrame({'expected_return': expected})

In [None]:
# Expected-return series from the rolling slopes, for every model and sample.
all_roll_1_R = get_rolling_return1(filtered_charac_1,all_roll_1)
all_roll_2_R = get_rolling_return2(filtered_charac_2,all_roll_2)
all_roll_3_R = get_rolling_return3(filtered_charac_3,all_roll_3)
abt_roll_1_R = get_rolling_return1(Abt_1_,abt_roll_1)
abt_roll_2_R = get_rolling_return2(Abt_2_,abt_roll_2)
abt_roll_3_R = get_rolling_return3(Abt_3_,abt_roll_3)
L_roll_1_R = get_rolling_return1(L_1_,L_roll_1)
L_roll_2_R = get_rolling_return2(L_2_,L_roll_2)
L_roll_3_R = get_rolling_return3(L_3_,L_roll_3)

In [None]:
# Inspect the rolling expected returns for all stocks under Model 1.
all_roll_1_R

## Cumulative n-factor model

In [None]:
def get_cumulative_data1(input_data,window_size,n_periods=120):
    """Expanding-window Fama-MacBeth slopes for Model 1.

    Every window starts at the first row; the first one spans ``window_size``
    periods and each later one adds one more period. Windows are cut by row
    position, so this assumes ``input_data`` is sorted by 'datadate' and holds
    the same number of rows for each of the ``n_periods`` dates.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Panel with 'datadate', 'Return', 'LogSize', 'LogBM' and 'Return_2_12'.
    window_size : int
        Number of periods in the first estimation window.
    n_periods : int, default 120
        Number of periods covered by ``input_data``. It was previously
        hard-coded, together with the resulting 96 windows for a 24-period window.

    Returns
    -------
    pandas.DataFrame
        One row per window (``n_periods - window_size`` rows); each row is the
        first column of the ``fm_summary`` table for that window.
    """
    rows_per_period = int(input_data.shape[0]/n_periods)
    window_slopes = []

    for extra_periods in range(n_periods - window_size):
        end_index = (window_size + extra_periods)*rows_per_period
        window = input_data.iloc[0:end_index]

        # Fit the no-intercept model on this window and keep its summary column.
        result = fama_macbeth(window,'datadate','Return',['LogSize','LogBM','Return_2_12'],intercept=False)
        window_slopes.append(fm_summary(result).iloc[:,0])

    return pd.DataFrame(window_slopes)

In [None]:
def get_cumulative_data2(input_data,window_size,n_periods=120):
    """Expanding-window Fama-MacBeth slopes for Model 2.

    Every window starts at the first row; the first one spans ``window_size``
    periods and each later one adds one more period. Windows are cut by row
    position, so this assumes ``input_data`` is sorted by 'datadate' and holds
    the same number of rows for each of the ``n_periods`` dates.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Panel with 'datadate', 'Return' and the seven Model 2 predictors.
    window_size : int
        Number of periods in the first estimation window.
    n_periods : int, default 120
        Number of periods covered by ``input_data``. It was previously
        hard-coded, together with the resulting 96 windows for a 24-period window.

    Returns
    -------
    pandas.DataFrame
        One row per window (``n_periods - window_size`` rows); each row is the
        first column of the ``fm_summary`` table for that window.
    """
    predictors = ['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG']
    rows_per_period = int(input_data.shape[0]/n_periods)
    window_slopes = []

    for extra_periods in range(n_periods - window_size):
        end_index = (window_size + extra_periods)*rows_per_period
        window = input_data.iloc[0:end_index]

        # Fit the no-intercept model on this window and keep its summary column.
        result = fama_macbeth(window,'datadate','Return',predictors,intercept=False)
        window_slopes.append(fm_summary(result).iloc[:,0])

    return pd.DataFrame(window_slopes)

In [None]:
def get_cumulative_data3(input_data,window_size,n_periods=120):
    """Expanding-window Fama-MacBeth slopes for Model 3.

    Every window starts at the first row; the first one spans ``window_size``
    periods and each later one adds one more period. Windows are cut by row
    position, so this assumes ``input_data`` is sorted by 'datadate' and holds
    the same number of rows for each of the ``n_periods`` dates.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Panel with 'datadate', 'Return' and the thirteen Model 3 predictors.
    window_size : int
        Number of periods in the first estimation window.
    n_periods : int, default 120
        Number of periods covered by ``input_data``. It was previously
        hard-coded, together with the resulting 96 windows for a 24-period window.

    Returns
    -------
    pandas.DataFrame
        One row per window (``n_periods - window_size`` rows); each row is the
        first column of the ``fm_summary`` table for that window.
    """
    predictors = ['LogSize','LogBM','Return_2_12','LogIssues','Accruals','ROA','LogAG','DY',
                  'Return_13_36','LogIssues_1y','Turnover','Debtprice','Salesprice']
    rows_per_period = int(input_data.shape[0]/n_periods)
    window_slopes = []

    for extra_periods in range(n_periods - window_size):
        end_index = (window_size + extra_periods)*rows_per_period
        window = input_data.iloc[0:end_index]

        # Fit the no-intercept model on this window and keep its summary column.
        result = fama_macbeth(window,'datadate','Return',predictors,intercept=False)
        window_slopes.append(fm_summary(result).iloc[:,0])

    return pd.DataFrame(window_slopes)

In [None]:
def get_cumulative_return1(row,window_size):
    """Expected returns implied by the expanding-window Model 1 slopes.

    The slopes come from ``get_cumulative_data1(row, window_size)``. For every
    date after the first ``window_size`` dates, the cross-sectional mean of each
    predictor is multiplied by the matching slope and the products are summed.

    Parameters
    ----------
    row : pandas.DataFrame
        Panel with 'datadate', 'Return' and the MODEL1_VARIABLE columns.
    window_size : int
        Length of the first estimation window, and the number of leading dates
        skipped (a literal 24 was used for the latter before).

    Returns
    -------
    pandas.DataFrame
        Single column 'expected_return' with a 0..n-1 index.
    """
    result = get_cumulative_data1(row,window_size)
    # Predictors are selected by name: positional look-ups depend on the CSV
    # column order, and averaging every column fails on non-numeric ones
    # (e.g. the ticker) in pandas >= 2.0.
    mean_chars = row.groupby('datadate')[MODEL1_VARIABLE].mean().iloc[window_size:]
    expected = 0
    for position, name in enumerate(MODEL1_VARIABLE):
        expected = expected + mean_chars[name].to_numpy() * result.iloc[:, position].to_numpy()
    return pd.DataFrame({'expected_return': expected})

In [None]:
def get_cumulative_return2(row,window_size):
    """Expected returns implied by the expanding-window Model 2 slopes.

    The slopes come from ``get_cumulative_data2(row, window_size)``. For every
    date after the first ``window_size`` dates, the cross-sectional mean of each
    predictor is multiplied by the matching slope and the products are summed.

    Parameters
    ----------
    row : pandas.DataFrame
        Panel with 'datadate', 'Return' and the MODEL2_VARIABLE columns.
    window_size : int
        Length of the first estimation window, and the number of leading dates
        skipped (a literal 24 was used for the latter before).

    Returns
    -------
    pandas.DataFrame
        Single column 'expected_return' with a 0..n-1 index.
    """
    result = get_cumulative_data2(row,window_size)
    # Predictors are selected by name: positional look-ups depend on the CSV
    # column order, and averaging every column fails on non-numeric ones
    # (e.g. the ticker) in pandas >= 2.0.
    mean_chars = row.groupby('datadate')[MODEL2_VARIABLE].mean().iloc[window_size:]
    expected = 0
    for position, name in enumerate(MODEL2_VARIABLE):
        expected = expected + mean_chars[name].to_numpy() * result.iloc[:, position].to_numpy()
    return pd.DataFrame({'expected_return': expected})

In [None]:
def get_cumulative_return3(row,window_size):
    """Expected returns implied by the expanding-window Model 3 slopes.

    The slopes come from ``get_cumulative_data3(row, window_size)``. For every
    date after the first ``window_size`` dates, the cross-sectional mean of each
    predictor is multiplied by the matching slope and the products are summed.

    Parameters
    ----------
    row : pandas.DataFrame
        Panel with 'datadate', 'Return' and the MODEL3_VARIABLE columns.
    window_size : int
        Length of the first estimation window, and the number of leading dates
        skipped (a literal 24 was used for the latter before).

    Returns
    -------
    pandas.DataFrame
        Single column 'expected_return' with a 0..n-1 index.
    """
    result = get_cumulative_data3(row,window_size)
    # Predictors are selected by name: positional look-ups depend on the CSV
    # column order, and averaging every column fails on non-numeric ones
    # (e.g. the ticker) in pandas >= 2.0.
    mean_chars = row.groupby('datadate')[MODEL3_VARIABLE].mean().iloc[window_size:]
    expected = 0
    for position, name in enumerate(MODEL3_VARIABLE):
        expected = expected + mean_chars[name].to_numpy() * result.iloc[:, position].to_numpy()
    return pd.DataFrame({'expected_return': expected})

In [None]:
# Expected-return series from the expanding-window slopes (first window of
# 24 periods), for every model and sample.
all_cumu_1_R = get_cumulative_return1(filtered_charac_1,24)
all_cumu_2_R = get_cumulative_return2(filtered_charac_2,24)
all_cumu_3_R = get_cumulative_return3(filtered_charac_3,24)
abt_cumu_1_R = get_cumulative_return1(Abt_1_,24)
abt_cumu_2_R = get_cumulative_return2(Abt_2_,24)
abt_cumu_3_R = get_cumulative_return3(Abt_3_,24)
L_cumu_1_R = get_cumulative_return1(L_1_,24)
L_cumu_2_R = get_cumulative_return2(L_2_,24)
L_cumu_3_R = get_cumulative_return3(L_3_,24)

In [None]:
# Inspect the cumulative expected returns for large stocks under Model 3.
L_cumu_3_R

## Predictive ability analysis

In [None]:
def get_Predictive_ability(row_return,expected_return_estimates,window):
    """Regress realised mean returns on the model's expected returns.

    Parameters
    ----------
    row_return : pandas.DataFrame
        Panel with 'datadate' and 'Return'; returns are averaged by date.
    expected_return_estimates : pandas.DataFrame
        Frame with an 'expected_return' column, one row per date kept.
    window : int
        Number of leading dates dropped from the realised series so that it
        lines up with the estimates (a literal 24 was used before and this
        argument was ignored).

    Returns
    -------
    pandas.DataFrame
        One row (indexed 'expected_return') with the columns
        'Slope', 'S.E.', 't-stat' and 'R^2' of a no-intercept OLS fit.
    """
    realized = (row_return.groupby('datadate')['Return'].mean()
                .iloc[window:].reset_index(drop=True))
    forecast = expected_return_estimates['expected_return'].reset_index(drop=True)
    # Pair the two series by position. This avoids merging a frame with
    # two-level columns (from agg([np.mean])) against a flat one, which recent
    # pandas rejects.
    fmdata = pd.concat([realized, forecast], axis=1, join='inner')

    # `missing` belongs to the model constructor, not to `fit`.
    res_fm = smf.ols('Return~0+expected_return', fmdata, missing='drop').fit()

    Predictive = pd.DataFrame({
        'Slope': res_fm.params.round(2),
        'S.E.': res_fm.bse.round(2),
        't-stat': res_fm.tvalues.round(2),
    })
    Predictive['R^2'] = np.round(res_fm.rsquared, 3)
    return Predictive

In [None]:
# Predictive regressions for the rolling estimates, stacked in the order
# all stocks (models 1-3), all-but-tiny (1-3), large (1-3).
Predictive_ability_rolling = pd.concat([
    get_Predictive_ability(panel, estimates, 24)
    for panel, estimates in [
        (filtered_charac_1, all_roll_1_R),
        (filtered_charac_2, all_roll_2_R),
        (filtered_charac_3, all_roll_3_R),
        (Abt_1_, abt_roll_1_R),
        (Abt_2_, abt_roll_2_R),
        (Abt_3_, abt_roll_3_R),
        (L_1_, L_roll_1_R),
        (L_2_, L_roll_2_R),
        (L_3_, L_roll_3_R),
    ]
])

In [None]:
# Predictive regressions for the cumulative estimates, stacked in the order
# all stocks (models 1-3), all-but-tiny (1-3), large (1-3).
Predictive_ability_cumulative = pd.concat([
    get_Predictive_ability(panel, estimates, 24)
    for panel, estimates in [
        (filtered_charac_1, all_cumu_1_R),
        (filtered_charac_2, all_cumu_2_R),
        (filtered_charac_3, all_cumu_3_R),
        (Abt_1_, abt_cumu_1_R),
        (Abt_2_, abt_cumu_2_R),
        (Abt_3_, abt_cumu_3_R),
        (L_1_, L_cumu_1_R),
        (L_2_, L_cumu_2_R),
        (L_3_, L_cumu_3_R),
    ]
])

## Write the outcomes to table 3

In [None]:
# Summary statistics of the rolling expected returns: one row per
# (sample, model), in the order all stocks, all-but-tiny, large.
rolling_estimates = [all_roll_1_R,all_roll_2_R,all_roll_3_R,
                     abt_roll_1_R,abt_roll_2_R,abt_roll_3_R,
                     L_roll_1_R,L_roll_2_R,L_roll_3_R]

# The columns are built in one go; filling a zero frame with
# `frame[col][i] = value` is chained assignment and is not guaranteed to
# write through. The all-but-tiny labels read 'Model', not 'Mode'.
properties_1 = pd.DataFrame({
    'FM estimate': ['All stocks'] * 3 + ['All-but-tiny stocks'] * 3 + ['Large stocks'] * 3,
    'Model': ['Model 1','Model 2','Model 3'] * 3,
    'Avg': [est['expected_return'].mean().round(2) for est in rolling_estimates],
    'Std': [est['expected_return'].std().round(2) for est in rolling_estimates],
    'p10': [np.nanpercentile(est['expected_return'], 10).round(2) for est in rolling_estimates],
    'p90': [np.nanpercentile(est['expected_return'], 90).round(2) for est in rolling_estimates],
    'slopes': ['Rolling'] * 9,
})

In [None]:
# Summary statistics of the cumulative expected returns: one row per
# (sample, model), in the order all stocks, all-but-tiny, large.
cumulative_estimates = [all_cumu_1_R,all_cumu_2_R,all_cumu_3_R,
                        abt_cumu_1_R,abt_cumu_2_R,abt_cumu_3_R,
                        L_cumu_1_R,L_cumu_2_R,L_cumu_3_R]

# The columns are built in one go; filling a zero frame with
# `frame[col][i] = value` is chained assignment and is not guaranteed to
# write through. The all-but-tiny labels read 'Model', not 'Mode'.
properties_2 = pd.DataFrame({
    'FM estimate': ['All stocks'] * 3 + ['All-but-tiny stocks'] * 3 + ['Large stocks'] * 3,
    'Model': ['Model 1','Model 2','Model 3'] * 3,
    'Avg': [est['expected_return'].mean().round(2) for est in cumulative_estimates],
    'Std': [est['expected_return'].std().round(2) for est in cumulative_estimates],
    'p10': [np.nanpercentile(est['expected_return'], 10).round(2) for est in cumulative_estimates],
    'p90': [np.nanpercentile(est['expected_return'], 90).round(2) for est in cumulative_estimates],
    'slopes': ['Cumulative'] * 9,
})

In [None]:
# Stack the rolling rows (first nine) on top of the cumulative rows (last nine).
properties = pd.concat([properties_1,properties_2])
# Key used both to merge with `predictive` and to sort the final table: within
# each sample the three rolling rows come first, then the three cumulative rows.
properties['index'] = ['12','13','14','18','19','20','24','25','26','15','16','17','21','22','23','27','28','29']
properties

In [None]:
# Same stacking and the same keys as `properties`, so the two frames can be
# merged row for row.
predictive = pd.concat([Predictive_ability_rolling,Predictive_ability_cumulative])
predictive['index'] = ['12','13','14','18','19','20','24','25','26','15','16','17','21','22','23','27','28','29']
predictive

In [None]:
# Join the summary statistics with the predictive-regression results on the
# shared key, order the rows by that key, then drop it and index the table by
# (sample, slope type).
table_3 = (
    pd.merge(properties,predictive,on=['index'])
    .sort_values(by = 'index')
    .drop(columns=['index'])
    .set_index(['FM estimate','slopes'])
)
table_3

In [None]:
# Save Table 3; the ('FM estimate', 'slopes') index is written as the first two columns.
table_3.to_csv('table 3.csv')